From b4d94e54d44cf30e4bb452ca5263be3473c0582d Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 20 Jun 2018 14:07:24 -0500 Subject: [PATCH] Convert x86 microkernels to assembly macros. --- frame/include/bli_avx512_macros.h | 173 - frame/include/bli_x86_asm_macros.h | 1169 +++ .../3/bli_gemm_bulldozer_asm_d4x6_fma4.c | 4498 +++++------ kernels/knl/1m/bli_dpackm_knl_asm_24x8.c | 257 +- kernels/knl/1m/bli_spackm_knl_asm_24x16.c | 4 +- kernels/knl/3/bli_dgemm_knl_asm_24x8.c | 25 +- kernels/knl/3/bli_sgemm_knl_asm_24x16.c | 17 +- kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c | 2720 +++---- .../penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c | 884 +-- .../penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c | 856 +-- kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c | 268 +- kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c | 274 +- .../3/bli_gemm_piledriver_asm_d8x3.c | 4952 ++++++------- .../3/bli_gemm_sandybridge_asm_d8x4.c | 6552 +++++++++-------- kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c | 9 +- kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c | 9 +- kernels/zen/3/bli_gemm_zen_asm_d6x8.c | 4569 ++++++------ kernels/zen/3/bli_gemm_zen_asm_d8x6.c | 3753 +++++----- kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c | 2672 +++---- kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c | 2692 +++---- 20 files changed, 18702 insertions(+), 17651 deletions(-) delete mode 100644 frame/include/bli_avx512_macros.h create mode 100644 frame/include/bli_x86_asm_macros.h diff --git a/frame/include/bli_avx512_macros.h b/frame/include/bli_avx512_macros.h deleted file mode 100644 index 5cc45200a..000000000 --- a/frame/include/bli_avx512_macros.h +++ /dev/null @@ -1,173 +0,0 @@ -#ifndef BLIS_AVX512_MACROS_H -#define BLIS_AVX512_MACROS_H - -// -// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful -// - -#define COMMENT_BEGIN "#" -#define COMMENT_END - -#define STRINGIFY(...) #__VA_ARGS__ -#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t" -#define LABEL(label) STRINGIFY(label) ":\n\t" - -#define XMM(x) %%xmm##x -#define YMM(x) %%ymm##x -#define ZMM(x) %%zmm##x -#define EAX %%eax -#define EBX %%ebx -#define ECX %%ecx -#define EDX %%edx -#define EBP %%ebp -#define EDI %%edi -#define ESI %%esi -#define RAX %%rax -#define RBX %%rbx -#define RCX %%rcx -#define RDX %%rdx -#define RBP %%rbp -#define RDI %%rdi -#define RSI %%rsi -#define K(x) %%k##x -#define R(x) %%r##x -#define R8 %%r8 -#define R9 %%r9 -#define R10 %%r10 -#define R11 %%r11 -#define R12 %%r12 -#define R13 %%r13 -#define R14 %%r14 -#define R15 %%r15 -#define RD(x) %%r##x##d -#define R8D %%r8d -#define R9D %%r9d -#define R10D %%r10d -#define R11D %%r11d -#define R12D %%r12d -#define R13D %%r13d -#define R14D %%r14d -#define R15D %%r15d -#define IMM(x) $##x -#define VAR(x) %[x] - -#define MEM_4(reg,off,scale,disp) disp(reg,off,scale) -#define MEM_3(reg,off,scale) (reg,off,scale) -#define MEM_2(reg,disp) disp(reg) -#define MEM_1(reg) (reg) - -#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%} -#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%} -#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%} -#define MEM_1TO8_1(reg) MEM(reg) %{1to8%} - -#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%} -#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%} -#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%} -#define MEM_1TO16_1(reg) MEM(reg) %{1to16%} - -#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME -#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__) -#define MEM_1TO8(...) 
GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__) -#define MEM_1TO16(...) GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__) - -#define MASK_K(n) %{%%k##n%} -#define MASK_KZ(n) %{%%k##n%}%{z%} -#define KMOV(to,from) ASM(kmovw from, to) -#define JKNZD(kreg,label) \ - ASM(kortestw kreg, kreg) \ - ASM(jnz label) -#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0) -#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0) - -#define ALIGN16 ASM(.p2align 4) -#define ALIGN32 ASM(.p2align 5) -#define RDTSC ASM(rdstc) -#define MOV(_0, _1) ASM(mov _1, _0) -#define MOVD(_0, _1) ASM(movd _1, _0) -#define MOVL(_0, _1) ASM(movl _1, _0) -#define MOVQ(_0, _1) ASM(movq _1, _0) -#define VMOVD(_0, _1) ASM(vmovd _1, _0) -#define VMOVQ(_0, _1) ASM(vmovq _1, _0) -#define CMP(_0, _1) ASM(cmp _1, _0) -#define AND(_0, _1) ASM(and _1, _0) -#define ADD(_0, _1) ASM(add _1, _0) -#define SUB(_0, _1) ASM(sub _1, _0) -#define SAL(_0, _1) ASM(sal _1, _0) -#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0) -#define SAR(_0, _1) ASM(sar _1, _0) -#define SAL1(_0) ASM(sal _0) -#define SAR1(_0) ASM(sar _0) -#define LEA(_0, _1) ASM(lea _1, _0) -#define TEST(_0, _1) ASM(test _1, _0) -#define DEC(_0) ASM(dec _0) -#define JLE(_0) ASM(jle _0) -#define JL(_0) ASM(jl _0) -#define JNZ(_0) ASM(jnz _0) -#define JZ(_0) ASM(jz _0) -#define JNE(_0) ASM(jne _0) -#define JE(_0) ASM(je _0) -#define JNC(_0) ASM(jnc _0) -#define JC(_0) ASM(jc _0) -#define JMP(_0) ASM(jmp _0) -#define VCOMISS(_0, _1) ASM(vcomiss _1, _0) -#define VCOMISD(_0, _1) ASM(vcomisd _1, _0) -#define VGATHERDPS(_0, _1) ASM(vgatherdps _1, _0) -#define VSCATTERDPS(_0, _1) ASM(vscatterdps _1, _0) -#define VGATHERDPD(_0, _1) ASM(vgatherdpd _1, _0) -#define VSCATTERDPD(_0, _1) ASM(vscatterdpd _1, _0) -#define VGATHERQPS(_0, _1) ASM(vgatherqps _1, _0) -#define VSCATTERQPS(_0, _1) ASM(vscatterqps _1, _0) -#define VGATHERQPD(_0, _1) ASM(vgatherqpd _1, _0) -#define VSCATTERQPD(_0, _1) ASM(vscatterqpd _1, _0) -#define VMULSS(_0, _1, _2) ASM(vmulss _2, _1, _0) -#define VMULSD(_0, _1, _2) ASM(vmulsd _2, _1, _0) -#define VMULPS(_0, _1, _2) ASM(vmulps _2, _1, _0) -#define VMULPD(_0, _1, _2) ASM(vmulpd _2, _1, _0) -#define VPMULLD(_0, _1, _2) ASM(vpmulld _2, _1, _0) -#define VPMULLQ(_0, _1, _2) ASM(vpmullq _2, _1, _0) -#define VPADDD(_0, _1, _2) ASM(vpaddd _2, _1, _0) -#define VPSLLD(_0, _1, _2) ASM(vpslld _2, _1, _0) -#define VPXORD(_0, _1, _2) ASM(vpxord _2, _1, _0) -#define VXORPD(_0, _1, _2) ASM(vxorpd _2, _1, _0) -#define VFMADD132PS(_0, _1, _2) ASM(vfmadd132ps _2, _1, _0) -#define VFMADD213PS(_0, _1, _2) ASM(vfmadd213ps _2, _1, _0) -#define VFMADD231PS(_0, _1, _2) ASM(vfmadd231ps _2, _1, _0) -#define VFMADD132PD(_0, _1, _2) ASM(vfmadd132pd _2, _1, _0) -#define VFMADD213PD(_0, _1, _2) ASM(vfmadd213pd _2, _1, _0) -#define VFMADD231PD(_0, _1, _2) ASM(vfmadd231pd _2, _1, _0) -#define VMOVDQA(_0, _1) ASM(vmovdqa _1, _0) -#define VMOVDQA32(_0, _1) ASM(vmovdqa32 _1, _0) -#define VMOVDQA64(_0, _1) ASM(vmovdqa64 _1, _0) -#define VMOVSS(_0, _1) ASM(vmovss _1, _0) -#define VMOVSD(_0, _1) ASM(vmovsd _1, _0) -#define VMOVAPS(_0, _1) ASM(vmovaps _1, _0) -#define VMOVUPS(_0, _1) ASM(vmovups _1, _0) -#define VMOVAPD(_0, _1) ASM(vmovapd _1, _0) -#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0) -#define VBROADCASTSS(_0, _1) ASM(vbroadcastss _1, _0) -#define VBROADCASTSD(_0, _1) ASM(vbroadcastsd _1, _0) -#define VPBROADCASTD(_0, _1) ASM(vpbroadcastd _1, _0) -#define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0) -#define VBROADCASTF64X4(_0, _1) 
ASM(vbroadcastf64x4 _1, _0) -#define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0) -#define VEXTRACTF64X4(_0, _1, _2) ASM(vextractf64x4 _2, _1, _0) -#define VINSERTF128(_0, _1, _2) ASM(vinsertf128 _2, _1, _0) -#define VEXTRACTF128(_0, _1, _2) ASM(vextractf128 _2, _1, _0) -#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0) -#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0) -#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0) -#define VUNPCKLPS(_0, _1, _2) ASM(vunpcklps _2, _1, _0) -#define VUNPCKHPS(_0, _1, _2) ASM(vunpckhps _2, _1, _0) -#define VSHUFPS(_0, _1, _2, _3) ASM(vshufps _3, _2, _1, _0) -#define VPERM2F128(_0, _1, _2, _3) ASM(vperm2f128 _3, _2, _1, _0) -#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS) -#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS) -#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS) -#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS) -#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS) -#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS) -#define VSCATTERPFDPD(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dpd ADDRESS) -#define VZEROUPPER() ASM(vzeroupper) - -#endif diff --git a/frame/include/bli_x86_asm_macros.h b/frame/include/bli_x86_asm_macros.h new file mode 100644 index 000000000..74495b706 --- /dev/null +++ b/frame/include/bli_x86_asm_macros.h @@ -0,0 +1,1169 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2018, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_X86_ASM_MACROS_H +#define BLIS_X86_ASM_MACROS_H + +// +// Assembly macros to make inline x86 with AT&T syntax somewhat less painful +// +// "Private" macros end with _ +// + +// Default syntax is Intel +#if !defined(BLIS_ASM_SYNTAX_ATT) && !defined(BLIS_ASM_SYNTAX_INTEL) +#define BLIS_ASM_SYNTAX_INTEL +#endif + +#define STRINGIFY_(...) #__VA_ARGS__ +#define GET_MACRO_(_1_,_2_,_3_,_4_,NAME,...) 
NAME + +#if defined(_WIN32) || defined(__MIC__) + +// Intel-style assembly blocks + +#define BEGIN_ASM __asm { +#define END_ASM(...) } + +#ifdef BLIS_ASM_SYNTAX_INTEL + +#define INSTR_4_(name,_0,_1,_2,_3) name _0,_1,_2,_3 +#define INSTR_3_(name,_0,_1,_2) name _0,_1,_2 +#define INSTR_2_(name,_0,_1) name _0,_1 +#define INSTR_1_(name,_0) name _0 +#define INSTR_0_(name) name + +#else + +#define INSTR_4_(name,_0,_1,_2,_3) name _3,_2,_1,_0 +#define INSTR_3_(name,_0,_1,_2) name _2,_1,_0 +#define INSTR_2_(name,_0,_1) name _1,_0 +#define INSTR_1_(name,_0) name _0 +#define INSTR_0_(name) name + +#endif + +#define LABEL(label) label: +#define REGISTER_(r) r +#define IMM(x) x +#define VAR(x) x +#define MASK_(x) {x} +#define JMP_(insn, target) insn target + +#define MEM_4_(reg,off,scale,disp) [reg + off*scale + disp] +#define MEM_3_(reg,off,scale) [reg + off*scale] +#define MEM_2_(reg,disp) [reg + disp] +#define MEM_1_(reg) [reg] + +#define ALIGN4 align 4 +#define ALIGN8 align 8 +#define ALIGN16 align 16 +#define ALIGN32 align 32 + +#else + +// GCC extended assembly with AT&T syntax + +#define COMMENT_BEGIN "#" +#define COMMENT_END + +#define BEGIN_ASM __asm__ volatile ( +#define END_ASM(...) __VA_ARGS__ ); + + +#ifdef BLIS_ASM_SYNTAX_ATT + +#define INSTR_4_(name,_0,_1,_2,_3) STRINGIFY_(name) " " STRINGIFY_(_0,_1,_2,_3) "\n\t" +#define INSTR_3_(name,_0,_1,_2) STRINGIFY_(name) " " STRINGIFY_(_0,_1,_2) "\n\t" +#define INSTR_2_(name,_0,_1) STRINGIFY_(name) " " STRINGIFY_(_0,_1) "\n\t" +#define INSTR_1_(name,_0) STRINGIFY_(name) " " STRINGIFY_(_0) "\n\t" +#define INSTR_0_(name) STRINGIFY_(name) "\n\t" + +#else + +#define INSTR_4_(name,_0,_1,_2,_3) STRINGIFY_(name) " " STRINGIFY_(_3,_2,_1,_0) "\n\t" +#define INSTR_3_(name,_0,_1,_2) STRINGIFY_(name) " " STRINGIFY_(_2,_1,_0) "\n\t" +#define INSTR_2_(name,_0,_1) STRINGIFY_(name) " " STRINGIFY_(_1,_0) "\n\t" +#define INSTR_1_(name,_0) STRINGIFY_(name) " " STRINGIFY_(_0) "\n\t" +#define INSTR_0_(name) STRINGIFY_(name) "\n\t" + +#endif + +#if BLIS_OS_OSX + +#define LABEL_(label) "L" STRINGIFY_(label) "%=" + +#else + +#define LABEL_(label) ".L" STRINGIFY_(label) "%=" + +#endif + +#define REGISTER_(r) %%r +#define IMM(x) $##x +#define VAR(x) %[x] +#define MASK_(x) %{x%} +#define LABEL(target) LABEL_(target) ":\n\t" +#define JMP_(insn, target) STRINGIFY_(insn) " " LABEL_(target) "\n\t" + +#define MEM_4_(reg,off,scale,disp) disp(reg,off,scale) +#define MEM_3_(reg,off,scale) (reg,off,scale) +#define MEM_2_(reg,disp) disp(reg) +#define MEM_1_(reg) (reg) + +#define ALIGN4 ".p2align 2 \n\t" +#define ALIGN8 ".p2align 3 \n\t" +#define ALIGN16 ".p2align 4 \n\t" +#define ALIGN32 ".p2align 5 \n\t" + +#endif + +#define begin_asm BEGIN_ASM +#define end_asm END_ASM + +#define label(...) LABEL(__VA_ARGS__) +#define imm(...) IMM(__VA_ARGS__) +#define var(...) 
VAR(__VA_ARGS__) +#define align16 ALIGN16 +#define align32 ALIGN32 + +// General-purpose registers + +#define AL REGISTER_(al) +#define AH REGISTER_(ah) +#define BL REGISTER_(bl) +#define BH REGISTER_(bh) +#define CL REGISTER_(cl) +#define CH REGISTER_(ch) +#define DL REGISTER_(dl) +#define DH REGISTER_(dh) +#define R8B REGISTER_(r8b) +#define R9B REGISTER_(r9b) +#define R10B REGISTER_(r10b) +#define R11B REGISTER_(r11b) +#define R12B REGISTER_(r12b) +#define R13B REGISTER_(r13b) +#define R14B REGISTER_(r14b) +#define R15B REGISTER_(r15b) + +#define al AL +#define ah AH +#define bl BL +#define bh BH +#define cl CL +#define ch CH +#define dl DL +#define dh DH +#define r8b R8B +#define r9b R9B +#define r10b R10B +#define r11b R11B +#define r12b R12B +#define r13b R13B +#define r14b R14B +#define r15b R15B + +#define AX REGISTER_(ax) +#define BX REGISTER_(bx) +#define CX REGISTER_(cx) +#define DX REGISTER_(dx) +#define SI REGISTER_(si) +#define DI REGISTER_(di) +#define BP REGISTER_(bp) +#define SP REGISTER_(sp) +#define R8W REGISTER_(r8w) +#define R9W REGISTER_(r9w) +#define R10W REGISTER_(r10w) +#define R11W REGISTER_(r11w) +#define R12W REGISTER_(r12w) +#define R13W REGISTER_(r13w) +#define R14W REGISTER_(r14w) +#define R15W REGISTER_(r15w) + +#define ax AX +#define bx BX +#define cx CX +#define dx DX +#define si SI +#define di DI +#define bp BP +#define sp SP +#define r8w R8W +#define r9w R9W +#define r10w R10W +#define r11w R11W +#define r12w R12W +#define r13w R13W +#define r14w R14W +#define r15w R15W + +#define EAX REGISTER_(eax) +#define EBX REGISTER_(ebx) +#define ECX REGISTER_(ecx) +#define EDX REGISTER_(edx) +#define ESP REGISTER_(esp) +#define EBP REGISTER_(ebp) +#define EDI REGISTER_(edi) +#define ESI REGISTER_(esi) +#define R8D REGISTER_(r8d) +#define R9D REGISTER_(r9d) +#define R10D REGISTER_(r10d) +#define R11D REGISTER_(r11d) +#define R12D REGISTER_(r12d) +#define R13D REGISTER_(r13d) +#define R14D REGISTER_(r14d) +#define R15D REGISTER_(r15d) + +#define eax EAX +#define ebx EBX +#define ecx ECX +#define edx EDX +#define esp ESP +#define ebp EBP +#define edi EDI +#define esi ESI +#define r8d R8D +#define r9d R9D +#define r10d R10D +#define r11d R11D +#define r12d R12D +#define r13d R13D +#define r14d R14D +#define r15d R15D + +#define RAX REGISTER_(rax) +#define RBX REGISTER_(rbx) +#define RCX REGISTER_(rcx) +#define RDX REGISTER_(rdx) +#define RSP REGISTER_(rsp) +#define RBP REGISTER_(rbp) +#define RDI REGISTER_(rdi) +#define RSI REGISTER_(rsi) +#define R8 REGISTER_(r8) +#define R9 REGISTER_(r9) +#define R10 REGISTER_(r10) +#define R11 REGISTER_(r11) +#define R12 REGISTER_(r12) +#define R13 REGISTER_(r13) +#define R14 REGISTER_(r14) +#define R15 REGISTER_(r15) + +#define rax RAX +#define rbx RBX +#define rcx RCX +#define rdx RDX +#define rsp RSP +#define rbp RBP +#define rdi RDI +#define rsi RSI +#define r8 R8 +#define r9 R9 +#define r10 R10 +#define r11 R11 +#define r12 R12 +#define r13 R13 +#define r14 R14 +#define r15 R15 + +// Vector registers + +#define XMM(x) REGISTER_(Xmm##x) +#define YMM(x) REGISTER_(Ymm##x) +#define ZMM(x) REGISTER_(Zmm##x) +#define K(x) REGISTER_(k##x) +#define MASK_K(n) MASK_(K(n)) +#define MASK_KZ(n) MASK_(K(n))MASK_(z) + +#define xmm(x) XMM(x) +#define ymm(x) YMM(x) +#define zmm(x) ZMM(x) +#define k(x) K(x) +#define mask_k(x) MASK_K(x) +#define mask_kz(x) MASK_KZ(x) + +#define XMM0 XMM(0) +#define XMM1 XMM(1) +#define XMM2 XMM(2) +#define XMM3 XMM(3) +#define XMM4 XMM(4) +#define XMM5 XMM(5) +#define XMM6 XMM(6) +#define XMM7 XMM(7) +#define 
XMM8 XMM(8) +#define XMM9 XMM(9) +#define XMM10 XMM(10) +#define XMM11 XMM(11) +#define XMM12 XMM(12) +#define XMM13 XMM(13) +#define XMM14 XMM(14) +#define XMM15 XMM(15) +#define XMM16 XMM(16) +#define XMM17 XMM(17) +#define XMM18 XMM(18) +#define XMM19 XMM(19) +#define XMM20 XMM(20) +#define XMM21 XMM(21) +#define XMM22 XMM(22) +#define XMM23 XMM(23) +#define XMM24 XMM(24) +#define XMM25 XMM(25) +#define XMM26 XMM(26) +#define XMM27 XMM(27) +#define XMM28 XMM(28) +#define XMM29 XMM(29) +#define XMM30 XMM(30) +#define XMM31 XMM(31) + +#define YMM0 YMM(0) +#define YMM1 YMM(1) +#define YMM2 YMM(2) +#define YMM3 YMM(3) +#define YMM4 YMM(4) +#define YMM5 YMM(5) +#define YMM6 YMM(6) +#define YMM7 YMM(7) +#define YMM8 YMM(8) +#define YMM9 YMM(9) +#define YMM10 YMM(10) +#define YMM11 YMM(11) +#define YMM12 YMM(12) +#define YMM13 YMM(13) +#define YMM14 YMM(14) +#define YMM15 YMM(15) +#define YMM16 YMM(16) +#define YMM17 YMM(17) +#define YMM18 YMM(18) +#define YMM19 YMM(19) +#define YMM20 YMM(20) +#define YMM21 YMM(21) +#define YMM22 YMM(22) +#define YMM23 YMM(23) +#define YMM24 YMM(24) +#define YMM25 YMM(25) +#define YMM26 YMM(26) +#define YMM27 YMM(27) +#define YMM28 YMM(28) +#define YMM29 YMM(29) +#define YMM30 YMM(30) +#define YMM31 YMM(31) + +#define ZMM0 ZMM(0) +#define ZMM1 ZMM(1) +#define ZMM2 ZMM(2) +#define ZMM3 ZMM(3) +#define ZMM4 ZMM(4) +#define ZMM5 ZMM(5) +#define ZMM6 ZMM(6) +#define ZMM7 ZMM(7) +#define ZMM8 ZMM(8) +#define ZMM9 ZMM(9) +#define ZMM10 ZMM(10) +#define ZMM11 ZMM(11) +#define ZMM12 ZMM(12) +#define ZMM13 ZMM(13) +#define ZMM14 ZMM(14) +#define ZMM15 ZMM(15) +#define ZMM16 ZMM(16) +#define ZMM17 ZMM(17) +#define ZMM18 ZMM(18) +#define ZMM19 ZMM(19) +#define ZMM20 ZMM(20) +#define ZMM21 ZMM(21) +#define ZMM22 ZMM(22) +#define ZMM23 ZMM(23) +#define ZMM24 ZMM(24) +#define ZMM25 ZMM(25) +#define ZMM26 ZMM(26) +#define ZMM27 ZMM(27) +#define ZMM28 ZMM(28) +#define ZMM29 ZMM(29) +#define ZMM30 ZMM(30) +#define ZMM31 ZMM(31) + +#define xmm0 xmm(0) +#define xmm1 xmm(1) +#define xmm2 xmm(2) +#define xmm3 xmm(3) +#define xmm4 xmm(4) +#define xmm5 xmm(5) +#define xmm6 xmm(6) +#define xmm7 xmm(7) +#define xmm8 xmm(8) +#define xmm9 xmm(9) +#define xmm10 xmm(10) +#define xmm11 xmm(11) +#define xmm12 xmm(12) +#define xmm13 xmm(13) +#define xmm14 xmm(14) +#define xmm15 xmm(15) +#define xmm16 xmm(16) +#define xmm17 xmm(17) +#define xmm18 xmm(18) +#define xmm19 xmm(19) +#define xmm20 xmm(20) +#define xmm21 xmm(21) +#define xmm22 xmm(22) +#define xmm23 xmm(23) +#define xmm24 xmm(24) +#define xmm25 xmm(25) +#define xmm26 xmm(26) +#define xmm27 xmm(27) +#define xmm28 xmm(28) +#define xmm29 xmm(29) +#define xmm30 xmm(30) +#define xmm31 xmm(31) + +#define ymm0 ymm(0) +#define ymm1 ymm(1) +#define ymm2 ymm(2) +#define ymm3 ymm(3) +#define ymm4 ymm(4) +#define ymm5 ymm(5) +#define ymm6 ymm(6) +#define ymm7 ymm(7) +#define ymm8 ymm(8) +#define ymm9 ymm(9) +#define ymm10 ymm(10) +#define ymm11 ymm(11) +#define ymm12 ymm(12) +#define ymm13 ymm(13) +#define ymm14 ymm(14) +#define ymm15 ymm(15) +#define ymm16 ymm(16) +#define ymm17 ymm(17) +#define ymm18 ymm(18) +#define ymm19 ymm(19) +#define ymm20 ymm(20) +#define ymm21 ymm(21) +#define ymm22 ymm(22) +#define ymm23 ymm(23) +#define ymm24 ymm(24) +#define ymm25 ymm(25) +#define ymm26 ymm(26) +#define ymm27 ymm(27) +#define ymm28 ymm(28) +#define ymm29 ymm(29) +#define ymm30 ymm(30) +#define ymm31 ymm(31) + +#define zmm0 zmm(0) +#define zmm1 zmm(1) +#define zmm2 zmm(2) +#define zmm3 zmm(3) +#define zmm4 zmm(4) +#define zmm5 zmm(5) +#define zmm6 
zmm(6) +#define zmm7 zmm(7) +#define zmm8 zmm(8) +#define zmm9 zmm(9) +#define zmm10 zmm(10) +#define zmm11 zmm(11) +#define zmm12 zmm(12) +#define zmm13 zmm(13) +#define zmm14 zmm(14) +#define zmm15 zmm(15) +#define zmm16 zmm(16) +#define zmm17 zmm(17) +#define zmm18 zmm(18) +#define zmm19 zmm(19) +#define zmm20 zmm(20) +#define zmm21 zmm(21) +#define zmm22 zmm(22) +#define zmm23 zmm(23) +#define zmm24 zmm(24) +#define zmm25 zmm(25) +#define zmm26 zmm(26) +#define zmm27 zmm(27) +#define zmm28 zmm(28) +#define zmm29 zmm(29) +#define zmm30 zmm(30) +#define zmm31 zmm(31) + +// Memory access + +// MEM(rax) -> (%rax) or [rax] +// MEM(rax,0x80) -> 0x80(%rax) or [rax + 0x80] +// MEM(rax,rsi,4) -> (%rax,%rsi,4) or [rax + rsi*4] +// MEM(rax,rsi,4,0x80) -> 0x80(%rax,%rsi,4) or [rax + rsi*4 + 0x80] + +#define MEM(...) GET_MACRO_(__VA_ARGS__,MEM_4_,MEM_3_,MEM_2_,MEM_1_)(__VA_ARGS__) +#define MEM_1TO8(...) MEM(__VA_ARGS__) MASK_(1to8) +#define MEM_1TO16(...) MEM(__VA_ARGS__) MASK_(1to16) +#define MEM_BCAST(...) MEM(__VA_ARGS__) MASK_(b) + +#define mem(...) MEM(__VA_ARGS__) +#define mem_1to8(...) MEM_1TO8(__VA_ARGS__) +#define mem_1to16(...) MEM_1TO16(__VA_ARGS__) +#define mem_bcast(...) MEM_BCAST(__VA_ARGS__) + +#define VAR_1TO8(...) VAR(__VA_ARGS__) MASK_(1to8) +#define VAR_1TO16(...) VAR(__VA_ARGS__) MASK_(1to16) +#define VAR_BCAST(...) VAR(__VA_ARGS__) MASK_(b) + +#define var_1to8(...) VAR_1TO8(__VA_ARGS__) +#define var_1to16(...) VAR_1TO16(__VA_ARGS__) +#define var_bcast(...) VAR_BCAST(__VA_ARGS__) + +// Instructions + +#define INSTR_(name,...) GET_MACRO_(__VA_ARGS__,INSTR_4_,INSTR_3_,INSTR_2_, \ + INSTR_1_,INSTR_0_)(name,__VA_ARGS__) + +// Jumps + +#define JC(_0) JMP_(jc, _0) +#define JB(_0) JC(_0) +#define JNAE(_0) JC(_0) +#define JNC(_0) JMP_(jnc, _0) +#define JNB(_0) JNC(_0) +#define JAE(_0) JNC(_0) + +#define jc(_0) JC(_0) +#define jb(_0) JB(_0) +#define jnae(_0) JNAE(_0) +#define jnc(_0) JNC(_0) +#define jnb(_0) JNB(_0) +#define jae(_0) JAE(_0) + +#define JO(_0) JMP_(jo, _0) +#define JNO(_0) JMP_(jno, _0) + +#define jo(_0) JO(_0) +#define jno(_0) JNO(_0) + +#define JP(_0) JMP_(jp, _0) +#define JPE(_0) JP(_0) +#define JNP(_0) JMP_(jnp, _0) +#define JPO(_0) JNP(_0) + +#define jp(_0) JP(_0) +#define jpe(_0) JPE(_0) +#define jnp(_0) JNP(_0) +#define jpo(_0) JPO(_0) + +#define JS(_0) JMP_(js, _0) +#define JNS(_0) JMP_(jns, _0) + +#define js(_0) JS(_0) +#define jns(_0) JNS(_0) + +#define JA(_0) JMP_(ja, _0) +#define JNBE(_0) JA(_0) +#define JNA(_0) JMP_(jna, _0) +#define JBE(_0) JNA(_0) + +#define ja(_0) JA(_0) +#define jnbe(_0) JNBE(_0) +#define jna(_0) JNA(_0) +#define jbe(_0) JBE(_0) + +#define JL(_0) JMP_(jl, _0) +#define JNGE(_0) JL(_0) +#define JNL(_0) JMP_(jnl, _0) +#define JGE(_0) JNL(_0) + +#define jl(_0) JL(_0) +#define jnge(_0) JNGE(_0) +#define jnl(_0) JNL(_0) +#define jge(_0) JGE(_0) + +#define JG(_0) JMP_(jg, _0) +#define JNLE(_0) JG(_0) +#define JNG(_0) JMP_(jng, _0) +#define JLE(_0) JNG(_0) + +#define jg(_0) JG(_0) +#define jnle(_0) JNLE(_0) +#define jng(_0) JNG(_0) +#define jle(_0) JLE(_0) + +#define JE(_0) JMP_(je, _0) +#define JZ(_0) JE(_0) +#define JNE(_0) JMP_(jne, _0) +#define JNZ(_0) JNE(_0) + +#define je(_0) JE(_0) +#define jz(_0) JZ(_0) +#define jne(_0) JNE(_0) +#define jnz(_0) JNZ(_0) + +#define JMP(_0) JMP_(jmp, _0) + +#define jmp(_0) JMP(_0) + +#define SETE(_0) INSTR_(sete, _0) +#define SETZ(_0) SETE(_0) + +#define sete(_0) SETE(_0) +#define setz(_0) SETZ(_0) + +// Comparisons + +#define CMP(_0, _1) INSTR_(cmp, _0, _1) +#define TEST(_0, _1) INSTR_(test, _0, _1) + 
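+// Usage sketch (illustrative comment, not a definition in this header):
+// operands are always written in Intel (destination-first) order, and
+// INSTR_ reorders them for the dialect in use. Under the default
+// BLIS_ASM_SYNTAX_INTEL with GCC extended asm, MOV(RAX, MEM(RBX, 0x20))
+// emits the AT&T instruction "mov 0x20(%rbx),%rax", while in an
+// MSVC-style __asm block the same macro yields "mov rax, [rbx + 0x20]".
+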
+#define cmp(_0, _1) CMP(_0, _1)
+#define test(_0, _1) TEST(_0, _1)
+
+// Integer math
+
+#define AND(_0, _1) INSTR_(and, _0, _1)
+#define OR(_0, _1) INSTR_(or, _0, _1)
+#define XOR(_0, _1) INSTR_(xor, _0, _1)
+#define ADD(_0, _1) INSTR_(add, _0, _1)
+#define SUB(_0, _1) INSTR_(sub, _0, _1)
+#define SAL(...) INSTR_(sal, __VA_ARGS__)
+#define SAR(...) INSTR_(sar, __VA_ARGS__)
+#define SHLX(_0, _1, _2) INSTR_(shlx, _0, _1, _2)
+#define SHRX(_0, _1, _2) INSTR_(shrx, _0, _1, _2)
+#define DEC(_0) INSTR_(dec, _0)
+#define INC(_0) INSTR_(inc, _0)
+
+#define and(_0, _1) AND(_0, _1)
+#define or(_0, _1) OR(_0, _1)
+#define xor(_0, _1) XOR(_0, _1)
+#define add(_0, _1) ADD(_0, _1)
+#define sub(_0, _1) SUB(_0, _1)
+#define sal(...) SAL(__VA_ARGS__)
+#define sar(...) SAR(__VA_ARGS__)
+#define shlx(_0, _1, _2) SHLX(_0, _1, _2)
+#define shrx(_0, _1, _2) SHRX(_0, _1, _2)
+#define dec(_0) DEC(_0)
+#define inc(_0) INC(_0)
+
+// Memory access
+
+#define LEA(_0, _1) INSTR_(lea, _0, _1)
+#define MOV(_0, _1) INSTR_(mov, _0, _1)
+#define MOVD(_0, _1) INSTR_(movd, _0, _1)
+#define MOVL(_0, _1) INSTR_(movl, _0, _1)
+#define MOVQ(_0, _1) INSTR_(movq, _0, _1)
+
+#define lea(_0, _1) LEA(_0, _1)
+#define mov(_0, _1) MOV(_0, _1)
+#define movd(_0, _1) MOVD(_0, _1)
+#define movl(_0, _1) MOVL(_0, _1)
+#define movq(_0, _1) MOVQ(_0, _1)
+
+// Vector moves
+
+#define MOVSS(_0, _1) INSTR_(movss, _0, _1)
+#define MOVSD(_0, _1) INSTR_(movsd, _0, _1)
+#define MOVAPS(_0, _1) INSTR_(movaps, _0, _1)
+#define MOVAPD(_0, _1) INSTR_(movaps, _0, _1) // use movaps because it is shorter
+#define MOVDDUP(_0, _1) INSTR_(movddup, _0, _1)
+#define MOVLPS(_0, _1) INSTR_(movlps, _0, _1)
+#define MOVHPS(_0, _1) INSTR_(movhps, _0, _1)
+#define MOVLPD(_0, _1) INSTR_(movlpd, _0, _1)
+#define MOVHPD(_0, _1) INSTR_(movhpd, _0, _1)
+
+#define movss(_0, _1) MOVSS(_0, _1)
+#define movsd(_0, _1) MOVSD(_0, _1)
+#define movaps(_0, _1) MOVAPS(_0, _1)
+#define movapd(_0, _1) MOVAPD(_0, _1)
+#define movddup(_0, _1) MOVDDUP(_0, _1)
+#define movlps(_0, _1) MOVLPS(_0, _1)
+#define movhps(_0, _1) MOVHPS(_0, _1)
+#define movlpd(_0, _1) MOVLPD(_0, _1)
+#define movhpd(_0, _1) MOVHPD(_0, _1)
+
+#define VMOVDDUP(_0, _1) INSTR_(vmovddup, _0, _1)
+#define VMOVSLDUP(_0, _1) INSTR_(vmovsldup, _0, _1)
+#define VMOVSHDUP(_0, _1) INSTR_(vmovshdup, _0, _1)
+#define VMOVD(_0, _1) INSTR_(vmovd, _0, _1)
+#define VMOVQ(_0, _1) INSTR_(vmovq, _0, _1)
+#define VMOVSS(_0, _1) INSTR_(vmovss, _0, _1)
+#define VMOVSD(_0, _1) INSTR_(vmovsd, _0, _1)
+#define VMOVAPS(_0, _1) INSTR_(vmovaps, _0, _1)
+#define VMOVUPS(_0, _1) INSTR_(vmovups, _0, _1)
+#define VMOVAPD(_0, _1) INSTR_(vmovapd, _0, _1)
+#define VMOVUPD(_0, _1) INSTR_(vmovupd, _0, _1)
+#define VMOVLPS(...) INSTR_(vmovlps, __VA_ARGS__)
+#define VMOVHPS(...) INSTR_(vmovhps, __VA_ARGS__)
+#define VMOVLPD(...) INSTR_(vmovlpd, __VA_ARGS__)
+#define VMOVHPD(...) 
INSTR_(vmovhpd, __VA_ARGS__)
+#define VMOVDQA(_0, _1) INSTR_(vmovdqa, _0, _1)
+#define VMOVDQA32(_0, _1) INSTR_(vmovdqa32, _0, _1)
+#define VMOVDQA64(_0, _1) INSTR_(vmovdqa64, _0, _1)
+#define VBROADCASTSS(_0, _1) INSTR_(vbroadcastss, _0, _1)
+#define VBROADCASTSD(_0, _1) INSTR_(vbroadcastsd, _0, _1)
+#define VPBROADCASTD(_0, _1) INSTR_(vpbroadcastd, _0, _1)
+#define VPBROADCASTQ(_0, _1) INSTR_(vpbroadcastq, _0, _1)
+#define VBROADCASTF128(_0, _1) INSTR_(vbroadcastf128, _0, _1)
+#define VBROADCASTF64X4(_0, _1) INSTR_(vbroadcastf64x4, _0, _1)
+#define VGATHERDPS(_0, _1) INSTR_(vgatherdps, _0, _1)
+#define VSCATTERDPS(_0, _1) INSTR_(vscatterdps, _0, _1)
+#define VGATHERDPD(_0, _1) INSTR_(vgatherdpd, _0, _1)
+#define VSCATTERDPD(_0, _1) INSTR_(vscatterdpd, _0, _1)
+#define VGATHERQPS(_0, _1) INSTR_(vgatherqps, _0, _1)
+#define VSCATTERQPS(_0, _1) INSTR_(vscatterqps, _0, _1)
+#define VGATHERQPD(_0, _1) INSTR_(vgatherqpd, _0, _1)
+#define VSCATTERQPD(_0, _1) INSTR_(vscatterqpd, _0, _1)
+
+#define vmovddup(_0, _1) VMOVDDUP(_0, _1)
+#define vmovsldup(_0, _1) VMOVSLDUP(_0, _1)
+#define vmovshdup(_0, _1) VMOVSHDUP(_0, _1)
+#define vmovd(_0, _1) VMOVD(_0, _1)
+#define vmovq(_0, _1) VMOVQ(_0, _1)
+#define vmovss(_0, _1) VMOVSS(_0, _1)
+#define vmovsd(_0, _1) VMOVSD(_0, _1)
+#define vmovaps(_0, _1) VMOVAPS(_0, _1)
+#define vmovups(_0, _1) VMOVUPS(_0, _1)
+#define vmovapd(_0, _1) VMOVAPD(_0, _1)
+#define vmovupd(_0, _1) VMOVUPD(_0, _1)
+#define vmovlps(...) VMOVLPS(__VA_ARGS__)
+#define vmovhps(...) VMOVHPS(__VA_ARGS__)
+#define vmovlpd(...) VMOVLPD(__VA_ARGS__)
+#define vmovhpd(...) VMOVHPD(__VA_ARGS__)
+#define vmovdqa(_0, _1) VMOVDQA(_0, _1)
+#define vmovdqa32(_0, _1) VMOVDQA32(_0, _1)
+#define vmovdqa64(_0, _1) VMOVDQA64(_0, _1)
+#define vbroadcastss(_0, _1) VBROADCASTSS(_0, _1)
+#define vbroadcastsd(_0, _1) VBROADCASTSD(_0, _1)
+#define vpbroadcastd(_0, _1) VPBROADCASTD(_0, _1)
+#define vpbroadcastq(_0, _1) VPBROADCASTQ(_0, _1)
+#define vbroadcastf128(_0, _1) VBROADCASTF128(_0, _1)
+#define vbroadcastf64x4(_0, _1) VBROADCASTF64X4(_0, _1)
+#define vgatherdps(_0, _1) VGATHERDPS(_0, _1)
+#define vscatterdps(_0, _1) VSCATTERDPS(_0, _1)
+#define vgatherdpd(_0, _1) VGATHERDPD(_0, _1)
+#define vscatterdpd(_0, _1) VSCATTERDPD(_0, _1)
+#define vgatherqps(_0, _1) VGATHERQPS(_0, _1)
+#define vscatterqps(_0, _1) VSCATTERQPS(_0, _1)
+#define vgatherqpd(_0, _1) VGATHERQPD(_0, _1)
+#define vscatterqpd(_0, _1) VSCATTERQPD(_0, _1)
+
+// Vector math
+
+#define ADDPS(_0, _1) INSTR_(addps, _0, _1)
+#define ADDPD(_0, _1) INSTR_(addpd, _0, _1)
+#define SUBPS(_0, _1) INSTR_(subps, _0, _1)
+#define SUBPD(_0, _1) INSTR_(subpd, _0, _1)
+#define MULPS(_0, _1) INSTR_(mulps, _0, _1)
+#define MULPD(_0, _1) INSTR_(mulpd, _0, _1)
+#define XORPS(_0, _1) INSTR_(xorps, _0, _1)
+#define XORPD(_0, _1) INSTR_(xorpd, _0, _1)
+#define UCOMISS(_0, _1) INSTR_(ucomiss, _0, _1)
+#define UCOMISD(_0, _1) INSTR_(ucomisd, _0, _1)
+#define COMISS(_0, _1) INSTR_(comiss, _0, _1)
+#define COMISD(_0, _1) INSTR_(comisd, _0, _1)
+
+#define addps(_0, _1) ADDPS(_0, _1)
+#define addpd(_0, _1) ADDPD(_0, _1)
+#define subps(_0, _1) SUBPS(_0, _1)
+#define subpd(_0, _1) SUBPD(_0, _1)
+#define mulps(_0, _1) MULPS(_0, _1)
+#define mulpd(_0, _1) MULPD(_0, _1)
+#define xorps(_0, _1) XORPS(_0, _1)
+#define xorpd(_0, _1) XORPD(_0, _1)
+#define ucomiss(_0, _1) UCOMISS(_0, _1)
+#define ucomisd(_0, _1) UCOMISD(_0, _1)
+#define comiss(_0, _1) COMISS(_0, _1)
+#define comisd(_0, _1) COMISD(_0, _1)
+
+#define VADDSUBPS(_0, _1, _2) INSTR_(vaddsubps, _0, _1, _2)
+#define 
VADDSUBPD(_0, _1, _2) INSTR_(vaddsubpd, _0, _1, _2) +#define VUCOMISS(_0, _1) INSTR_(vucomiss, _0, _1) +#define VUCOMISD(_0, _1) INSTR_(vucomisd, _0, _1) +#define VCOMISS(_0, _1) INSTR_(vcomiss, _0, _1) +#define VCOMISD(_0, _1) INSTR_(vcomisd, _0, _1) +#define VADDPS(_0, _1, _2) INSTR_(vaddps, _0, _1, _2) +#define VADDPD(_0, _1, _2) INSTR_(vaddpd, _0, _1, _2) +#define VSUBPS(_0, _1, _2) INSTR_(vsubps, _0, _1, _2) +#define VSUBPD(_0, _1, _2) INSTR_(vsubpd, _0, _1, _2) +#define VMULSS(_0, _1, _2) INSTR_(vmulss, _0, _1, _2) +#define VMULSD(_0, _1, _2) INSTR_(vmulsd, _0, _1, _2) +#define VMULPS(_0, _1, _2) INSTR_(vmulps, _0, _1, _2) +#define VMULPD(_0, _1, _2) INSTR_(vmulpd, _0, _1, _2) +#define VPMULLD(_0, _1, _2) INSTR_(vpmulld, _0, _1, _2) +#define VPMULLQ(_0, _1, _2) INSTR_(vpmullq, _0, _1, _2) +#define VPADDD(_0, _1, _2) INSTR_(vpaddd, _0, _1, _2) +#define VPSLLD(_0, _1, _2) INSTR_(vpslld, _0, _1, _2) +#define VXORPS(_0, _1, _2) INSTR_(vxorps, _0, _1, _2) +#define VXORPD(_0, _1, _2) INSTR_(vxorpd, _0, _1, _2) +#define VPXORD(_0, _1, _2) INSTR_(vpxord, _0, _1, _2) +#define VFMADD132SS(_0, _1, _2) INSTR_(vfmadd132ss, _0, _1, _2) +#define VFMADD213SS(_0, _1, _2) INSTR_(vfmadd213ss, _0, _1, _2) +#define VFMADD231SS(_0, _1, _2) INSTR_(vfmadd231ss, _0, _1, _2) +#define VFMADD132SD(_0, _1, _2) INSTR_(vfmadd132sd, _0, _1, _2) +#define VFMADD213SD(_0, _1, _2) INSTR_(vfmadd213sd, _0, _1, _2) +#define VFMADD231SD(_0, _1, _2) INSTR_(vfmadd231sd, _0, _1, _2) +#define VFMADD132PS(_0, _1, _2) INSTR_(vfmadd132ps, _0, _1, _2) +#define VFMADD213PS(_0, _1, _2) INSTR_(vfmadd213ps, _0, _1, _2) +#define VFMADD231PS(_0, _1, _2) INSTR_(vfmadd231ps, _0, _1, _2) +#define VFMADD132PD(_0, _1, _2) INSTR_(vfmadd132pd, _0, _1, _2) +#define VFMADD213PD(_0, _1, _2) INSTR_(vfmadd213pd, _0, _1, _2) +#define VFMADD231PD(_0, _1, _2) INSTR_(vfmadd231pd, _0, _1, _2) +#define VFMSUB132SS(_0, _1, _2) INSTR_(vfmsub132ss, _0, _1, _2) +#define VFMSUB213SS(_0, _1, _2) INSTR_(vfmsub213ss, _0, _1, _2) +#define VFMSUB231SS(_0, _1, _2) INSTR_(vfmsub231ss, _0, _1, _2) +#define VFMSUB132SD(_0, _1, _2) INSTR_(vfmsub132sd, _0, _1, _2) +#define VFMSUB213SD(_0, _1, _2) INSTR_(vfmsub213sd, _0, _1, _2) +#define VFMSUB231SD(_0, _1, _2) INSTR_(vfmsub231sd, _0, _1, _2) +#define VFMSUB132PS(_0, _1, _2) INSTR_(vfmsub132ps, _0, _1, _2) +#define VFMSUB213PS(_0, _1, _2) INSTR_(vfmsub213ps, _0, _1, _2) +#define VFMSUB231PS(_0, _1, _2) INSTR_(vfmsub231ps, _0, _1, _2) +#define VFMSUB132PD(_0, _1, _2) INSTR_(vfmsub132pd, _0, _1, _2) +#define VFMSUB213PD(_0, _1, _2) INSTR_(vfmsub213pd, _0, _1, _2) +#define VFMSUB231PD(_0, _1, _2) INSTR_(vfmsub231pd, _0, _1, _2) +#define VFNMADD132SS(_0, _1, _2) INSTR_(vfnmadd132ss, _0, _1, _2) +#define VFNMADD213SS(_0, _1, _2) INSTR_(vfnmadd213ss, _0, _1, _2) +#define VFNMADD231SS(_0, _1, _2) INSTR_(vfnmadd231ss, _0, _1, _2) +#define VFNMADD132SD(_0, _1, _2) INSTR_(vfnmadd132sd, _0, _1, _2) +#define VFNMADD213SD(_0, _1, _2) INSTR_(vfnmadd213sd, _0, _1, _2) +#define VFNMADD231SD(_0, _1, _2) INSTR_(vfnmadd231sd, _0, _1, _2) +#define VFNMADD132PS(_0, _1, _2) INSTR_(vfnmadd132ps, _0, _1, _2) +#define VFNMADD213PS(_0, _1, _2) INSTR_(vfnmadd213ps, _0, _1, _2) +#define VFNMADD231PS(_0, _1, _2) INSTR_(vfnmadd231ps, _0, _1, _2) +#define VFNMADD132PD(_0, _1, _2) INSTR_(vfnmadd132pd, _0, _1, _2) +#define VFNMADD213PD(_0, _1, _2) INSTR_(vfnmadd213pd, _0, _1, _2) +#define VFNMADD231PD(_0, _1, _2) INSTR_(vfnmadd231pd, _0, _1, _2) +#define VFNMSUB132SS(_0, _1, _2) INSTR_(vfnmsub132ss, _0, _1, _2) +#define VFNMSUB213SS(_0, _1, _2) 
INSTR_(vfnmsub213ss, _0, _1, _2) +#define VFNMSUB231SS(_0, _1, _2) INSTR_(vfnmsub231ss, _0, _1, _2) +#define VFNMSUB132SD(_0, _1, _2) INSTR_(vfnmsub132sd, _0, _1, _2) +#define VFNMSUB213SD(_0, _1, _2) INSTR_(vfnmsub213sd, _0, _1, _2) +#define VFNMSUB231SD(_0, _1, _2) INSTR_(vfnmsub231sd, _0, _1, _2) +#define VFNMSUB132PS(_0, _1, _2) INSTR_(vfnmsub132ps, _0, _1, _2) +#define VFNMSUB213PS(_0, _1, _2) INSTR_(vfnmsub213ps, _0, _1, _2) +#define VFNMSUB231PS(_0, _1, _2) INSTR_(vfnmsub231ps, _0, _1, _2) +#define VFNMSUB132PD(_0, _1, _2) INSTR_(vfnmsub132pd, _0, _1, _2) +#define VFNMSUB213PD(_0, _1, _2) INSTR_(vfnmsub213pd, _0, _1, _2) +#define VFNMSUB231PD(_0, _1, _2) INSTR_(vfnmsub231pd, _0, _1, _2) +#define VFMADDSUB132SS(_0, _1, _2) INSTR_(vfmaddsub132ss, _0, _1, _2) +#define VFMADDSUB213SS(_0, _1, _2) INSTR_(vfmaddsub213ss, _0, _1, _2) +#define VFMADDSUB231SS(_0, _1, _2) INSTR_(vfmaddsub231ss, _0, _1, _2) +#define VFMADDSUB132SD(_0, _1, _2) INSTR_(vfmaddsub132sd, _0, _1, _2) +#define VFMADDSUB213SD(_0, _1, _2) INSTR_(vfmaddsub213sd, _0, _1, _2) +#define VFMADDSUB231SD(_0, _1, _2) INSTR_(vfmaddsub231sd, _0, _1, _2) +#define VFMADDSUB132PS(_0, _1, _2) INSTR_(vfmaddsub132ps, _0, _1, _2) +#define VFMADDSUB213PS(_0, _1, _2) INSTR_(vfmaddsub213ps, _0, _1, _2) +#define VFMADDSUB231PS(_0, _1, _2) INSTR_(vfmaddsub231ps, _0, _1, _2) +#define VFMADDSUB132PD(_0, _1, _2) INSTR_(vfmaddsub132pd, _0, _1, _2) +#define VFMADDSUB213PD(_0, _1, _2) INSTR_(vfmaddsub213pd, _0, _1, _2) +#define VFMADDSUB231PD(_0, _1, _2) INSTR_(vfmaddsub231pd, _0, _1, _2) +#define VFMSUBADD132SS(_0, _1, _2) INSTR_(vfmsubadd132ss, _0, _1, _2) +#define VFMSUBADD213SS(_0, _1, _2) INSTR_(vfmsubadd213ss, _0, _1, _2) +#define VFMSUBADD231SS(_0, _1, _2) INSTR_(vfmsubadd231ss, _0, _1, _2) +#define VFMSUBADD132SD(_0, _1, _2) INSTR_(vfmsubadd132sd, _0, _1, _2) +#define VFMSUBADD213SD(_0, _1, _2) INSTR_(vfmsubadd213sd, _0, _1, _2) +#define VFMSUBADD231SD(_0, _1, _2) INSTR_(vfmsubadd231sd, _0, _1, _2) +#define VFMSUBADD132PS(_0, _1, _2) INSTR_(vfmsubadd132ps, _0, _1, _2) +#define VFMSUBADD213PS(_0, _1, _2) INSTR_(vfmsubadd213ps, _0, _1, _2) +#define VFMSUBADD231PS(_0, _1, _2) INSTR_(vfmsubadd231ps, _0, _1, _2) +#define VFMSUBADD132PD(_0, _1, _2) INSTR_(vfmsubadd132pd, _0, _1, _2) +#define VFMSUBADD213PD(_0, _1, _2) INSTR_(vfmsubadd213pd, _0, _1, _2) +#define VFMSUBADD231PD(_0, _1, _2) INSTR_(vfmsubadd231pd, _0, _1, _2) +#define VFMADDSS(_0, _1, _2, _3) INSTR_(vfmaddss, _0, _1, _2, _3) +#define VFMADDSD(_0, _1, _2, _3) INSTR_(vfmaddsd, _0, _1, _2, _3) +#define VFMADDPS(_0, _1, _2, _3) INSTR_(vfmaddps, _0, _1, _2, _3) +#define VFMADDPD(_0, _1, _2, _3) INSTR_(vfmaddpd, _0, _1, _2, _3) +#define VFMSUBSS(_0, _1, _2, _3) INSTR_(vfmsubss, _0, _1, _2, _3) +#define VFMSUBSD(_0, _1, _2, _3) INSTR_(vfmsubsd, _0, _1, _2, _3) +#define VFMSUBPS(_0, _1, _2, _3) INSTR_(vfmsubps, _0, _1, _2, _3) +#define VFMSUBPD(_0, _1, _2, _3) INSTR_(vfmsubpd, _0, _1, _2, _3) +#define VFNMADDSS(_0, _1, _2, _3) INSTR_(vfnmaddss, _0, _1, _2, _3) +#define VFNMADDSD(_0, _1, _2, _3) INSTR_(vfnmaddsd, _0, _1, _2, _3) +#define VFNMADDPS(_0, _1, _2, _3) INSTR_(vfnmaddps, _0, _1, _2, _3) +#define VFNMADDPD(_0, _1, _2, _3) INSTR_(vfnmaddpd, _0, _1, _2, _3) +#define VFNMSUBSS(_0, _1, _2, _3) INSTR_(vfnmsubss, _0, _1, _2, _3) +#define VFNMSUBSD(_0, _1, _2, _3) INSTR_(vfnmsubsd, _0, _1, _2, _3) +#define VFNMSUBPS(_0, _1, _2, _3) INSTR_(vfnmsubps, _0, _1, _2, _3) +#define VFNMSUBPD(_0, _1, _2, _3) INSTR_(vfnmsubpd, _0, _1, _2, _3) +#define VFMADDSUBSS(_0, _1, _2, _3) INSTR_(vfmaddsubss, 
_0, _1, _2, _3)
+#define VFMADDSUBSD(_0, _1, _2, _3) INSTR_(vfmaddsubsd, _0, _1, _2, _3)
+#define VFMADDSUBPS(_0, _1, _2, _3) INSTR_(vfmaddsubps, _0, _1, _2, _3)
+#define VFMADDSUBPD(_0, _1, _2, _3) INSTR_(vfmaddsubpd, _0, _1, _2, _3)
+#define VFMSUBADDSS(_0, _1, _2, _3) INSTR_(vfmsubaddss, _0, _1, _2, _3)
+#define VFMSUBADDSD(_0, _1, _2, _3) INSTR_(vfmsubaddsd, _0, _1, _2, _3)
+#define VFMSUBADDPS(_0, _1, _2, _3) INSTR_(vfmsubaddps, _0, _1, _2, _3)
+#define VFMSUBADDPD(_0, _1, _2, _3) INSTR_(vfmsubaddpd, _0, _1, _2, _3)
+#define V4FMADDSS(_0, _1, _2) INSTR_(v4fmaddss, _0, _1, _2)
+#define V4FMADDPS(_0, _1, _2) INSTR_(v4fmaddps, _0, _1, _2)
+#define V4FNMADDSS(_0, _1, _2) INSTR_(v4fnmaddss, _0, _1, _2)
+#define V4FNMADDPS(_0, _1, _2) INSTR_(v4fnmaddps, _0, _1, _2)
+
+#define vaddsubps(_0, _1, _2) VADDSUBPS(_0, _1, _2)
+#define vaddsubpd(_0, _1, _2) VADDSUBPD(_0, _1, _2)
+#define vucomiss(_0, _1) VUCOMISS(_0, _1)
+#define vucomisd(_0, _1) VUCOMISD(_0, _1)
+#define vcomiss(_0, _1) VCOMISS(_0, _1)
+#define vcomisd(_0, _1) VCOMISD(_0, _1)
+#define vaddps(_0, _1, _2) VADDPS(_0, _1, _2)
+#define vaddpd(_0, _1, _2) VADDPD(_0, _1, _2)
+#define vsubps(_0, _1, _2) VSUBPS(_0, _1, _2)
+#define vsubpd(_0, _1, _2) VSUBPD(_0, _1, _2)
+#define vmulss(_0, _1, _2) VMULSS(_0, _1, _2)
+#define vmulps(_0, _1, _2) VMULPS(_0, _1, _2)
+#define vmulsd(_0, _1, _2) VMULSD(_0, _1, _2)
+#define vmulpd(_0, _1, _2) VMULPD(_0, _1, _2)
+#define vpmulld(_0, _1, _2) VPMULLD(_0, _1, _2)
+#define vpmullq(_0, _1, _2) VPMULLQ(_0, _1, _2)
+#define vpaddd(_0, _1, _2) VPADDD(_0, _1, _2)
+#define vpslld(_0, _1, _2) VPSLLD(_0, _1, _2)
+#define vxorps(_0, _1, _2) VXORPS(_0, _1, _2)
+#define vxorpd(_0, _1, _2) VXORPD(_0, _1, _2)
+#define vpxord(_0, _1, _2) VPXORD(_0, _1, _2)
+#define vfmadd132ss(_0, _1, _2) VFMADD132SS(_0, _1, _2)
+#define vfmadd213ss(_0, _1, _2) VFMADD213SS(_0, _1, _2)
+#define vfmadd231ss(_0, _1, _2) VFMADD231SS(_0, _1, _2)
+#define vfmadd132sd(_0, _1, _2) VFMADD132SD(_0, _1, _2)
+#define vfmadd213sd(_0, _1, _2) VFMADD213SD(_0, _1, _2)
+#define vfmadd231sd(_0, _1, _2) VFMADD231SD(_0, _1, _2)
+#define vfmadd132ps(_0, _1, _2) VFMADD132PS(_0, _1, _2)
+#define vfmadd213ps(_0, _1, _2) VFMADD213PS(_0, _1, _2)
+#define vfmadd231ps(_0, _1, _2) VFMADD231PS(_0, _1, _2)
+#define vfmadd132pd(_0, _1, _2) VFMADD132PD(_0, _1, _2)
+#define vfmadd213pd(_0, _1, _2) VFMADD213PD(_0, _1, _2)
+#define vfmadd231pd(_0, _1, _2) VFMADD231PD(_0, _1, _2)
+#define vfmsub132ss(_0, _1, _2) VFMSUB132SS(_0, _1, _2)
+#define vfmsub213ss(_0, _1, _2) VFMSUB213SS(_0, _1, _2)
+#define vfmsub231ss(_0, _1, _2) VFMSUB231SS(_0, _1, _2)
+#define vfmsub132sd(_0, _1, _2) VFMSUB132SD(_0, _1, _2)
+#define vfmsub213sd(_0, _1, _2) VFMSUB213SD(_0, _1, _2)
+#define vfmsub231sd(_0, _1, _2) VFMSUB231SD(_0, _1, _2)
+#define vfmsub132ps(_0, _1, _2) VFMSUB132PS(_0, _1, _2)
+#define vfmsub213ps(_0, _1, _2) VFMSUB213PS(_0, _1, _2)
+#define vfmsub231ps(_0, _1, _2) VFMSUB231PS(_0, _1, _2)
+#define vfmsub132pd(_0, _1, _2) VFMSUB132PD(_0, _1, _2)
+#define vfmsub213pd(_0, _1, _2) VFMSUB213PD(_0, _1, _2)
+#define vfmsub231pd(_0, _1, _2) VFMSUB231PD(_0, _1, _2)
+#define vfnmadd132ss(_0, _1, _2) VFNMADD132SS(_0, _1, _2)
+#define vfnmadd213ss(_0, _1, _2) VFNMADD213SS(_0, _1, _2)
+#define vfnmadd231ss(_0, _1, _2) VFNMADD231SS(_0, _1, _2)
+#define vfnmadd132sd(_0, _1, _2) VFNMADD132SD(_0, _1, _2)
+#define vfnmadd213sd(_0, _1, _2) VFNMADD213SD(_0, _1, _2)
+#define vfnmadd231sd(_0, _1, _2) VFNMADD231SD(_0, _1, _2)
+#define vfnmadd132ps(_0, _1, _2) VFNMADD132PS(_0, _1, _2)
+#define 
vfnmadd213ps(_0, _1, _2) VFNMADD213PS(_0, _1, _2)
+#define vfnmadd231ps(_0, _1, _2) VFNMADD231PS(_0, _1, _2)
+#define vfnmadd132pd(_0, _1, _2) VFNMADD132PD(_0, _1, _2)
+#define vfnmadd213pd(_0, _1, _2) VFNMADD213PD(_0, _1, _2)
+#define vfnmadd231pd(_0, _1, _2) VFNMADD231PD(_0, _1, _2)
+#define vfnmsub132ss(_0, _1, _2) VFNMSUB132SS(_0, _1, _2)
+#define vfnmsub213ss(_0, _1, _2) VFNMSUB213SS(_0, _1, _2)
+#define vfnmsub231ss(_0, _1, _2) VFNMSUB231SS(_0, _1, _2)
+#define vfnmsub132sd(_0, _1, _2) VFNMSUB132SD(_0, _1, _2)
+#define vfnmsub213sd(_0, _1, _2) VFNMSUB213SD(_0, _1, _2)
+#define vfnmsub231sd(_0, _1, _2) VFNMSUB231SD(_0, _1, _2)
+#define vfnmsub132ps(_0, _1, _2) VFNMSUB132PS(_0, _1, _2)
+#define vfnmsub213ps(_0, _1, _2) VFNMSUB213PS(_0, _1, _2)
+#define vfnmsub231ps(_0, _1, _2) VFNMSUB231PS(_0, _1, _2)
+#define vfnmsub132pd(_0, _1, _2) VFNMSUB132PD(_0, _1, _2)
+#define vfnmsub213pd(_0, _1, _2) VFNMSUB213PD(_0, _1, _2)
+#define vfnmsub231pd(_0, _1, _2) VFNMSUB231PD(_0, _1, _2)
+#define vfmaddsub132ss(_0, _1, _2) VFMADDSUB132SS(_0, _1, _2)
+#define vfmaddsub213ss(_0, _1, _2) VFMADDSUB213SS(_0, _1, _2)
+#define vfmaddsub231ss(_0, _1, _2) VFMADDSUB231SS(_0, _1, _2)
+#define vfmaddsub132sd(_0, _1, _2) VFMADDSUB132SD(_0, _1, _2)
+#define vfmaddsub213sd(_0, _1, _2) VFMADDSUB213SD(_0, _1, _2)
+#define vfmaddsub231sd(_0, _1, _2) VFMADDSUB231SD(_0, _1, _2)
+#define vfmaddsub132ps(_0, _1, _2) VFMADDSUB132PS(_0, _1, _2)
+#define vfmaddsub213ps(_0, _1, _2) VFMADDSUB213PS(_0, _1, _2)
+#define vfmaddsub231ps(_0, _1, _2) VFMADDSUB231PS(_0, _1, _2)
+#define vfmaddsub132pd(_0, _1, _2) VFMADDSUB132PD(_0, _1, _2)
+#define vfmaddsub213pd(_0, _1, _2) VFMADDSUB213PD(_0, _1, _2)
+#define vfmaddsub231pd(_0, _1, _2) VFMADDSUB231PD(_0, _1, _2)
+#define vfmsubadd132ss(_0, _1, _2) VFMSUBADD132SS(_0, _1, _2)
+#define vfmsubadd213ss(_0, _1, _2) VFMSUBADD213SS(_0, _1, _2)
+#define vfmsubadd231ss(_0, _1, _2) VFMSUBADD231SS(_0, _1, _2)
+#define vfmsubadd132sd(_0, _1, _2) VFMSUBADD132SD(_0, _1, _2)
+#define vfmsubadd213sd(_0, _1, _2) VFMSUBADD213SD(_0, _1, _2)
+#define vfmsubadd231sd(_0, _1, _2) VFMSUBADD231SD(_0, _1, _2)
+#define vfmsubadd132ps(_0, _1, _2) VFMSUBADD132PS(_0, _1, _2)
+#define vfmsubadd213ps(_0, _1, _2) VFMSUBADD213PS(_0, _1, _2)
+#define vfmsubadd231ps(_0, _1, _2) VFMSUBADD231PS(_0, _1, _2)
+#define vfmsubadd132pd(_0, _1, _2) VFMSUBADD132PD(_0, _1, _2)
+#define vfmsubadd213pd(_0, _1, _2) VFMSUBADD213PD(_0, _1, _2)
+#define vfmsubadd231pd(_0, _1, _2) VFMSUBADD231PD(_0, _1, _2)
+#define vfmaddss(_0, _1, _2, _3) VFMADDSS(_0, _1, _2, _3)
+#define vfmaddsd(_0, _1, _2, _3) VFMADDSD(_0, _1, _2, _3)
+#define vfmaddps(_0, _1, _2, _3) VFMADDPS(_0, _1, _2, _3)
+#define vfmaddpd(_0, _1, _2, _3) VFMADDPD(_0, _1, _2, _3)
+#define vfmsubss(_0, _1, _2, _3) VFMSUBSS(_0, _1, _2, _3)
+#define vfmsubsd(_0, _1, _2, _3) VFMSUBSD(_0, _1, _2, _3)
+#define vfmsubps(_0, _1, _2, _3) VFMSUBPS(_0, _1, _2, _3)
+#define vfmsubpd(_0, _1, _2, _3) VFMSUBPD(_0, _1, _2, _3)
+#define vfnmaddss(_0, _1, _2, _3) VFNMADDSS(_0, _1, _2, _3)
+#define vfnmaddsd(_0, _1, _2, _3) VFNMADDSD(_0, _1, _2, _3)
+#define vfnmaddps(_0, _1, _2, _3) VFNMADDPS(_0, _1, _2, _3)
+#define vfnmaddpd(_0, _1, _2, _3) VFNMADDPD(_0, _1, _2, _3)
+#define vfnmsubss(_0, _1, _2, _3) VFNMSUBSS(_0, _1, _2, _3)
+#define vfnmsubsd(_0, _1, _2, _3) VFNMSUBSD(_0, _1, _2, _3)
+#define vfnmsubps(_0, _1, _2, _3) VFNMSUBPS(_0, _1, _2, _3)
+#define vfnmsubpd(_0, _1, _2, _3) VFNMSUBPD(_0, _1, _2, _3)
+#define vfmaddsubss(_0, _1, _2, _3) VFMADDSUBSS(_0, _1, _2, _3)
+#define 
vfmaddsubsd(_0, _1, _2, _3) VFMADDSUBSD(_0, _1, _2, _3)
+#define vfmaddsubps(_0, _1, _2, _3) VFMADDSUBPS(_0, _1, _2, _3)
+#define vfmaddsubpd(_0, _1, _2, _3) VFMADDSUBPD(_0, _1, _2, _3)
+#define vfmsubaddss(_0, _1, _2, _3) VFMSUBADDSS(_0, _1, _2, _3)
+#define vfmsubaddsd(_0, _1, _2, _3) VFMSUBADDSD(_0, _1, _2, _3)
+#define vfmsubaddps(_0, _1, _2, _3) VFMSUBADDPS(_0, _1, _2, _3)
+#define vfmsubaddpd(_0, _1, _2, _3) VFMSUBADDPD(_0, _1, _2, _3)
+#define v4fmaddss(_0, _1, _2) V4FMADDSS(_0, _1, _2)
+#define v4fmaddps(_0, _1, _2) V4FMADDPS(_0, _1, _2)
+#define v4fnmaddss(_0, _1, _2) V4FNMADDSS(_0, _1, _2)
+#define v4fnmaddps(_0, _1, _2) V4FNMADDPS(_0, _1, _2)
+
+// Vector shuffles
+
+#define PSHUFD(_0, _1, _2) INSTR_(pshufd, _0, _1, _2)
+#define SHUFPS(_0, _1, _2) INSTR_(shufps, _0, _1, _2)
+#define SHUFPD(_0, _1, _2) INSTR_(shufpd, _0, _1, _2)
+#define UNPCKLPS(_0, _1) INSTR_(unpcklps, _0, _1)
+#define UNPCKHPS(_0, _1) INSTR_(unpckhps, _0, _1)
+#define UNPCKLPD(_0, _1) INSTR_(unpcklpd, _0, _1)
+#define UNPCKHPD(_0, _1) INSTR_(unpckhpd, _0, _1)
+
+#define pshufd(_0, _1, _2) PSHUFD(_0, _1, _2)
+#define shufps(_0, _1, _2) SHUFPS(_0, _1, _2)
+#define shufpd(_0, _1, _2) SHUFPD(_0, _1, _2)
+#define unpcklps(_0, _1) UNPCKLPS(_0, _1)
+#define unpckhps(_0, _1) UNPCKHPS(_0, _1)
+#define unpcklpd(_0, _1) UNPCKLPD(_0, _1)
+#define unpckhpd(_0, _1) UNPCKHPD(_0, _1)
+
+#define VSHUFPS(_0, _1, _2, _3) INSTR_(vshufps, _0, _1, _2, _3)
+#define VSHUFPD(_0, _1, _2, _3) INSTR_(vshufpd, _0, _1, _2, _3)
+#define VPERMILPS(_0, _1, _2) INSTR_(vpermilps, _0, _1, _2)
+#define VPERMILPD(_0, _1, _2) INSTR_(vpermilpd, _0, _1, _2)
+#define VPERM2F128(_0, _1, _2, _3) INSTR_(vperm2f128, _0, _1, _2, _3)
+#define VPERMPD(_0, _1, _2) INSTR_(vpermpd, _0, _1, _2)
+#define VUNPCKLPS(_0, _1, _2) INSTR_(vunpcklps, _0, _1, _2)
+#define VUNPCKHPS(_0, _1, _2) INSTR_(vunpckhps, _0, _1, _2)
+#define VUNPCKLPD(_0, _1, _2) INSTR_(vunpcklpd, _0, _1, _2)
+#define VUNPCKHPD(_0, _1, _2) INSTR_(vunpckhpd, _0, _1, _2)
+#define VSHUFF32X4(_0, _1, _2, _3) INSTR_(vshuff32x4, _0, _1, _2, _3)
+#define VSHUFF64X2(_0, _1, _2, _3) INSTR_(vshuff64x2, _0, _1, _2, _3)
+#define VINSERTF128(_0, _1, _2, _3) INSTR_(vinsertf128, _0, _1, _2, _3)
+#define VINSERTF32X4(_0, _1, _2, _3) INSTR_(vinsertf32x4, _0, _1, _2, _3)
+#define VINSERTF32X8(_0, _1, _2, _3) INSTR_(vinsertf32x8, _0, _1, _2, _3)
+#define VINSERTF64X2(_0, _1, _2, _3) INSTR_(vinsertf64x2, _0, _1, _2, _3)
+#define VINSERTF64X4(_0, _1, _2, _3) INSTR_(vinsertf64x4, _0, _1, _2, _3)
+#define VEXTRACTF128(_0, _1, _2) INSTR_(vextractf128, _0, _1, _2)
+#define VEXTRACTF32X4(_0, _1, _2) INSTR_(vextractf32x4, _0, _1, _2)
+#define VEXTRACTF32X8(_0, _1, _2) INSTR_(vextractf32x8, _0, _1, _2)
+#define VEXTRACTF64X2(_0, _1, _2) INSTR_(vextractf64x2, _0, _1, _2)
+#define VEXTRACTF64X4(_0, _1, _2) INSTR_(vextractf64x4, _0, _1, _2)
+#define VBLENDPS(_0, _1, _2, _3) INSTR_(vblendps, _0, _1, _2, _3)
+#define VBLENDPD(_0, _1, _2, _3) INSTR_(vblendpd, _0, _1, _2, _3)
+#define VBLENDMPS(_0, _1, _2) INSTR_(vblendmps, _0, _1, _2)
+#define VBLENDMPD(_0, _1, _2) INSTR_(vblendmpd, _0, _1, _2)
+
+#define vshufps(_0, _1, _2, _3) VSHUFPS(_0, _1, _2, _3)
+#define vshufpd(_0, _1, _2, _3) VSHUFPD(_0, _1, _2, _3)
+#define vpermilps(_0, _1, _2) VPERMILPS(_0, _1, _2)
+#define vpermilpd(_0, _1, _2) VPERMILPD(_0, _1, _2)
+#define vperm2f128(_0, _1, _2, _3) VPERM2F128(_0, _1, _2, _3)
+#define vpermpd(_0, _1, _2) VPERMPD(_0, _1, _2)
+#define vunpcklps(_0, _1, _2) VUNPCKLPS(_0, _1, _2)
+#define vunpckhps(_0, _1, _2) VUNPCKHPS(_0, _1, _2) 
+#define vunpcklpd(_0, _1, _2) VUNPCKLPD(_0, _1, _2)
+#define vunpckhpd(_0, _1, _2) VUNPCKHPD(_0, _1, _2)
+#define vshuff32x4(_0, _1, _2, _3) VSHUFF32X4(_0, _1, _2, _3)
+#define vshuff64x2(_0, _1, _2, _3) VSHUFF64X2(_0, _1, _2, _3)
+#define vinsertf128(_0, _1, _2, _3) VINSERTF128(_0, _1, _2, _3)
+#define vinsertf32x4(_0, _1, _2, _3) VINSERTF32X4(_0, _1, _2, _3)
+#define vinsertf32x8(_0, _1, _2, _3) VINSERTF32X8(_0, _1, _2, _3)
+#define vinsertf64x2(_0, _1, _2, _3) VINSERTF64X2(_0, _1, _2, _3)
+#define vinsertf64x4(_0, _1, _2, _3) VINSERTF64X4(_0, _1, _2, _3)
+#define vextractf128(_0, _1, _2) VEXTRACTF128(_0, _1, _2)
+#define vextractf32x4(_0, _1, _2) VEXTRACTF32X4(_0, _1, _2)
+#define vextractf32x8(_0, _1, _2) VEXTRACTF32X8(_0, _1, _2)
+#define vextractf64x2(_0, _1, _2) VEXTRACTF64X2(_0, _1, _2)
+#define vextractf64x4(_0, _1, _2) VEXTRACTF64X4(_0, _1, _2)
+#define vblendps(_0, _1, _2, _3) VBLENDPS(_0, _1, _2, _3)
+#define vblendpd(_0, _1, _2, _3) VBLENDPD(_0, _1, _2, _3)
+#define vblendmps(_0, _1, _2) VBLENDMPS(_0, _1, _2)
+#define vblendmpd(_0, _1, _2) VBLENDMPD(_0, _1, _2)
+
+// Prefetches
+
+#define PREFETCH(_0, _1) INSTR_(prefetcht##_0, _1)
+#define PREFETCHW0(_0) INSTR_(prefetchw, _0)
+#define PREFETCHW1(_0) INSTR_(prefetchwt1, _0)
+#define VGATHERPFDPS(_0, _1) INSTR_(vgatherpf##_0##dps, _1)
+#define VSCATTERPFDPS(_0, _1) INSTR_(vscatterpf##_0##dps, _1)
+#define VGATHERPFDPD(_0, _1) INSTR_(vgatherpf##_0##dpd, _1)
+#define VSCATTERPFDPD(_0, _1) INSTR_(vscatterpf##_0##dpd, _1)
+#define VGATHERPFQPS(_0, _1) INSTR_(vgatherpf##_0##qps, _1)
+#define VSCATTERPFQPS(_0, _1) INSTR_(vscatterpf##_0##qps, _1)
+#define VGATHERPFQPD(_0, _1) INSTR_(vgatherpf##_0##qpd, _1)
+#define VSCATTERPFQPD(_0, _1) INSTR_(vscatterpf##_0##qpd, _1)
+
+#define prefetch(_0, _1) PREFETCH(_0, _1)
+#define prefetchw0(_0) PREFETCHW0(_0)
+#define prefetchw1(_0) PREFETCHW1(_0)
+#define vgatherpfdps(_0, _1) VGATHERPFDPS(_0, _1)
+#define vscatterpfdps(_0, _1) VSCATTERPFDPS(_0, _1)
+#define vgatherpfdpd(_0, _1) VGATHERPFDPD(_0, _1)
+#define vscatterpfdpd(_0, _1) VSCATTERPFDPD(_0, _1)
+#define vgatherpfqps(_0, _1) VGATHERPFQPS(_0, _1)
+#define vscatterpfqps(_0, _1) VSCATTERPFQPS(_0, _1)
+#define vgatherpfqpd(_0, _1) VGATHERPFQPD(_0, _1)
+#define vscatterpfqpd(_0, _1) VSCATTERPFQPD(_0, _1)
+
+// Mask operations
+
+#ifdef __MIC__
+
+#define KMOVW(_0, _1) INSTR_(kmov, _0, _1)
+#define JKNZD(_0, _1) INSTR_(jknzd, _0, _1)
+
+#else
+
+#define KMOVW(_0, _1) INSTR_(kmovw, _0, _1)
+#define JKNZD(_0, _1) INSTR_(kortestw, _0, _0) JNZ(_1)
+
+#endif
+
+#define KXNORW(_0, _1, _2) INSTR_(kxnorw, _0, _1, _2)
+#define KSHIFTRW(_0, _1, _2) INSTR_(kshiftrw, _0, _1, _2)
+
+#define kmovw(_0, _1) KMOVW(_0, _1)
+#define jknzd(_0, _1) JKNZD(_0, _1)
+#define kxnorw(_0, _1, _2) KXNORW(_0, _1, _2)
+#define kshiftrw(_0, _1, _2) KSHIFTRW(_0, _1, _2)
+
+// Other
+
+#define RDTSC() INSTR_(rdtsc)
+#define VZEROALL() INSTR_(vzeroall)
+#define VZEROUPPER() INSTR_(vzeroupper)
+
+#define rdtsc() RDTSC()
+#define vzeroall() VZEROALL()
+#define vzeroupper() VZEROUPPER()
+
+#endif
diff --git a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
index a7e893ff5..059d54e64 100644
--- a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
+++ b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c
@@ -34,55 +34,58 @@
 
 #include "blis.h"
 
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
 #define GROUP_YMM_BY_4 \
- "vmovaps %%ymm15, %%ymm7 \n\t"\
- "vshufps $0xe4, %%ymm13, %%ymm15, %%ymm15 
\n\t"\ - "vshufps $0xe4, %%ymm7, %%ymm13, %%ymm13 \n\t"\ - " \n\t"\ - "vmovaps %%ymm11, %%ymm7 \n\t"\ - "vshufps $0xe4, %%ymm9, %%ymm11, %%ymm11 \n\t"\ - "vshufps $0xe4, %%ymm7, %%ymm9, %%ymm9 \n\t"\ - " \n\t"\ - "vmovaps %%ymm14, %%ymm7 \n\t"\ - "vshufps $0xe4, %%ymm12, %%ymm14, %%ymm14 \n\t"\ - "vshufps $0xe4, %%ymm7, %%ymm12, %%ymm12 \n\t"\ - " \n\t"\ - "vmovaps %%ymm10, %%ymm7 \n\t"\ - "vshufps $0xe4, %%ymm8, %%ymm10, %%ymm10 \n\t"\ - "vshufps $0xe4, %%ymm7, %%ymm8, %%ymm8 \n\t"\ - " \n\t"\ - "vmovaps %%ymm15, %%ymm7 \n\t"\ - "vperm2f128 $0x12, %%ymm15, %%ymm11, %%ymm15 \n\t"\ - "vperm2f128 $0x30, %%ymm7, %%ymm11, %%ymm11 \n\t"\ - " \n\t"\ - "vmovaps %%ymm13, %%ymm7 \n\t"\ - "vperm2f128 $0x12, %%ymm13, %%ymm9, %%ymm13 \n\t"\ - "vperm2f128 $0x30, %%ymm7, %%ymm9, %%ymm9 \n\t"\ - " \n\t"\ - "vmovaps %%ymm14, %%ymm7 \n\t"\ - "vperm2f128 $0x12, %%ymm14, %%ymm10, %%ymm14 \n\t"\ - "vperm2f128 $0x30, %%ymm7, %%ymm10, %%ymm10 \n\t"\ - " \n\t"\ - "vmovaps %%ymm12, %%ymm7 \n\t"\ - "vperm2f128 $0x12, %%ymm12, %%ymm8, %%ymm12 \n\t"\ - "vperm2f128 $0x30, %%ymm7, %%ymm8, %%ymm8 \n\t" + vmovaps(ymm15, ymm7)\ + vshufps(imm(0xe4), ymm13, ymm15, ymm15)\ + vshufps(imm(0xe4), ymm7, ymm13, ymm13)\ + \ + vmovaps(ymm11, ymm7)\ + vshufps(imm(0xe4), ymm9, ymm11, ymm11)\ + vshufps(imm(0xe4), ymm7, ymm9, ymm9)\ + \ + vmovaps(ymm14, ymm7)\ + vshufps(imm(0xe4), ymm12, ymm14, ymm14)\ + vshufps(imm(0xe4), ymm7, ymm12, ymm12)\ + \ + vmovaps(ymm10, ymm7)\ + vshufps(imm(0xe4), ymm8, ymm10, ymm10)\ + vshufps(imm(0xe4), ymm7, ymm8, ymm8)\ + \ + vmovaps(ymm15, ymm7)\ + vperm2f128(imm(0x12), ymm15, ymm11, ymm15)\ + vperm2f128(imm(0x30), ymm7, ymm11, ymm11)\ + \ + vmovaps(ymm13, ymm7)\ + vperm2f128(imm(0x12), ymm13, ymm9, ymm13)\ + vperm2f128(imm(0x30), ymm7, ymm9, ymm9)\ + \ + vmovaps(ymm14, ymm7)\ + vperm2f128(imm(0x12), ymm14, ymm10, ymm14)\ + vperm2f128(imm(0x30), ymm7, ymm10, ymm10)\ + \ + vmovaps(ymm12, ymm7)\ + vperm2f128(imm(0x12), ymm12, ymm8, ymm12)\ + vperm2f128(imm(0x30), ymm7, ymm8, ymm8) #define STORE_SS \ - "vextractf128 $1, %%ymm0, %%xmm2 \n\t"\ - "vmovss %%xmm0, (%%rcx) \n\t"\ - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"\ - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"\ - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"\ - "vmovss %%xmm0, (%%rcx,%%r12) \n\t"\ - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"\ - "vmovss %%xmm1, (%%rcx,%%r13) \n\t"\ - "vmovss %%xmm2, (%%rdx) \n\t"\ - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"\ - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"\ - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"\ - "vmovss %%xmm2, (%%rdx,%%r12) \n\t"\ - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"\ - "vmovss %%xmm3, (%%rdx,%%r13) \n\t"\ + vextractf128(imm(1), ymm0, xmm2)\ + vmovss(xmm0, mem(rcx))\ + vpermilps(imm(0x39), xmm0, xmm1)\ + vmovss(xmm1, mem(rcx, rsi, 1))\ + vpermilps(imm(0x39), xmm1, xmm0)\ + vmovss(xmm0, mem(rcx, r12, 1))\ + vpermilps(imm(0x39), xmm0, xmm1)\ + vmovss(xmm1, mem(rcx, r13, 1))\ + vmovss(xmm2, mem(rdx))\ + vpermilps(imm(0x39), xmm2, xmm3)\ + vmovss(xmm3, mem(rdx, rsi, 1))\ + vpermilps(imm(0x39), xmm3, xmm2)\ + vmovss(xmm2, mem(rdx, r12, 1))\ + vpermilps(imm(0x39), xmm2, xmm3)\ + vmovss(xmm3, mem(rdx, r13, 1))\ void bli_sgemm_bulldozer_asm_8x8_fma4 @@ -106,636 +109,636 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 __asm__ volatile ( - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - " \n\t" - "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading - "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b. 
- "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float) - "leaq (%%rcx,%%rdi,4), %%r10 \n\t" // load address of c + 4*cs_c; - " \n\t" - "leaq (%%rdi,%%rdi,2), %%r14 \n\t" // r14 = 3*cs_c; - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c - "prefetcht0 7 * 8(%%rcx,%%r14) \n\t" // prefetch c + 3*cs_c - "prefetcht0 7 * 8(%%r10) \n\t" // prefetch c + 4*cs_c - "prefetcht0 7 * 8(%%r10,%%rdi) \n\t" // prefetch c + 5*cs_c - "prefetcht0 7 * 8(%%r10,%%rdi,2) \n\t" // prefetch c + 6*cs_c - "prefetcht0 7 * 8(%%r10,%%r14) \n\t" // prefetch c + 7*cs_c - " \n\t" - "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" - "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - ".SLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 16 * 32(%%rax) \n\t" - "vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t" - "vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9 \n\t" - " \n\t" - "vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t" - "vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t" - " \n\t" - " \n\t" // iteration 1 - "vfmaddps %%ymm15, %%ymm1, %%ymm2, %%ymm15\n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovshdup 1 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddps %%ymm13, %%ymm1, %%ymm3, %%ymm13\n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vfmaddps %%ymm11, %%ymm1, %%ymm4, %%ymm11\n\t" - "vfmaddps %%ymm9, %%ymm1, %%ymm5, %%ymm9\n\t" - " \n\t" - "vfmaddps %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovsldup 2 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddps %%ymm12, %%ymm1, %%ymm3, %%ymm12\n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vfmaddps %%ymm10, %%ymm1, %%ymm4, %%ymm10\n\t" - "vfmaddps %%ymm8, %%ymm1, %%ymm5, %%ymm8\n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 18 * 32(%%rax) \n\t" - "vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovshdup 2 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "vmovaps 3 * 32(%%rax), %%ymm1 \n\t" - "addq $4 * 8 * 4, 
%%rax \n\t" // a += 4*8 (unroll x mr) - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t" - "vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9\n\t" - " \n\t" - "vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovsldup 3 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t" - "vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t" - " \n\t" - " \n\t" // iteration 3 - "vfmaddps %%ymm15, %%ymm1, %%ymm2, %%ymm15\n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovshdup 3 * 32(%%rbx), %%ymm2 \n\t" - "addq $4 * 8 * 4, %%rbx \n\t" // b += 4*8 (unroll x nr) - "vfmaddps %%ymm13, %%ymm1, %%ymm3, %%ymm13\n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vfmaddps %%ymm11, %%ymm1, %%ymm4, %%ymm11\n\t" - "vfmaddps %%ymm9, %%ymm1, %%ymm5, %%ymm9\n\t" - " \n\t" - "vfmaddps %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddps %%ymm12, %%ymm1, %%ymm3, %%ymm12\n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vfmaddps %%ymm10, %%ymm1, %%ymm4, %%ymm10\n\t" - "vfmaddps %%ymm8, %%ymm1, %%ymm5, %%ymm8\n\t" - " \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - ".SLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" - "vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" - "addq $8 * 1 * 4, %%rax \n\t" // a += 8 (1 x mr) - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t" - "vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9\n\t" - " \n\t" - "vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t" - "addq $8 * 1 * 4, %%rbx \n\t" // b += 8 (1 x nr) - "vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t" - "vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t" - "vmovaps %%ymm1, %%ymm0 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - ".SPOSTACCUM: \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab02 ( ab04 ( ab06 - " \n\t" // ab10 ab12 ab14 ab16 - " \n\t" // ab22 ab20 ab26 ab24 - " \n\t" // ab32 ab30 ab36 ab34 - " \n\t" // ab44 ab46 ab40 ab42 - " \n\t" // ab54 ab56 ab50 ab52 - " \n\t" // ab66 ab64 ab62 ab60 - " \n\t" // ab76 ) ab74 ) ab72 ) ab70 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab01 ( ab03 ( ab05 ( ab07 - " \n\t" // ab11 ab13 ab15 ab17 - " \n\t" // ab23 ab21 ab27 ab25 - " \n\t" // ab33 ab31 ab37 ab35 - " \n\t" // ab45 ab47 ab41 ab43 - " \n\t" // ab55 ab57 ab51 ab53 - " \n\t" // ab67 ab65 ab63 ab61 - " \n\t" // ab77 ) ab75 ) ab73 ) ab71 ) + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + + vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading + vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b. + vpermilps(imm(0x4e), ymm2, ymm3) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) + lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c; + + lea(mem(rdi, rdi, 2), r14) // r14 = 3*cs_c; + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rcx, r14, 1, 7*8)) // prefetch c + 3*cs_c + prefetch(0, mem(r10, 7*8)) // prefetch c + 4*cs_c + prefetch(0, mem(r10, rdi, 1, 7*8)) // prefetch c + 5*cs_c + prefetch(0, mem(r10, rdi, 2, 7*8)) // prefetch c + 6*cs_c + prefetch(0, mem(r10, r14, 1, 7*8)) // prefetch c + 7*cs_c + + vxorps(ymm8, ymm8, ymm8) + vxorps(ymm9, ymm9, ymm9) + vxorps(ymm10, ymm10, ymm10) + vxorps(ymm11, ymm11, ymm11) + vxorps(ymm12, ymm12, ymm12) + vxorps(ymm13, ymm13, ymm13) + vxorps(ymm14, ymm14, ymm14) + vxorps(ymm15, ymm15, ymm15) + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + label(.SLOOPKITER) // MAIN LOOP + + // iteration 0 + prefetch(0, mem(rax, 16*32)) + vfmaddps(ymm15, ymm0, ymm2, ymm15) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovshdup(mem(rbx, 0*32), ymm2) + vfmaddps(ymm13, ymm0, ymm3, ymm13) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + + vmovaps(mem(rax, 1*32), ymm1) + vpermilps(imm(0x4e), ymm2, ymm3) + vfmaddps(ymm11, ymm0, ymm4, ymm11) + vfmaddps(ymm9, ymm0, ymm5, ymm9) + + vfmaddps(ymm14, ymm0, ymm2, ymm14) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovsldup(mem(rbx, 1*32), ymm2) + vfmaddps(ymm12, ymm0, ymm3, ymm12) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + + vpermilps(imm(0x4e), ymm2, ymm3) + vfmaddps(ymm10, ymm0, ymm4, ymm10) + vfmaddps(ymm8, ymm0, ymm5, ymm8) + + // iteration 1 + vfmaddps(ymm15, ymm1, ymm2, ymm15) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovshdup(mem(rbx, 1*32), ymm2) + vfmaddps(ymm13, ymm1, ymm3, ymm13) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + + vmovaps(mem(rax, 2*32), ymm0) + vpermilps(imm(0x4e), ymm2, ymm3) + vfmaddps(ymm11, ymm1, ymm4, ymm11) + vfmaddps(ymm9, ymm1, ymm5, ymm9) + + vfmaddps(ymm14, ymm1, ymm2, ymm14) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovsldup(mem(rbx, 2*32), ymm2) + vfmaddps(ymm12, ymm1, ymm3, ymm12) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + + vpermilps(imm(0x4e), ymm2, ymm3) + vfmaddps(ymm10, ymm1, ymm4, ymm10) + vfmaddps(ymm8, ymm1, ymm5, ymm8) + + // iteration 2 + prefetch(0, mem(rax, 18*32)) + vfmaddps(ymm15, ymm0, ymm2, ymm15) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovshdup(mem(rbx, 2*32), ymm2) + vfmaddps(ymm13, ymm0, ymm3, ymm13) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + + vmovaps(mem(rax, 3*32), ymm1) + add(imm(4*8*4), rax) // a += 4*8 (unroll x mr) + vpermilps(imm(0x4e), ymm2, ymm3) + vfmaddps(ymm11, ymm0, ymm4, ymm11) + vfmaddps(ymm9, ymm0, ymm5, ymm9) + + vfmaddps(ymm14, ymm0, ymm2, ymm14) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovsldup(mem(rbx, 3*32), ymm2) + vfmaddps(ymm12, ymm0, ymm3, ymm12) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + + vpermilps(imm(0x4e), ymm2, ymm3) + vfmaddps(ymm10, ymm0, ymm4, ymm10) + vfmaddps(ymm8, ymm0, ymm5, ymm8) + + // iteration 3 + vfmaddps(ymm15, ymm1, ymm2, ymm15) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovshdup(mem(rbx, 3*32), ymm2) + add(imm(4*8*4), rbx) // b += 4*8 (unroll x nr) + vfmaddps(ymm13, ymm1, ymm3, ymm13) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + + vmovaps(mem(rax, 0*32), ymm0) + vpermilps(imm(0x4e), ymm2, ymm3) + vfmaddps(ymm11, ymm1, ymm4, ymm11) + vfmaddps(ymm9, ymm1, ymm5, ymm9) + + vfmaddps(ymm14, ymm1, ymm2, ymm14) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovsldup(mem(rbx, 0*32), ymm2) + vfmaddps(ymm12, ymm1, ymm3, ymm12) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + + vpermilps(imm(0x4e), ymm2, ymm3) + vfmaddps(ymm10, ymm1, ymm4, ymm10) + vfmaddps(ymm8, ymm1, ymm5, ymm8) + + + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + + + + label(.SCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
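The .SLOOPKITER block above and the .SLOOPKLEFT block below implement the standard BLIS split of the k dimension into a 4x-unrolled main loop plus a remainder loop. A C-level sketch of that control flow, assuming the caller supplies k_iter = k/4 and k_left = k%4, with rank1_update() as a hypothetical stand-in for one vfmaddps pass over the 8x8 micro-tile:

/* --- illustrative sketch; not part of the patch --------------------- */
for ( long i = 0; i < k_iter; ++i )   /* .SLOOPKITER: unrolled by 4     */
{
    rank1_update();                   /* iteration 0                    */
    rank1_update();                   /* iteration 1                    */
    rank1_update();                   /* iteration 2                    */
    rank1_update();                   /* iteration 3                    */
}
for ( long i = 0; i < k_left; ++i )   /* .SLOOPKLEFT: edge loop         */
    rank1_update();
/* --- end sketch ------------------------------------------------------ */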
+ + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rax, 16*32)) + vfmaddps(ymm15, ymm0, ymm2, ymm15) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmovshdup(mem(rbx, 0*32), ymm2) + vfmaddps(ymm13, ymm0, ymm3, ymm13) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + + vmovaps(mem(rax, 1*32), ymm1) + add(imm(8*1*4), rax) // a += 8 (1 x mr) + vpermilps(imm(0x4e), ymm2, ymm3) + vfmaddps(ymm11, ymm0, ymm4, ymm11) + vfmaddps(ymm9, ymm0, ymm5, ymm9) + + vfmaddps(ymm14, ymm0, ymm2, ymm14) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmovsldup(mem(rbx, 1*32), ymm2) + add(imm(8*1*4), rbx) // b += 8 (1 x nr) + vfmaddps(ymm12, ymm0, ymm3, ymm12) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + + vpermilps(imm(0x4e), ymm2, ymm3) + vfmaddps(ymm10, ymm0, ymm4, ymm10) + vfmaddps(ymm8, ymm0, ymm5, ymm8) + vmovaps(ymm1, ymm0) + + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + + label(.SPOSTACCUM) + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab02 ( ab04 ( ab06 + // ab10 ab12 ab14 ab16 + // ab22 ab20 ab26 ab24 + // ab32 ab30 ab36 ab34 + // ab44 ab46 ab40 ab42 + // ab54 ab56 ab50 ab52 + // ab66 ab64 ab62 ab60 + // ab76 ) ab74 ) ab72 ) ab70 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab01 ( ab03 ( ab05 ( ab07 + // ab11 ab13 ab15 ab17 + // ab23 ab21 ab27 ab25 + // ab33 ab31 ab37 ab35 + // ab45 ab47 ab41 ab43 + // ab55 ab57 ab51 ab53 + // ab67 ab65 ab63 ab61 + // ab77 ) ab75 ) ab73 ) ab71 ) GROUP_YMM_BY_4 - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab02 ( ab04 ( ab06 - " \n\t" // ab10 ab12 ab14 ab16 - " \n\t" // ab20 ab22 ab24 ab26 - " \n\t" // ab30 ab32 ab34 ab36 - " \n\t" // ab44 ab46 ab40 ab42 - " \n\t" // ab54 ab56 ab50 ab52 - " \n\t" // ab64 ab66 ab60 ab62 - " \n\t" // ab74 ) ab76 ) ab70 ) ab72 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab01 ( ab03 ( ab05 ( ab07 - " \n\t" // ab11 ab13 ab15 ab17 - " \n\t" // ab21 ab23 ab25 ab27 - " \n\t" // ab31 ab33 ab35 ab37 - " \n\t" // ab45 ab47 ab41 ab43 - " \n\t" // ab55 ab57 ab51 ab53 - " \n\t" // ab65 ab67 ab61 ab63 - " \n\t" // ab75 ) ab77 ) ab71 ) ab73 ) - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab02 ( ab04 ( ab06 - " \n\t" // ab10 ab12 ab14 ab16 - " \n\t" // ab20 ab22 ab24 ab26 - " \n\t" // ab30 ab32 ab34 ab36 - " \n\t" // ab40 ab42 ab44 ab46 - " \n\t" // ab50 ab52 ab54 ab56 - " \n\t" // ab60 ab62 ab64 ab66 - " \n\t" // ab70 ) ab72 ) ab74 ) ab76 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab01 ( ab03 ( ab05 ( ab07 - " \n\t" // ab11 ab13 ab15 ab17 - " \n\t" // ab21 ab23 ab25 ab27 - " \n\t" // ab31 ab33 ab35 ab37 - " \n\t" // ab41 ab43 ab45 ab47 - " \n\t" // ab51 ab53 ab55 ab57 - " \n\t" // ab61 ab63 ab65 ab67 - " \n\t" // ab71 ) ab73 ) ab75 ) ab77 ) - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate - "vbroadcastss (%%rbx), %%ymm4 \n\t" // load beta and duplicate - " \n\t" - "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" - "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" - "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t" - "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; - " \n\t" - "leaq (,%%rsi,2), 
%%r12 \n\t" // r12 = 2*rs_c; - "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 4*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (4*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomiss %%xmm0, %%xmm4 \n\t" // set ZF if beta == 0. - "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .SCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - ".SGENSTORED: \n\t" - " \n\t" // update c00:c70 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" - "vfmaddps %%ymm15, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result, - " \n\t" + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab02 ( ab04 ( ab06 + // ab10 ab12 ab14 ab16 + // ab20 ab22 ab24 ab26 + // ab30 ab32 ab34 ab36 + // ab44 ab46 ab40 ab42 + // ab54 ab56 ab50 ab52 + // ab64 ab66 ab60 ab62 + // ab74 ) ab76 ) ab70 ) ab72 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab01 ( ab03 ( ab05 ( ab07 + // ab11 ab13 ab15 ab17 + // ab21 ab23 ab25 ab27 + // ab31 ab33 ab35 ab37 + // ab45 ab47 ab41 ab43 + // ab55 ab57 ab51 ab53 + // ab65 ab67 ab61 ab63 + // ab75 ) ab77 ) ab71 ) ab73 ) + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab02 ( ab04 ( ab06 + // ab10 ab12 ab14 ab16 + // ab20 ab22 ab24 ab26 + // ab30 ab32 ab34 ab36 + // ab40 ab42 ab44 ab46 + // ab50 ab52 ab54 ab56 + // ab60 ab62 ab64 ab66 + // ab70 ) ab72 ) ab74 ) ab76 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab01 ( ab03 ( ab05 ( ab07 + // ab11 ab13 ab15 ab17 + // ab21 ab23 ab25 ab27 + // ab31 ab33 ab35 ab37 + // ab41 ab43 ab45 ab47 + // ab51 ab53 ab55 ab57 + // ab61 ab63 ab65 ab67 + // ab71 ) ab73 ) ab75 ) ab77 ) + + mov(%4, rax) // load address of alpha + mov(%5, rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm4) // load beta and duplicate + + vmulps(ymm0, ymm8, ymm8) // scale by alpha + vmulps(ymm0, ymm9, ymm9) + vmulps(ymm0, ymm10, ymm10) + vmulps(ymm0, ymm11, ymm11) + vmulps(ymm0, ymm12, ymm12) + vmulps(ymm0, ymm13, ymm13) + vmulps(ymm0, ymm14, ymm14) + vmulps(ymm0, ymm15, ymm15) + + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) + + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; + lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c; + + + // determine if 
+ // c % 32 == 0, AND
+ // 4*cs_c % 32 == 0, AND
+ // rs_c == 1
+ // ie: aligned, ldim aligned, and
+ // column-stored
+
+ cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4.
+ sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
+ test(imm(31), rcx) // set ZF if c & 31 is zero.
+ setz(bh) // bh = ( ZF == 1 ? 1 : 0 );
+ test(imm(31), rdi) // set ZF if (4*cs_c) & 31 is zero.
+ setz(al) // al = ( ZF == 1 ? 1 : 0 );
+ // and(bl,bh) followed by
+ // and(bh,al) will reveal result
+
+ // now avoid loading C if beta == 0
+
+ vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
+ vucomiss(xmm0, xmm4) // set ZF if beta == 0.
+ je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
+
+
+ // check if aligned/column-stored
+ and(bl, bh) // ZF = 0 iff bl & bh == 1.
+ and(bh, al) // ZF = 0 iff bh & al == 1.
+ jne(.SCOLSTORED) // jump to column storage case
+
+ label(.SGENSTORED)
+ // update c00:c70
+ vmovlps(mem(rcx), xmm0, xmm0)
+ vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
+ vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
+ vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
+ vshufps(imm(0x88), xmm1, xmm0, xmm0)
+ vmovlps(mem(rdx), xmm2, xmm2)
+ vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
+ vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
+ vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
+ vshufps(imm(0x88), xmm3, xmm2, xmm2)
+ vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
+
+ vfmaddps(ymm15, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,
+
 STORE_SS
- " \n\t"
- "addq %%rdi, %%rcx \n\t" // c += cs_c;
- "addq %%rdi, %%rdx \n\t" // c += cs_c;
- " \n\t"
- " \n\t" // update c01:c71
- "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
- "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
- "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
- "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
- "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
- "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
- "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
- "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
- "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
- "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
- "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
- " \n\t"
-// "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
-// "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result,
- "vfmaddps %%ymm14, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result,
- " \n\t"
+
+ add(rdi, rcx) // c += cs_c;
+ add(rdi, rdx) // c += cs_c;
+
+ // update c01:c71
+ vmovlps(mem(rcx), xmm0, xmm0)
+ vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
+ vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
+ vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
+ vshufps(imm(0x88), xmm1, xmm0, xmm0)
+ vmovlps(mem(rdx), xmm2, xmm2)
+ vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
+ vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
+ vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
+ vshufps(imm(0x88), xmm3, xmm2, xmm2)
+ vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
+
+// vmulps(ymm4, ymm0, ymm0) // scale by beta,
+// vaddps(ymm14, ymm0, ymm0) // add the gemm result,
+ vfmaddps(ymm14, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,
+
 STORE_SS
- " \n\t"
- "addq %%rdi, %%rcx \n\t" // c += cs_c;
- "addq %%rdi, %%rdx \n\t" // c += cs_c;
- " \n\t"
- " \n\t"
- " \n\t" // update c02:c72
- "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
- "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
- "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
- "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
- "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
- "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
- "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
- "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
- "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
- "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
- 
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" -// "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, -// "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vfmaddps %%ymm13, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result, - " \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c02:c72 + vmovlps(mem(rcx), xmm0, xmm0) + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + +// vmulps(ymm4, ymm0, ymm0) // scale by beta, +// vaddps(ymm13, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm13, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c03:c73 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" -// "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, -// "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vfmaddps %%ymm12, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result, - " \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c03:c73 + vmovlps(mem(rcx), xmm0, xmm0) + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + +// vmulps(ymm4, ymm0, ymm0) // scale by beta, +// vaddps(ymm12, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm12, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c04:c74 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" -// "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, -// "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result, - " \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c04:c74 + 
vmovlps(mem(rcx), xmm0, xmm0) + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + +// vmulps(ymm4, ymm0, ymm0) // scale by beta, +// vaddps(ymm11, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm11, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c05:c75 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" -// "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, -// "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result, - " \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c05:c75 + vmovlps(mem(rcx), xmm0, xmm0) + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + +// vmulps(ymm4, ymm0, ymm0) // scale by beta, +// vaddps(ymm10, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm10, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c06:c76 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" -// "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, -// "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vfmaddps %%ymm9, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result, - " \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c06:c76 + vmovlps(mem(rcx), xmm0, xmm0) + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + 
vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + +// vmulps(ymm4, ymm0, ymm0) // scale by beta, +// vaddps(ymm9, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm9, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c07:c77 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" -// "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, -// "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vfmaddps %%ymm8, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result, - " \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c07:c77 + vmovlps(mem(rcx), xmm0, xmm0) + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + +// vmulps(ymm4, ymm0, ymm0) // scale by beta, +// vaddps(ymm8, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm8, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + STORE_SS - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70, -// "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, -// "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vfmaddps %%ymm15, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c01:c71, -// "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, -// "vaddps %%ymm14, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vfmaddps %%ymm14, %%ymm1, %%ymm4, %%ymm1\n\t" // scale by beta and add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72, -// "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, -// "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vfmaddps %%ymm13, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c03:c73, -// "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, -// "vaddps %%ymm12, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vfmaddps %%ymm12, %%ymm1, %%ymm4, %%ymm1\n\t" // scale by beta and add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. 
- "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c04:c74, -// "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, -// "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c05:c75, -// "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, -// "vaddps %%ymm10, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vfmaddps %%ymm10, %%ymm1, %%ymm4, %%ymm1\n\t" // scale by beta and add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c06:c76, -// "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, -// "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vfmaddps %%ymm9, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c07:c77, -// "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, -// "vaddps %%ymm8, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vfmaddps %%ymm8, %%ymm1, %%ymm4, %%ymm1\n\t" // scale by beta and add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - ".SBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .SCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - ".SGENSTORBZ: \n\t" - " \n\t" // update c00:c70 - "vmovapd %%ymm15, %%ymm0 \n\t" + + + + jmp(.SDONE) // jump to end. + + + + label(.SCOLSTORED) + + + vmovaps(mem(rcx), ymm0) // load c00:c70, +// vmulps(ymm4, ymm0, ymm0) // scale by beta, +// vaddps(ymm15, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm15, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + vmovaps(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm1) // load c01:c71, +// vmulps(ymm4, ymm1, ymm1) // scale by beta, +// vaddps(ymm14, ymm1, ymm1) // add the gemm result, + vfmaddps(ymm14, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, + vmovaps(ymm1, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm0) // load c02:c72, +// vmulps(ymm4, ymm0, ymm0) // scale by beta, +// vaddps(ymm13, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm13, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + vmovaps(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm1) // load c03:c73, +// vmulps(ymm4, ymm1, ymm1) // scale by beta, +// vaddps(ymm12, ymm1, ymm1) // add the gemm result, + vfmaddps(ymm12, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, + vmovaps(ymm1, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm0) // load c04:c74, +// vmulps(ymm4, ymm0, ymm0) // scale by beta, +// vaddps(ymm11, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm11, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + vmovaps(ymm0, mem(rcx)) // and store back to memory. 
+ add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm1) // load c05:c75, +// vmulps(ymm4, ymm1, ymm1) // scale by beta, +// vaddps(ymm10, ymm1, ymm1) // add the gemm result, + vfmaddps(ymm10, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, + vmovaps(ymm1, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm0) // load c06:c76, +// vmulps(ymm4, ymm0, ymm0) // scale by beta, +// vaddps(ymm9, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm9, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + vmovaps(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm1) // load c07:c77, +// vmulps(ymm4, ymm1, ymm1) // scale by beta, +// vaddps(ymm8, ymm1, ymm1) // add the gemm result, + vfmaddps(ymm8, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, + vmovaps(ymm1, mem(rcx)) // and store back to memory. + + jmp(.SDONE) // jump to end. + + + label(.SBETAZERO) + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. + jne(.SCOLSTORBZ) // jump to column storage case + + + label(.SGENSTORBZ) + // update c00:c70 + vmovapd(ymm15, ymm0) STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c01:c71 - "vmovapd %%ymm14, %%ymm0 \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + // update c01:c71 + vmovapd(ymm14, ymm0) STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c72 - "vmovapd %%ymm13, %%ymm0 \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + // update c02:c72 + vmovapd(ymm13, ymm0) STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c73 - "vmovapd %%ymm12, %%ymm0 \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + // update c03:c73 + vmovapd(ymm12, ymm0) STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c04:c74 - "vmovapd %%ymm11, %%ymm0 \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + // update c04:c74 + vmovapd(ymm11, ymm0) STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c05:c75 - "vmovapd %%ymm10, %%ymm0 \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + // update c05:c75 + vmovapd(ymm10, ymm0) STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c06:c76 - "vmovapd %%ymm9, %%ymm0 \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + // update c06:c76 + vmovapd(ymm9, ymm0) STORE_SS - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c07:c77 - "vmovapd %%ymm8, %%ymm0 \n\t" + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + // update c07:c77 + vmovapd(ymm8, ymm0) STORE_SS - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - ".SCOLSTORBZ: \n\t" - " \n\t" - "vmovaps %%ymm15, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm14, (%%rcx) \n\t" // and store back to memory. 
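The four storage paths in this kernel (.SGENSTORED, .SCOLSTORED, .SGENSTORBZ, .SCOLSTORBZ) are selected by the two tests converted earlier: the bl/bh/al alignment check and the vucomiss beta check. A C sketch of that dispatch (function name hypothetical; the 4x stride scaling corresponds to sizeof(float)):

/* --- illustrative sketch; not part of the patch --------------------- */
#include <stdint.h>

/* The fast contiguous paths require C 32-byte aligned, a leading
   dimension that preserves that alignment, and unit row stride.        */
static int use_col_stored( const float* c, long rs_c, long cs_c )
{
    return ( rs_c == 1 )                                 /* cmp/sete  */
        && ( ( (uintptr_t)c & 31 ) == 0 )                /* test/setz */
        && ( ( cs_c * sizeof(float) & 31 ) == 0 );       /* test/setz */
}

/* Branch selection performed by je(.SBETAZERO)/jne(.SCOLSTOR*):
     beta != 0, aligned  -> .SCOLSTORED  (read-modify-write C)
     beta != 0, general  -> .SGENSTORED
     beta == 0, aligned  -> .SCOLSTORBZ  (store only; C is never read,
     beta == 0, general  -> .SGENSTORBZ   so garbage cannot propagate) */
/* --- end sketch ------------------------------------------------------ */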
- "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm13, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm12, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm11, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm10, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm9, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm8, (%%rcx) \n\t" // and store back to memory. - " \n\t" - ".SDONE: \n\t" - " \n\t" + + jmp(.SDONE) // jump to end. + + + label(.SCOLSTORBZ) + + vmovaps(ymm15, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm14, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm13, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm12, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm11, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm10, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm9, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm8, mem(rcx)) // and store back to memory. + + label(.SDONE) + : // output operands (none) : // input operands @@ -767,95 +770,95 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 #undef KERNEL4x6_4 #define KERNEL4x6_1(xx) \ - ".p2align 2 \n\t"\ - "vmovddup -8 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm4, %%xmm1, %%xmm0, %%xmm4 \n\t"\ - "vfmaddpd %%xmm5, %%xmm2, %%xmm0, %%xmm5 \n\t"\ - "vfmaddpd %%xmm6, %%xmm3, %%xmm0, %%xmm6 \n\t"\ - "vmovddup -7 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm7, %%xmm1, %%xmm0, %%xmm7 \n\t"\ - "prefetcht0 128(%%rax) \n\t"\ - "vfmaddpd %%xmm8, %%xmm2, %%xmm0, %%xmm8 \n\t"\ - "vfmaddpd %%xmm9, %%xmm3, %%xmm0, %%xmm9 \n\t"\ - "vmovddup -6 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm10, %%xmm1, %%xmm0, %%xmm10 \n\t"\ - "vfmaddpd %%xmm11, %%xmm2, %%xmm0, %%xmm11 \n\t"\ - "vfmaddpd %%xmm12, %%xmm3, %%xmm0, %%xmm12 \n\t"\ - "vmovddup -5 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm13, %%xmm1, %%xmm0, %%xmm13 \n\t"\ - "vmovaps -6 * 8(%%rbx), %%xmm1 \n\t"\ - "vfmaddpd %%xmm14, %%xmm2, %%xmm0, %%xmm14 \n\t"\ - "vmovaps -4 * 8(%%rbx), %%xmm2 \n\t"\ - "vfmaddpd %%xmm15, %%xmm3, %%xmm0, %%xmm15 \n\t"\ - "vmovaps -2 * 8(%%rbx), %%xmm3 \n\t" + ALIGN4\ + vmovddup(mem(rax, -8*8), xmm0)\ + vfmaddpd(xmm4, xmm1, xmm0, xmm4)\ + vfmaddpd(xmm5, xmm2, xmm0, xmm5)\ + vfmaddpd(xmm6, xmm3, xmm0, xmm6)\ + vmovddup(mem(rax, -7*8), xmm0)\ + vfmaddpd(xmm7, xmm1, xmm0, xmm7)\ + prefetch(0, mem(rax, 128))\ + vfmaddpd(xmm8, xmm2, xmm0, xmm8)\ + vfmaddpd(xmm9, xmm3, xmm0, xmm9)\ + vmovddup(mem(rax, -6*8), xmm0)\ + vfmaddpd(xmm10, xmm1, xmm0, xmm10)\ + vfmaddpd(xmm11, xmm2, xmm0, xmm11)\ + vfmaddpd(xmm12, xmm3, xmm0, xmm12)\ + vmovddup(mem(rax, -5*8), xmm0)\ + vfmaddpd(xmm13, xmm1, xmm0, xmm13)\ + vmovaps(mem(rbx, -6*8), xmm1)\ + vfmaddpd(xmm14, xmm2, xmm0, xmm14)\ + vmovaps(mem(rbx, -4*8), xmm2)\ + vfmaddpd(xmm15, xmm3, xmm0, xmm15)\ + vmovaps(mem(rbx, -2*8), xmm3) #define KERNEL4x6_2(xx) \ - "vmovddup -4 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm4, %%xmm1, %%xmm0, %%xmm4 \n\t"\ - "prefetcht0 192(%%rax) \n\t"\ - "vfmaddpd %%xmm5, %%xmm2, %%xmm0, %%xmm5 \n\t"\ - "vfmaddpd %%xmm6, %%xmm3, %%xmm0, %%xmm6 \n\t"\ - "vmovddup -3 * 
8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm7, %%xmm1, %%xmm0, %%xmm7 \n\t"\ - "vfmaddpd %%xmm8, %%xmm2, %%xmm0, %%xmm8 \n\t"\ - "vfmaddpd %%xmm9, %%xmm3, %%xmm0, %%xmm9 \n\t"\ - "vmovddup -2 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm10, %%xmm1, %%xmm0, %%xmm10 \n\t"\ - "vfmaddpd %%xmm11, %%xmm2, %%xmm0, %%xmm11 \n\t"\ - "vfmaddpd %%xmm12, %%xmm3, %%xmm0, %%xmm12 \n\t"\ - "vmovddup -1 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm13, %%xmm1, %%xmm0, %%xmm13 \n\t"\ - "vmovaps 0 * 8(%%rbx), %%xmm1 \n\t"\ - "vfmaddpd %%xmm14, %%xmm2, %%xmm0, %%xmm14 \n\t"\ - "vmovaps 2 * 8(%%rbx), %%xmm2 \n\t"\ - "vfmaddpd %%xmm15, %%xmm3, %%xmm0, %%xmm15 \n\t"\ - "vmovaps 4 * 8(%%rbx), %%xmm3 \n\t"\ + vmovddup(mem(rax, -4*8), xmm0)\ + vfmaddpd(xmm4, xmm1, xmm0, xmm4)\ + prefetch(0, mem(rax, 192))\ + vfmaddpd(xmm5, xmm2, xmm0, xmm5)\ + vfmaddpd(xmm6, xmm3, xmm0, xmm6)\ + vmovddup(mem(rax, -3*8), xmm0)\ + vfmaddpd(xmm7, xmm1, xmm0, xmm7)\ + vfmaddpd(xmm8, xmm2, xmm0, xmm8)\ + vfmaddpd(xmm9, xmm3, xmm0, xmm9)\ + vmovddup(mem(rax, -2*8), xmm0)\ + vfmaddpd(xmm10, xmm1, xmm0, xmm10)\ + vfmaddpd(xmm11, xmm2, xmm0, xmm11)\ + vfmaddpd(xmm12, xmm3, xmm0, xmm12)\ + vmovddup(mem(rax, -1*8), xmm0)\ + vfmaddpd(xmm13, xmm1, xmm0, xmm13)\ + vmovaps(mem(rbx, 0*8), xmm1)\ + vfmaddpd(xmm14, xmm2, xmm0, xmm14)\ + vmovaps(mem(rbx, 2*8), xmm2)\ + vfmaddpd(xmm15, xmm3, xmm0, xmm15)\ + vmovaps(mem(rbx, 4*8), xmm3)\ #define KERNEL4x6_3(xx) \ - "vmovddup 0 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm4, %%xmm1, %%xmm0, %%xmm4 \n\t"\ - "vfmaddpd %%xmm5, %%xmm2, %%xmm0, %%xmm5 \n\t"\ - "vfmaddpd %%xmm6, %%xmm3, %%xmm0, %%xmm6 \n\t"\ - "vmovddup 1 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm7, %%xmm1, %%xmm0, %%xmm7 \n\t"\ - "prefetcht0 224(%%rax) \n\t"\ - "vfmaddpd %%xmm8, %%xmm2, %%xmm0, %%xmm8 \n\t"\ - "vfmaddpd %%xmm9, %%xmm3, %%xmm0, %%xmm9 \n\t"\ - "vmovddup 2 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm10, %%xmm1, %%xmm0, %%xmm10 \n\t"\ - "vfmaddpd %%xmm11, %%xmm2, %%xmm0, %%xmm11 \n\t"\ - "vfmaddpd %%xmm12, %%xmm3, %%xmm0, %%xmm12 \n\t"\ - "vmovddup 3 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm13, %%xmm1, %%xmm0, %%xmm13 \n\t"\ - "vmovaps 6 * 8(%%rbx), %%xmm1 \n\t"\ - "vfmaddpd %%xmm14, %%xmm2, %%xmm0, %%xmm14 \n\t"\ - "vmovaps 8 * 8(%%rbx), %%xmm2 \n\t"\ - "vfmaddpd %%xmm15, %%xmm3, %%xmm0, %%xmm15 \n\t"\ - "vmovaps 10 * 8(%%rbx), %%xmm3 \n\t" + vmovddup(mem(rax, 0*8), xmm0)\ + vfmaddpd(xmm4, xmm1, xmm0, xmm4)\ + vfmaddpd(xmm5, xmm2, xmm0, xmm5)\ + vfmaddpd(xmm6, xmm3, xmm0, xmm6)\ + vmovddup(mem(rax, 1*8), xmm0)\ + vfmaddpd(xmm7, xmm1, xmm0, xmm7)\ + prefetch(0, mem(rax, 224))\ + vfmaddpd(xmm8, xmm2, xmm0, xmm8)\ + vfmaddpd(xmm9, xmm3, xmm0, xmm9)\ + vmovddup(mem(rax, 2*8), xmm0)\ + vfmaddpd(xmm10, xmm1, xmm0, xmm10)\ + vfmaddpd(xmm11, xmm2, xmm0, xmm11)\ + vfmaddpd(xmm12, xmm3, xmm0, xmm12)\ + vmovddup(mem(rax, 3*8), xmm0)\ + vfmaddpd(xmm13, xmm1, xmm0, xmm13)\ + vmovaps(mem(rbx, 6*8), xmm1)\ + vfmaddpd(xmm14, xmm2, xmm0, xmm14)\ + vmovaps(mem(rbx, 8*8), xmm2)\ + vfmaddpd(xmm15, xmm3, xmm0, xmm15)\ + vmovaps(mem(rbx, 10*8), xmm3) #define KERNEL4x6_4(xx) \ - "vmovddup 4 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm4, %%xmm1, %%xmm0, %%xmm4 \n\t"\ - "prefetcht0 224(%%rax) \n\t"\ - "vfmaddpd %%xmm5, %%xmm2, %%xmm0, %%xmm5 \n\t"\ - "vfmaddpd %%xmm6, %%xmm3, %%xmm0, %%xmm6 \n\t"\ - "vmovddup 5 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm7, %%xmm1, %%xmm0, %%xmm7 \n\t"\ - "vfmaddpd %%xmm8, %%xmm2, %%xmm0, %%xmm8 \n\t"\ - "vfmaddpd %%xmm9, %%xmm3, %%xmm0, %%xmm9 \n\t"\ - "vmovddup 6 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm10, %%xmm1, %%xmm0, 
%%xmm10 \n\t"\ - "vfmaddpd %%xmm11, %%xmm2, %%xmm0, %%xmm11 \n\t"\ - "vfmaddpd %%xmm12, %%xmm3, %%xmm0, %%xmm12 \n\t"\ - "vmovddup 7 * 8(%%rax), %%xmm0 \n\t"\ - "vfmaddpd %%xmm13, %%xmm1, %%xmm0, %%xmm13 \n\t"\ - "vmovaps 12 * 8(%%rbx), %%xmm1 \n\t"\ - "vfmaddpd %%xmm14, %%xmm2, %%xmm0, %%xmm14 \n\t"\ - "vmovaps 14 * 8(%%rbx), %%xmm2 \n\t"\ - "vfmaddpd %%xmm15, %%xmm3, %%xmm0, %%xmm15 \n\t"\ - "addq $16*8, %%rax \n\t"\ - "vmovaps 16 * 8(%%rbx), %%xmm3 \n\t"\ - "addq $24*8, %%rbx \n\t" + vmovddup(mem(rax, 4*8), xmm0)\ + vfmaddpd(xmm4, xmm1, xmm0, xmm4)\ + prefetch(0, mem(rax, 224))\ + vfmaddpd(xmm5, xmm2, xmm0, xmm5)\ + vfmaddpd(xmm6, xmm3, xmm0, xmm6)\ + vmovddup(mem(rax, 5*8), xmm0)\ + vfmaddpd(xmm7, xmm1, xmm0, xmm7)\ + vfmaddpd(xmm8, xmm2, xmm0, xmm8)\ + vfmaddpd(xmm9, xmm3, xmm0, xmm9)\ + vmovddup(mem(rax, 6*8), xmm0)\ + vfmaddpd(xmm10, xmm1, xmm0, xmm10)\ + vfmaddpd(xmm11, xmm2, xmm0, xmm11)\ + vfmaddpd(xmm12, xmm3, xmm0, xmm12)\ + vmovddup(mem(rax, 7*8), xmm0)\ + vfmaddpd(xmm13, xmm1, xmm0, xmm13)\ + vmovaps(mem(rbx, 12*8), xmm1)\ + vfmaddpd(xmm14, xmm2, xmm0, xmm14)\ + vmovaps(mem(rbx, 14*8), xmm2)\ + vfmaddpd(xmm15, xmm3, xmm0, xmm15)\ + add(imm(16*8), rax)\ + vmovaps(mem(rbx, 16*8), xmm3)\ + add(imm(24*8), rbx) void bli_dgemm_bulldozer_asm_4x6_fma4 ( @@ -878,27 +881,27 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 __asm__ ( - " \n\t" - " \n\t" - "vzeroall \n\t" - "movq %3, %%rbx \n\t" // load address of b. - "movq %2, %%rax \n\t" // load address of a. - "prefetcht0 64(%%rax) \n\t" - " \n\t" - " \n\t" - "vmovaps 0 * 8(%%rbx), %%xmm1 \n\t" - "vmovaps 2 * 8(%%rbx), %%xmm2 \n\t" - "vmovaps 4 * 8(%%rbx), %%xmm3 \n\t" - "addq $12*8, %%rbx \n\t" - "addq $8*8, %%rax \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; notice %0 not $0 - "testq %%rsi, %%rsi \n\t" - "je .CONSIDERKLEFT \n\t" - " \n\t" - ".p2align 5 \n\t" - ".LOOPKITER: \n\t" // MAIN LOOP - " \n\t" + + + vzeroall() + mov(%3, rbx) // load address of b. + mov(%2, rax) // load address of a. + prefetch(0, mem(rax, 64)) + + + vmovaps(mem(rbx, 0*8), xmm1) + vmovaps(mem(rbx, 2*8), xmm2) + vmovaps(mem(rbx, 4*8), xmm3) + add(imm(12*8), rbx) + add(imm(8*8), rax) + + mov(%0, rsi) // i = k_iter; notice %0 not $0 + test(rsi, rsi) + je(.CONSIDERKLEFT) + + ALIGN32 + label(.LOOPKITER) // MAIN LOOP + KERNEL4x6_1(xx) KERNEL4x6_2(xx) KERNEL4x6_3(xx) @@ -911,125 +914,125 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 KERNEL4x6_2(xx) KERNEL4x6_3(xx) KERNEL4x6_4(xx) - " \n\t" - "decq %%rsi \n\t" - "jne .LOOPKITER \n\t" - " \n\t" - ".CONSIDERKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" - "testq %%rsi, %%rsi \n\t" - ".LOOPKLEFT: \n\t" - "je .POSTACCUM \n\t" - " \n\t" + + dec(rsi) + jne(.LOOPKITER) + + label(.CONSIDERKLEFT) + + mov(%1, rsi) + test(rsi, rsi) + label(.LOOPKLEFT) + je(.POSTACCUM) + KERNEL4x6_1(xx) - "addq $6*8, %%rbx \n\t" - "addq $4*8, %%rax \n\t" - " \n\t" - "decq %%rsi \n\t" - "jmp .LOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - ".POSTACCUM: \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load cs_c - "movq %8, %%rdi \n\t" // load rs_c - "vmovddup (%4), %%xmm2 \n\t" //load alpha - "vmovddup (%5), %%xmm3 \n\t" //load beta - "movq %6, %%rcx \n\t" // load address of c - "salq $3, %%rsi \n\t" // cs_c *= sizeof(double) - "salq $3, %%rdi \n\t" // rs_c *= sizeof(double) - "leaq (%%rcx, %%rdi,2), %%rdx \n\t" - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%rcx,%%rdi), %%xmm0, %%xmm0 \n\t" - "vmovhpd (%%rdx,%%rdi), %%xmm1, %%xmm1 \n\t" - "leaq (%%rdx, %%rdi,2), %%r8 \n\t" - "vmulpd %%xmm2, %%xmm4, %%xmm4 \n\t" // scale by alpha, - "vmulpd %%xmm2, %%xmm5, %%xmm5 \n\t" // scale by alpha, - "vfmaddpd %%xmm4, %%xmm0, %%xmm3, %%xmm4 \n\t" // scale by beta, and add the gemm result - "vmovlpd (%%r8), %%xmm0, %%xmm0 \n\t" - "vfmaddpd %%xmm5, %%xmm1, %%xmm3, %%xmm5 \n\t" // scale by beta, and add the gemm result - "vmovhpd (%%r8,%%rdi), %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm4, (%%rcx) \n\t" // and store back to memory. - "vmovlpd %%xmm5, (%%rdx) \n\t" // and store back to memory. - "vmovhpd %%xmm4, (%%rcx,%%rdi) \n\t" - "addq %%rsi, %%rcx \n\t" - "vmovhpd %%xmm5, (%%rdx,%%rdi) \n\t" - "addq %%rsi, %%rdx \n\t" - " \n\t" - "vmulpd %%xmm2, %%xmm6, %%xmm6 \n\t" // scale by alpha, - "vfmaddpd %%xmm6, %%xmm0, %%xmm3, %%xmm6 \n\t" // scale by beta, and add the gemm result - "vmovlpd %%xmm6, (%%r8) \n\t" // and store back to memory. - "vmovhpd %%xmm6, (%%r8,%%rdi) \n\t" - "addq %%rsi, %%r8 \n\t" - " \n\t" - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t" - "vmovlpd (%%r8), %%xmm4, %%xmm4 \n\t" - "vmovhpd (%%rcx,%%rdi), %%xmm0, %%xmm0 \n\t" - "vmovhpd (%%rdx,%%rdi), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%r8,%%rdi), %%xmm4, %%xmm4 \n\t" - "vmulpd %%xmm2, %%xmm7, %%xmm7 \n\t" // scale by alpha, - "vmulpd %%xmm2, %%xmm8, %%xmm8 \n\t" // scale by alpha, - "vmulpd %%xmm2, %%xmm9, %%xmm9 \n\t" // scale by alpha, - "vfmaddpd %%xmm7, %%xmm0, %%xmm3, %%xmm7 \n\t" // scale by beta, and add the gemm result - "vfmaddpd %%xmm8, %%xmm1, %%xmm3, %%xmm8 \n\t" // scale by beta, and add the gemm result - "vfmaddpd %%xmm9, %%xmm4, %%xmm3, %%xmm9 \n\t" // scale by beta, and add the gemm result - "vmovlpd %%xmm7, (%%rcx) \n\t" // and store back to memory. - "vmovlpd %%xmm8, (%%rdx) \n\t" // and store back to memory. - "vmovlpd %%xmm9, (%%r8) \n\t" // and store back to memory. - "vmovhpd %%xmm7, (%%rcx,%%rdi) \n\t" - "addq %%rsi, %%rcx \n\t" - "vmovhpd %%xmm8, (%%rdx,%%rdi) \n\t" - "addq %%rsi, %%rdx \n\t" - "vmovhpd %%xmm9, (%%r8,%%rdi) \n\t" - "addq %%rsi, %%r8 \n\t" - " \n\t" - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t" - "vmovlpd (%%r8), %%xmm4, %%xmm4 \n\t" - "vmovhpd (%%rcx,%%rdi), %%xmm0, %%xmm0 \n\t" - "vmovhpd (%%rdx,%%rdi), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%r8,%%rdi), %%xmm4, %%xmm4 \n\t" - "vmulpd %%xmm2, %%xmm10, %%xmm10 \n\t" // scale by alpha, - "vmulpd %%xmm2, %%xmm11, %%xmm11 \n\t" // scale by alpha, - "vmulpd %%xmm2, %%xmm12, %%xmm12 \n\t" // scale by alpha, - "vfmaddpd %%xmm10, %%xmm0, %%xmm3, %%xmm10 \n\t" // scale by beta, and add the gemm result - "vfmaddpd %%xmm11, %%xmm1, %%xmm3, %%xmm11 \n\t" // scale by beta, and add the gemm result - "vfmaddpd %%xmm12, %%xmm4, %%xmm3, %%xmm12 \n\t" // scale by beta, and add the gemm result - "vmovlpd %%xmm10, (%%rcx) \n\t" // and store back to memory. - "vmovlpd %%xmm11, (%%rdx) \n\t" // and store back to memory. 
- "vmovlpd %%xmm12, (%%r8) \n\t" // and store back to memory. - "vmovhpd %%xmm10, (%%rcx,%%rdi) \n\t" - "addq %%rsi, %%rcx \n\t" - "vmovhpd %%xmm11, (%%rdx,%%rdi) \n\t" - "addq %%rsi, %%rdx \n\t" - "vmovhpd %%xmm12, (%%r8,%%rdi) \n\t" - "addq %%rsi, %%r8 \n\t" - " \n\t" - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t" - "vmovlpd (%%r8), %%xmm4, %%xmm4 \n\t" - "vmovhpd (%%rcx,%%rdi), %%xmm0, %%xmm0 \n\t" - "vmovhpd (%%rdx,%%rdi), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%r8,%%rdi), %%xmm4, %%xmm4 \n\t" - "vmulpd %%xmm2, %%xmm13, %%xmm13 \n\t" // scale by alpha, - "vmulpd %%xmm2, %%xmm14, %%xmm14 \n\t" // scale by alpha, - "vmulpd %%xmm2, %%xmm15, %%xmm15 \n\t" // scale by alpha, - "vfmaddpd %%xmm13, %%xmm0, %%xmm3, %%xmm13\n\t" // scale by beta, and add the gemm result - "vfmaddpd %%xmm14, %%xmm1, %%xmm3, %%xmm14\n\t" // scale by beta, and add the gemm result - "vfmaddpd %%xmm15, %%xmm4, %%xmm3, %%xmm15\n\t" // scale by beta, and add the gemm result - "vmovlpd %%xmm13, (%%rcx) \n\t" // and store back to memory. - "vmovlpd %%xmm14, (%%rdx) \n\t" // and store back to memory. - "vmovlpd %%xmm15, (%%r8) \n\t" // and store back to memory. - "vmovhpd %%xmm13, (%%rcx,%%rdi) \n\t" - "vmovhpd %%xmm14, (%%rdx,%%rdi) \n\t" - "vmovhpd %%xmm15, (%%r8,%%rdi) \n\t" + add(imm(6*8), rbx) + add(imm(4*8), rax) + + dec(rsi) + jmp(.LOOPKLEFT) // iterate again if i != 0. + + label(.POSTACCUM) + + + mov(%7, rsi) // load cs_c + mov(%8, rdi) // load rs_c + vmovddup(mem(%4), xmm2) //load alpha + vmovddup(mem(%5), xmm3) //load beta + mov(%6, rcx) // load address of c + sal(imm(3), rsi) // cs_c *= sizeof(double) + sal(imm(3), rdi) // rs_c *= sizeof(double) + lea(mem(rcx, rdi, 2), rdx) + + vmovlpd(mem(rcx), xmm0, xmm0) + vmovlpd(mem(rdx), xmm1, xmm1) + vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0) + vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1) + lea(mem(rdx, rdi, 2), r8) + vmulpd(xmm2, xmm4, xmm4) // scale by alpha, + vmulpd(xmm2, xmm5, xmm5) // scale by alpha, + vfmaddpd(xmm4, xmm0, xmm3, xmm4) // scale by beta, and add the gemm result + vmovlpd(mem(r8), xmm0, xmm0) + vfmaddpd(xmm5, xmm1, xmm3, xmm5) // scale by beta, and add the gemm result + vmovhpd(mem(r8, rdi, 1), xmm0, xmm0) + vmovlpd(xmm4, mem(rcx)) // and store back to memory. + vmovlpd(xmm5, mem(rdx)) // and store back to memory. + vmovhpd(xmm4, mem(rcx, rdi, 1)) + add(rsi, rcx) + vmovhpd(xmm5, mem(rdx, rdi, 1)) + add(rsi, rdx) + + vmulpd(xmm2, xmm6, xmm6) // scale by alpha, + vfmaddpd(xmm6, xmm0, xmm3, xmm6) // scale by beta, and add the gemm result + vmovlpd(xmm6, mem(r8)) // and store back to memory. + vmovhpd(xmm6, mem(r8, rdi, 1)) + add(rsi, r8) + + + vmovlpd(mem(rcx), xmm0, xmm0) + vmovlpd(mem(rdx), xmm1, xmm1) + vmovlpd(mem(r8), xmm4, xmm4) + vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0) + vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1) + vmovhpd(mem(r8, rdi, 1), xmm4, xmm4) + vmulpd(xmm2, xmm7, xmm7) // scale by alpha, + vmulpd(xmm2, xmm8, xmm8) // scale by alpha, + vmulpd(xmm2, xmm9, xmm9) // scale by alpha, + vfmaddpd(xmm7, xmm0, xmm3, xmm7) // scale by beta, and add the gemm result + vfmaddpd(xmm8, xmm1, xmm3, xmm8) // scale by beta, and add the gemm result + vfmaddpd(xmm9, xmm4, xmm3, xmm9) // scale by beta, and add the gemm result + vmovlpd(xmm7, mem(rcx)) // and store back to memory. + vmovlpd(xmm8, mem(rdx)) // and store back to memory. + vmovlpd(xmm9, mem(r8)) // and store back to memory. 
+ vmovhpd(xmm7, mem(rcx, rdi, 1)) + add(rsi, rcx) + vmovhpd(xmm8, mem(rdx, rdi, 1)) + add(rsi, rdx) + vmovhpd(xmm9, mem(r8, rdi, 1)) + add(rsi, r8) + + + vmovlpd(mem(rcx), xmm0, xmm0) + vmovlpd(mem(rdx), xmm1, xmm1) + vmovlpd(mem(r8), xmm4, xmm4) + vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0) + vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1) + vmovhpd(mem(r8, rdi, 1), xmm4, xmm4) + vmulpd(xmm2, xmm10, xmm10) // scale by alpha, + vmulpd(xmm2, xmm11, xmm11) // scale by alpha, + vmulpd(xmm2, xmm12, xmm12) // scale by alpha, + vfmaddpd(xmm10, xmm0, xmm3, xmm10) // scale by beta, and add the gemm result + vfmaddpd(xmm11, xmm1, xmm3, xmm11) // scale by beta, and add the gemm result + vfmaddpd(xmm12, xmm4, xmm3, xmm12) // scale by beta, and add the gemm result + vmovlpd(xmm10, mem(rcx)) // and store back to memory. + vmovlpd(xmm11, mem(rdx)) // and store back to memory. + vmovlpd(xmm12, mem(r8)) // and store back to memory. + vmovhpd(xmm10, mem(rcx, rdi, 1)) + add(rsi, rcx) + vmovhpd(xmm11, mem(rdx, rdi, 1)) + add(rsi, rdx) + vmovhpd(xmm12, mem(r8, rdi, 1)) + add(rsi, r8) + + + vmovlpd(mem(rcx), xmm0, xmm0) + vmovlpd(mem(rdx), xmm1, xmm1) + vmovlpd(mem(r8), xmm4, xmm4) + vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0) + vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1) + vmovhpd(mem(r8, rdi, 1), xmm4, xmm4) + vmulpd(xmm2, xmm13, xmm13) // scale by alpha, + vmulpd(xmm2, xmm14, xmm14) // scale by alpha, + vmulpd(xmm2, xmm15, xmm15) // scale by alpha, + vfmaddpd(xmm13, xmm0, xmm3, xmm13) // scale by beta, and add the gemm result + vfmaddpd(xmm14, xmm1, xmm3, xmm14) // scale by beta, and add the gemm result + vfmaddpd(xmm15, xmm4, xmm3, xmm15) // scale by beta, and add the gemm result + vmovlpd(xmm13, mem(rcx)) // and store back to memory. + vmovlpd(xmm14, mem(rdx)) // and store back to memory. + vmovlpd(xmm15, mem(r8)) // and store back to memory. + vmovhpd(xmm13, mem(rcx, rdi, 1)) + vmovhpd(xmm14, mem(rdx, rdi, 1)) + vmovhpd(xmm15, mem(r8, rdi, 1)) : // output operands (none) : // input operands @@ -1053,21 +1056,21 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 } //The parameter "i" is the iteration number, i.e. 
the B values to read #define MADD_TO_YMM(i) \ - "vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15 \n\t"\ - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"\ - "vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13 \n\t"\ - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"\ - "vfmaddps %%ymm14, %%ymm1, %%ymm2, %%ymm14 \n\t"\ - "vmovshdup "i" * 32(%%rbx), %%ymm2 \n\t"\ - "vfmaddps %%ymm12, %%ymm1, %%ymm3, %%ymm12 \n\t"\ - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"\ - "vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11 \n\t"\ - "vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9 \n\t"\ - "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t"\ - "vfmaddps %%ymm10, %%ymm1, %%ymm4, %%ymm10 \n\t"\ - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"\ - "vfmaddps %%ymm8, %%ymm1, %%ymm5, %%ymm8 \n\t"\ - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"\ + vfmaddps(ymm15, ymm0, ymm2, ymm15)\ + vperm2f128(imm(0x3), ymm2, ymm2, ymm4)\ + vfmaddps(ymm13, ymm0, ymm3, ymm13)\ + vperm2f128(imm(0x3), ymm3, ymm3, ymm5)\ + vfmaddps(ymm14, ymm1, ymm2, ymm14)\ + vmovshdup(mem(rbx, i*32), ymm2)\ + vfmaddps(ymm12, ymm1, ymm3, ymm12)\ + vpermilps(imm(0x4e), ymm2, ymm3)\ + vfmaddps(ymm11, ymm0, ymm4, ymm11)\ + vfmaddps(ymm9, ymm0, ymm5, ymm9)\ + vpermilps(imm(0xb1), ymm0, ymm0)\ + vfmaddps(ymm10, ymm1, ymm4, ymm10)\ + vperm2f128(imm(0x3), ymm2, ymm2, ymm4)\ + vfmaddps(ymm8, ymm1, ymm5, ymm8)\ + vperm2f128(imm(0x3), ymm3, ymm3, ymm5)\ void bli_cgemm_bulldozer_asm_8x4_fma4 ( @@ -1093,744 +1096,744 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 __asm__ volatile ( - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - "movq %9, %%r15 \n\t" // load address of b_next. - //"movq %10, %%r14 \n\t" // load address of a_next. - "addq $-4 * 64, %%r15 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading - "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(scomplex) - "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c; - " \n\t" - "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c - "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c - " \n\t" - "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" - "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. 
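One subtlety of the MADD_TO_YMM conversion above: the old macro spliced its parameter into the asm text as a string literal (call sites wrote MADD_TO_YMM("0")), while the converted mem(rbx, i*32) form uses i as an ordinary token inside an address expression, so converted call sites presumably pass a bare 0..3 instead. An illustrative reduction of the two styles (helper names hypothetical):

/* --- illustrative sketch; not part of the patch --------------------- */
#define LOADB_OLD(i) "vmovshdup " i " * 32(%%rbx), %%ymm2 \n\t"
/* usage: LOADB_OLD("2") -- the argument must be a string literal       */

#define STR2_(...) #__VA_ARGS__
#define LOADB_NEW(i) STR2_(vmovshdup i*32(%%rbx), %%ymm2) "\n\t"
/* usage: LOADB_NEW(2) -- a plain token, usable in the i*32 expression  */
/* --- end sketch ------------------------------------------------------ */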
- " \n\t" - ".CLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - "addq $4 * 4 * 8, %%r15 \n\t" // b_next += 4*4 (unroll x nr) - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 8 * 32(%%rax) \n\t" - "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" - MADD_TO_YMM("0") - " \n\t" - "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" - "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 1 - "prefetcht0 10 * 32(%%rax) \n\t" - "vmovaps 3 * 32(%%rax), %%ymm1 \n\t" - MADD_TO_YMM("1") - " \n\t" - "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovsldup 2 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovaps 4 * 32(%%rax), %%ymm0 \n\t" - "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 12 * 32(%%rax) \n\t" - "vmovaps 5 * 32(%%rax), %%ymm1 \n\t" - MADD_TO_YMM("2") - "prefetcht0 2 * 32(%%r15) \n\t" // prefetch b_next[2*4] - " \n\t" - "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovsldup 3 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovaps 6 * 32(%%rax), %%ymm0 \n\t" - "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 3 - "prefetcht0 14 * 32(%%rax) \n\t" - "vmovaps 7 * 32(%%rax), %%ymm1 \n\t" - MADD_TO_YMM("3") - " \n\t" - "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 
\n\t" - "vmovsldup 4 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovaps 8 * 32(%%rax), %%ymm0 \n\t" - "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - "addq $8 * 4 * 8, %%rax \n\t" // a += 8*4 (unroll x mr) - "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .CLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".CCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".CLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 8 * 32(%%rax) \n\t" - "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" - MADD_TO_YMM("0") - " \n\t" - "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" - "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - "addq $8 * 1 * 8, %%rax \n\t" // a += 8 (1 x mr) - "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .CLOOPKLEFT \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".CPOSTACCUM: \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab21 ab20 ab23 ab22 - " \n\t" // ab31 ab30 ab33 ab32 - " \n\t" // ab42 ab43 ab40 ab41 - " \n\t" // ab52 ab53 ab50 ab51 - " \n\t" // ab63 ab62 ab61 ab60 - " \n\t" // ab73 ) ab72 ) ab71 ) ab70 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83 - " \n\t" // ab90 ab91 ab92 ab93 - " \n\t" // aba1 aba0 aba3 aba2 - " \n\t" // abb1 abb0 abb3 abb2 - " \n\t" // abc2 abc3 abc0 abc1 - " \n\t" // abd2 abd3 abd0 abd1 - " \n\t" // abe3 abe2 abe1 abe0 - " \n\t" // abf3 abf2 abf1 abf0 ) + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + mov(%9, r15) // load address of b_next. + //mov(%10, r14) // load address of a_next. 
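
The register and instruction macros used from here on (mov, lea, mem, imm, label, prefetch, and friends) come from the new frame/include/bli_x86_asm_macros.h. Their definitions are not shown in this hunk, but the mechanism is ordinary preprocessor stringification; a minimal sketch, assuming the AT&T sources-first operand order that the converted kernels visibly use (names and details here are illustrative, and the real header is more general):

    #define STR_(...) #__VA_ARGS__
    #define INSTR_(...) STR_(__VA_ARGS__) "\n\t"   /* one template line per call */

    #define rax %%rax    /* bare register names pick up %% sigils on expansion */
    #define rcx %%rcx
    #define ymm0 %%ymm0

    #define imm(x) $##x              /* imm(0xb1) -> $0xb1 (relies on a common
                                        GNU cpp pasting extension) */
    #define mem(reg, disp) disp(reg) /* simplified; the real macro is variadic
                                        and covers (base,index,scale,disp) too */

    /* operands stay in AT&T order, sources first and destination last;
       'mov' inside its own replacement list is not re-expanded by cpp */
    #define mov(_0, _1) INSTR_(mov _0, _1)
    #define vmovaps(_0, _1) INSTR_(vmovaps _0, _1)
    #define label(l) STR_(l) ":\n\t"
    #define dec(_0) INSTR_(dec _0)
    #define jne(_0) INSTR_(jne _0)

    /* e.g. mov(%2, rax) expands to "mov %2, %%rax\n\t", exactly the
       hand-written template string it replaces above */

The compiled kernels are unchanged either way; the gain is that the quote-and-"\n\t" noise disappears, and assembler syntax differences can be absorbed in one header rather than in every kernel.
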
+ sub(imm(4*64), r15) + + vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading + vmovsldup(mem(rbx, 0*32), ymm2) + vpermilps(imm(0x4e), ymm2, ymm3) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) + lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; + + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c + prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c + + vxorps(ymm8, ymm8, ymm8) + vxorps(ymm9, ymm9, ymm9) + vxorps(ymm10, ymm10, ymm10) + vxorps(ymm11, ymm11, ymm11) + vxorps(ymm12, ymm12, ymm12) + vxorps(ymm13, ymm13, ymm13) + vxorps(ymm14, ymm14, ymm14) + vxorps(ymm15, ymm15, ymm15) + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.CCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + label(.CLOOPKITER) // MAIN LOOP + + add(imm(4*4*8), r15) // b_next += 4*4 (unroll x nr) + + // iteration 0 + prefetch(0, mem(rax, 8*32)) + vmovaps(mem(rax, 1*32), ymm1) + MADD_TO_YMM(0) + + vpermilps(imm(0xb1), ymm1, ymm1) + vmulps(ymm0, ymm2, ymm6) + vaddsubps(ymm6, ymm15, ymm15) + vmulps(ymm0, ymm3, ymm7) + vaddsubps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovsldup(mem(rbx, 1*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddsubps(ymm6, ymm14, ymm14) + vaddsubps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vmovaps(mem(rax, 2*32), ymm0) + vaddsubps(ymm6, ymm11, ymm11) + vaddsubps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddsubps(ymm6, ymm10, ymm10) + vaddsubps(ymm7, ymm8, ymm8) + + + // iteration 1 + prefetch(0, mem(rax, 10*32)) + vmovaps(mem(rax, 3*32), ymm1) + MADD_TO_YMM(1) + + vpermilps(imm(0xb1), ymm1, ymm1) + vmulps(ymm0, ymm2, ymm6) + vmulps(ymm0, ymm3, ymm7) + vaddsubps(ymm6, ymm15, ymm15) + vaddsubps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovsldup(mem(rbx, 2*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddsubps(ymm6, ymm14, ymm14) + vaddsubps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vmovaps(mem(rax, 4*32), ymm0) + vaddsubps(ymm6, ymm11, ymm11) + vaddsubps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddsubps(ymm6, ymm10, ymm10) + vaddsubps(ymm7, ymm8, ymm8) + + // iteration 2 + prefetch(0, mem(rax, 12*32)) + vmovaps(mem(rax, 5*32), ymm1) + MADD_TO_YMM(2) + prefetch(0, mem(r15, 2*32)) // prefetch b_next[2*4] + + vpermilps(imm(0xb1), ymm1, ymm1) + vmulps(ymm0, ymm2, ymm6) + vmulps(ymm0, ymm3, ymm7) + vaddsubps(ymm6, ymm15, ymm15) + vaddsubps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovsldup(mem(rbx, 3*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddsubps(ymm6, ymm14, ymm14) + vaddsubps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vmovaps(mem(rax, 6*32), ymm0) + vaddsubps(ymm6, ymm11, ymm11) + vaddsubps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddsubps(ymm6, ymm10, ymm10) + vaddsubps(ymm7, ymm8, ymm8) + + + // iteration 3 + prefetch(0, mem(rax, 14*32)) + vmovaps(mem(rax, 7*32), ymm1) + MADD_TO_YMM(3) + + vpermilps(imm(0xb1), ymm1, ymm1) + vmulps(ymm0, ymm2, ymm6) + vmulps(ymm0, ymm3, ymm7) + vaddsubps(ymm6, ymm15, ymm15) + vaddsubps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovsldup(mem(rbx, 4*32), 
ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddsubps(ymm6, ymm14, ymm14) + vaddsubps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vmovaps(mem(rax, 8*32), ymm0) + vaddsubps(ymm6, ymm11, ymm11) + vaddsubps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddsubps(ymm6, ymm10, ymm10) + vaddsubps(ymm7, ymm8, ymm8) + + + add(imm(8*4*8), rax) // a += 8*4 (unroll x mr) + add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) + + + dec(rsi) // i -= 1; + jne(.CLOOPKITER) // iterate again if i != 0. + + + + label(.CCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.CPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.CLOOPKLEFT) // EDGE LOOP + + // iteration 0 + prefetch(0, mem(rax, 8*32)) + vmovaps(mem(rax, 1*32), ymm1) + MADD_TO_YMM(0) + + vpermilps(imm(0xb1), ymm1, ymm1) + vmulps(ymm0, ymm2, ymm6) + vmulps(ymm0, ymm3, ymm7) + vaddsubps(ymm6, ymm15, ymm15) + vaddsubps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovsldup(mem(rbx, 1*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddsubps(ymm6, ymm14, ymm14) + vaddsubps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vmovaps(mem(rax, 2*32), ymm0) + vaddsubps(ymm6, ymm11, ymm11) + vaddsubps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddsubps(ymm6, ymm10, ymm10) + vaddsubps(ymm7, ymm8, ymm8) + + + add(imm(8*1*8), rax) // a += 8 (1 x mr) + add(imm(4*1*8), rbx) // b += 4 (1 x nr) + + + dec(rsi) // i -= 1; + jne(.CLOOPKLEFT) // iterate again if i != 0. + + + + label(.CPOSTACCUM) + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab21 ab20 ab23 ab22 + // ab31 ab30 ab33 ab32 + // ab42 ab43 ab40 ab41 + // ab52 ab53 ab50 ab51 + // ab63 ab62 ab61 ab60 + // ab73 ) ab72 ) ab71 ) ab70 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 ab93 + // aba1 aba0 aba3 aba2 + // abb1 abb0 abb3 abb2 + // abc2 abc3 abc0 abc1 + // abd2 abd3 abd0 abd1 + // abe3 abe2 abe1 abe0 + // abf3 abf2 abf1 abf0 ) GROUP_YMM_BY_4 - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab20 ab21 ab22 ab23 - " \n\t" // ab30 ab31 ab32 ab33 - " \n\t" // ab42 ab43 ab40 ab41 - " \n\t" // ab52 ab53 ab50 ab51 - " \n\t" // ab62 ab63 ab60 ab61 - " \n\t" // ab72 ) ab73 ) ab70 ) ab71 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83 - " \n\t" // ab90 ab91 ab92 ab93 - " \n\t" // aba0 aba1 aba2 aba3 - " \n\t" // abb0 abb1 abb2 abb3 - " \n\t" // abc2 abc3 abc0 abc1 - " \n\t" // abd2 abd3 abd0 abd1 - " \n\t" // abe2 abe3 abe0 abe1 - " \n\t" // abf2 ) abf3 ) abf0 ) abf1 ) - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab20 ab21 ab22 ab23 - " \n\t" // ab30 ab31 ab32 ab33 - " \n\t" // ab40 ab41 ab42 ab43 - " \n\t" // ab50 ab51 ab52 ab53 - " \n\t" // ab60 ab61 ab62 ab63 - " \n\t" // ab70 ) ab71 ) ab72 ) ab73 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83 - " \n\t" // ab90 ab91 ab92 ab93 - " \n\t" // aba0 aba1 aba2 aba3 - " \n\t" // abb0 abb1 abb2 abb3 - " \n\t" // abc0 abc1 abc2 abc3 - " \n\t" // abd0 abd1 abd2 abd3 - " \n\t" // abe0 abe1 abe2 abe3 - " \n\t" // abf0 ) abf1 ) abf2 ) abf3 ) - " \n\t" - " 
\n\t" // scale by alpha - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "vbroadcastss (%%rax), %%ymm7 \n\t" // load alpha_r and duplicate - "vbroadcastss 4(%%rax), %%ymm6 \n\t" // load alpha_i and duplicate - " \n\t" - "vpermilps $0xb1, %%ymm15, %%ymm3 \n\t" - "vmulps %%ymm7, %%ymm15, %%ymm15 \n\t" - "vmulps %%ymm6, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm15, %%ymm15 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm14, %%ymm2 \n\t" - "vmulps %%ymm7, %%ymm14, %%ymm14 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm14, %%ymm14 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm13, %%ymm1 \n\t" - "vmulps %%ymm7, %%ymm13, %%ymm13 \n\t" - "vmulps %%ymm6, %%ymm1, %%ymm1 \n\t" - "vaddsubps %%ymm1, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm12, %%ymm0 \n\t" - "vmulps %%ymm7, %%ymm12, %%ymm12 \n\t" - "vmulps %%ymm6, %%ymm0, %%ymm0 \n\t" - "vaddsubps %%ymm0, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm11, %%ymm3 \n\t" - "vmulps %%ymm7, %%ymm11, %%ymm11 \n\t" - "vmulps %%ymm6, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm11, %%ymm11 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm10, %%ymm2 \n\t" - "vmulps %%ymm7, %%ymm10, %%ymm10 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm9, %%ymm1 \n\t" - "vmulps %%ymm7, %%ymm9, %%ymm9 \n\t" - "vmulps %%ymm6, %%ymm1, %%ymm1 \n\t" - "vaddsubps %%ymm1, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm8, %%ymm0 \n\t" - "vmulps %%ymm7, %%ymm8, %%ymm8 \n\t" - "vmulps %%ymm6, %%ymm0, %%ymm0 \n\t" - "vaddsubps %%ymm0, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastss (%%rbx), %%ymm7 \n\t" // load beta_r and duplicate - "vbroadcastss 4(%%rbx), %%ymm6 \n\t" // load beta_i and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(scomplex) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; - " \n\t" - "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c; - "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomiss %%xmm0, %%xmm7 \n\t" // set ZF if beta_r == 0. - "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); - "vucomiss %%xmm0, %%xmm6 \n\t" // set ZF if beta_i == 0. - "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); - "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. - "jne .CBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
- "jne .CCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".CGENSTORED: \n\t" - " \n\t" - " \n\t" // update c00:c70 - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c00,10) into xmm0[0:1] - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c20,30) into xmm0[2:3] - "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c40,50) into xmm2[0:1] - "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c60,70) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c00,c10) - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c20,c30) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c40,c50) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c60,c70) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c80:cf0 - " \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c80,90) into xmm0[0:1] - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca0,b0) into xmm0[2:3] - "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc0,d0) into xmm2[0:1] - "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce0,f0) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c80,c90) - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca0,cb0) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc0,cd0) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce0,cf0) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c01:c71 - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c01,11) into xmm0[0:1] - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c21,31) into xmm0[2:3] - "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c41,51) into xmm2[0:1] - "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c61,71) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c01,c11) - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c21,c31) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c41,c51) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c61,c71) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c81:cf1 - " \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c81,91) into xmm0[0:1] - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca1,b1) into xmm0[2:3] - "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc1,d1) into xmm2[0:1] - "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce1,f1) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, 
%%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c81,c91) - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca1,cb1) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc1,cd1) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce1,cf1) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c72 - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c02,12) into xmm0[0:1] - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c22,32) into xmm0[2:3] - "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c42,52) into xmm2[0:1] - "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c62,72) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c02,c12) - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c22,c32) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c42,c52) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c62,c72) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c82:cf2 - " \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c82,92) into xmm0[0:1] - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca2,b2) into xmm0[2:3] - "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc2,d2) into xmm2[0:1] - "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce2,f2) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c82,c92) - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca2,cb2) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc2,cd2) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce2,cf2) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c73 - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c03,13) into xmm0[0:1] - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c23,33) into xmm0[2:3] - "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c43,53) into xmm2[0:1] - "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c63,73) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c03,c13) - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c23,c33) - "vmovlpd %%xmm2, (%%rcx,%%r12) 
\n\t" // store (c43,c53) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c63,c73) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c83:cf3 - " \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c83,93) into xmm0[0:1] - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca3,b3) into xmm0[2:3] - "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc3,d3) into xmm2[0:1] - "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce3,f3) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c83,c93) - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca3,cb3) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc3,cd3) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce3,cf3) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CCOLSTORED: \n\t" - " \n\t" - " \n\t" // update c00:c70 - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c00:c70 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c80:cf0 - " \n\t" - "vmovaps (%%rdx), %%ymm0 \n\t" // load c80:f0 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c80:cf0 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c00:c70 - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c01:c71 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c01:c71 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c81:cf1 - " \n\t" - "vmovaps (%%rdx), %%ymm0 \n\t" // load c81:f1 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c81:cf1 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c72 - "vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c02:c72 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c82:cf2 - "vmovaps 
(%%rdx), %%ymm0 \n\t" // load c82:f2 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c82:cf2 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c73 - "vmovaps (%%rcx), %%ymm0 \n\t" // load c03:c73 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c03:c73 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c83:cf3 - "vmovaps (%%rdx), %%ymm0 \n\t" // load c83:f3 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c83:cf3 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - ".CBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .CCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - ".CGENSTORBZ: \n\t" - " \n\t" // update c00:c70 - "vextractf128 $1, %%ymm15, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm15, (%%rcx) \n\t" // store (c00,c10) - "vmovhpd %%xmm15, (%%rcx,%%rsi) \n\t" // store (c20,c30) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c40,c50) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c60,c70) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c80:cf0 - "vextractf128 $1, %%ymm14, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm14, (%%rdx) \n\t" // store (c80,c90) - "vmovhpd %%xmm14, (%%rdx,%%rsi) \n\t" // store (ca0,cb0) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc0,cd0) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce0,cf0) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c01:c71 - "vextractf128 $1, %%ymm13, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm13, (%%rcx) \n\t" // store (c01,c11) - "vmovhpd %%xmm13, (%%rcx,%%rsi) \n\t" // store (c21,c31) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c41,c51) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c61,c71) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c81:cf1 - "vextractf128 $1, %%ymm12, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm12, (%%rdx) \n\t" // store (c81,c91) - "vmovhpd %%xmm12, (%%rdx,%%rsi) \n\t" // store (ca1,cb1) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc1,cd1) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce1,cf1) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c72 - "vextractf128 $1, %%ymm11, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm11, (%%rcx) \n\t" // store (c02,c12) - "vmovhpd %%xmm11, (%%rcx,%%rsi) \n\t" // store (c22,c32) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c42,c52) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c62,c72) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c82:cf2 - "vextractf128 $1, 
%%ymm10, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm10, (%%rdx) \n\t" // store (c82,c92) - "vmovhpd %%xmm10, (%%rdx,%%rsi) \n\t" // store (ca2,cb2) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc2,cd2) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce2,cf2) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c73 - "vextractf128 $1, %%ymm9, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm9, (%%rcx) \n\t" // store (c03,c13) - "vmovhpd %%xmm9, (%%rcx,%%rsi) \n\t" // store (c23,c33) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c43,c53) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c63,c73) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c83:cf3 - "vextractf128 $1, %%ymm8, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm8, (%%rdx) \n\t" // store (c83,c93) - "vmovhpd %%xmm8, (%%rdx,%%rsi) \n\t" // store (ca3,cb3) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc3,cd3) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce3,cf3) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - ".CCOLSTORBZ: \n\t" - " \n\t" - "vmovaps %%ymm15, (%%rcx) \n\t" // store c00:c70 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm14, (%%rdx) \n\t" // store c80:cf0 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm13, (%%rcx) \n\t" // store c01:c71 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm12, (%%rdx) \n\t" // store c81:cf1 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm11, (%%rcx) \n\t" // store c02:c72 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm10, (%%rdx) \n\t" // store c82:cf2 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm9, (%%rcx) \n\t" // store c03:c73 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovaps %%ymm8, (%%rdx) \n\t" // store c83:cf3 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - ".CDONE: \n\t" - " \n\t" + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ab31 ab32 ab33 + // ab42 ab43 ab40 ab41 + // ab52 ab53 ab50 ab51 + // ab62 ab63 ab60 ab61 + // ab72 ) ab73 ) ab70 ) ab71 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 ab93 + // aba0 aba1 aba2 aba3 + // abb0 abb1 abb2 abb3 + // abc2 abc3 abc0 abc1 + // abd2 abd3 abd0 abd1 + // abe2 abe3 abe0 abe1 + // abf2 ) abf3 ) abf0 ) abf1 ) + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ab31 ab32 ab33 + // ab40 ab41 ab42 ab43 + // ab50 ab51 ab52 ab53 + // ab60 ab61 ab62 ab63 + // ab70 ) ab71 ) ab72 ) ab73 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 ab93 + // aba0 aba1 aba2 aba3 + // abb0 abb1 abb2 abb3 + // abc0 abc1 abc2 abc3 + // abd0 abd1 abd2 abd3 + // abe0 abe1 abe2 abe3 + // abf0 ) abf1 ) abf2 ) abf3 ) + + // scale by alpha + + mov(%4, rax) // load address of alpha + vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate + vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate + + vpermilps(imm(0xb1), ymm15, ymm3) + vmulps(ymm7, ymm15, ymm15) + vmulps(ymm6, ymm3, ymm3) + vaddsubps(ymm3, ymm15, ymm15) + + vpermilps(imm(0xb1), ymm14, ymm2) + vmulps(ymm7, ymm14, ymm14) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm14, ymm14) + + vpermilps(imm(0xb1), ymm13, ymm1) + vmulps(ymm7, ymm13, ymm13) + vmulps(ymm6, ymm1, ymm1) + 
vaddsubps(ymm1, ymm13, ymm13)
+
+ vpermilps(imm(0xb1), ymm12, ymm0)
+ vmulps(ymm7, ymm12, ymm12)
+ vmulps(ymm6, ymm0, ymm0)
+ vaddsubps(ymm0, ymm12, ymm12)
+
+ vpermilps(imm(0xb1), ymm11, ymm3)
+ vmulps(ymm7, ymm11, ymm11)
+ vmulps(ymm6, ymm3, ymm3)
+ vaddsubps(ymm3, ymm11, ymm11)
+
+ vpermilps(imm(0xb1), ymm10, ymm2)
+ vmulps(ymm7, ymm10, ymm10)
+ vmulps(ymm6, ymm2, ymm2)
+ vaddsubps(ymm2, ymm10, ymm10)
+
+ vpermilps(imm(0xb1), ymm9, ymm1)
+ vmulps(ymm7, ymm9, ymm9)
+ vmulps(ymm6, ymm1, ymm1)
+ vaddsubps(ymm1, ymm9, ymm9)
+
+ vpermilps(imm(0xb1), ymm8, ymm0)
+ vmulps(ymm7, ymm8, ymm8)
+ vmulps(ymm6, ymm0, ymm0)
+ vaddsubps(ymm0, ymm8, ymm8)
+
+
+
+
+ mov(%5, rbx) // load address of beta
+ vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate
+ vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate
+
+
+
+
+
+
+
+ mov(%7, rsi) // load rs_c
+ lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
+
+ lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
+
+ lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
+ lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c;
+
+
+
+ // determine if
+ // c % 32 == 0, AND
+ // 8*cs_c % 32 == 0, AND
+ // rs_c == 1
+ // ie: aligned, ldim aligned, and
+ // column-stored
+
+ cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8.
+ sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
+ test(imm(31), rcx) // set ZF if c & 31 is zero.
+ setz(bh) // bh = ( ZF == 1 ? 1 : 0 );
+ test(imm(31), rdi) // set ZF if (8*cs_c) & 31 is zero.
+ setz(al) // al = ( ZF == 1 ? 1 : 0 );
+ // and(bl,bh) followed by
+ // and(bh,al) will reveal result
+
+ // now avoid loading C if beta == 0
+
+ vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
+ vucomiss(xmm0, xmm7) // set ZF if beta_r == 0.
+ sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
+ vucomiss(xmm0, xmm6) // set ZF if beta_i == 0.
+ sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
+ and(r8b, r9b) // ZF = 0 if r8b & r9b == 1.
+ jne(.CBETAZERO) // if ZF == 0, jump to beta == 0 case
+
+
+ // check if aligned/column-stored
+ and(bl, bh) // ZF = 0 if bl & bh == 1.
+ and(bh, al) // ZF = 0 if bh & al == 1.
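
Decoded, the byte-flag gymnastics above evaluate a storage predicate without touching the vector state; the jne below then selects the column-stored path when it holds. In plain C the condition is (a sketch; the helper name is illustrative, not a BLIS symbol):

    #include <stdbool.h>
    #include <stdint.h>

    /* illustrative model of the bl/bh/al byte-flag computation */
    static bool use_colstored_path( const void* c, uint64_t rs_c_bytes,
                                    uint64_t cs_c_bytes )
    {
        bool unit_row_stride = ( rs_c_bytes == 8 );          /* rs_c == 1 element  */
        bool c_aligned       = ( (uintptr_t)c % 32 == 0 );   /* 32-byte base       */
        bool ldim_aligned    = ( cs_c_bytes % 32 == 0 );     /* 32-byte col stride */
        return unit_row_stride && c_aligned && ldim_aligned;
    }
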
+ jne(.CCOLSTORED) // jump to column storage case + + + + label(.CGENSTORED) + + // update c00:c70 + + vmovlpd(mem(rcx), xmm0, xmm0) // load (c00,10) into xmm0[0:1] + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c20,30) into xmm0[2:3] + vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c40,50) into xmm2[0:1] + vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c60,70) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rcx)) // store (c00,c10) + vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c20,c30) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70) + add(rdi, rcx) // c += cs_c; + + // update c80:cf0 + + vmovlpd(mem(rdx), xmm0, xmm0) // load (c80,90) into xmm0[0:1] + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca0,b0) into xmm0[2:3] + vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc0,d0) into xmm2[0:1] + vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce0,f0) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rdx)) // store (c80,c90) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca0,cb0) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0) + add(rdi, rdx) // c += cs_c; + + // update c01:c71 + + vmovlpd(mem(rcx), xmm0, xmm0) // load (c01,11) into xmm0[0:1] + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c21,31) into xmm0[2:3] + vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c41,51) into xmm2[0:1] + vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c61,71) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rcx)) // store (c01,c11) + vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c21,c31) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71) + add(rdi, rcx) // c += cs_c; + + // update c81:cf1 + + vmovlpd(mem(rdx), xmm0, xmm0) // load (c81,91) into xmm0[0:1] + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca1,b1) into xmm0[2:3] + vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc1,d1) into xmm2[0:1] + vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce1,f1) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rdx)) // store (c81,c91) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca1,cb1) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1) + add(rdi, rdx) // c += cs_c; + + // update c02:c72 + + 
vmovlpd(mem(rcx), xmm0, xmm0) // load (c02,12) into xmm0[0:1] + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c22,32) into xmm0[2:3] + vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c42,52) into xmm2[0:1] + vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c62,72) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rcx)) // store (c02,c12) + vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c22,c32) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72) + add(rdi, rcx) // c += cs_c; + + // update c82:cf2 + + vmovlpd(mem(rdx), xmm0, xmm0) // load (c82,92) into xmm0[0:1] + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca2,b2) into xmm0[2:3] + vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc2,d2) into xmm2[0:1] + vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce2,f2) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rdx)) // store (c82,c92) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca2,cb2) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2) + add(rdi, rdx) // c += cs_c; + + // update c03:c73 + + vmovlpd(mem(rcx), xmm0, xmm0) // load (c03,13) into xmm0[0:1] + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c23,33) into xmm0[2:3] + vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c43,53) into xmm2[0:1] + vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c63,73) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rcx)) // store (c03,c13) + vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c23,c33) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73) + add(rdi, rcx) // c += cs_c; + + // update c83:cf3 + + vmovlpd(mem(rdx), xmm0, xmm0) // load (c83,93) into xmm0[0:1] + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca3,b3) into xmm0[2:3] + vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc3,d3) into xmm2[0:1] + vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce3,f3) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rdx)) // store (c83,c93) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca3,cb3) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3) + add(rdi, rdx) // c += cs_c; + + + + jmp(.CDONE) // jump to end. 
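
Every one of the eight column updates above leans on the same four-instruction complex multiply: vpermilps(0xb1) swaps each (real, imag) pair, the two vmulps form beta_r*c and beta_i*swap(c) against the broadcast values in ymm7 and ymm6, and vaddsubps recombines them, subtracting in the even lanes and adding in the odd ones. A scalar model of one element, with illustrative names (each vector instruction does four such elements at once):

    /* scalar model of one complex element; not a BLIS type or API */
    typedef struct { float r, i; } scomplex_t;

    static scomplex_t scale_by_beta( scomplex_t beta, scomplex_t c )
    {
        float pr_r = beta.r * c.r;   /* vmulps(ymm7, ymm0, ymm0): even lane  */
        float pr_i = beta.r * c.i;   /* ... odd lane                         */
        float pi_r = beta.i * c.i;   /* vmulps(ymm6, ymm2, ymm2) applied to  */
        float pi_i = beta.i * c.r;   /* the vpermilps-swapped copy           */
        /* vaddsubps: subtract in even (real) lanes, add in odd (imag) lanes */
        return (scomplex_t){ pr_r - pi_r, pr_i + pi_i };
    }
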
+
+
+
+ label(.CCOLSTORED)
+
+ // update c00:c70
+
+ vmovaps(mem(rcx), ymm0) // load c00:c70 into ymm0
+ vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+ vmulps(ymm7, ymm0, ymm0)
+ vmulps(ymm6, ymm2, ymm2)
+ vaddsubps(ymm2, ymm0, ymm0)
+ vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0
+ vmovaps(ymm0, mem(rcx)) // store c00:c70
+ add(rdi, rcx) // c += cs_c;
+
+ // update c80:cf0
+
+ vmovaps(mem(rdx), ymm0) // load c80:f0 into ymm0
+ vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+ vmulps(ymm7, ymm0, ymm0)
+ vmulps(ymm6, ymm2, ymm2)
+ vaddsubps(ymm2, ymm0, ymm0)
+ vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0
+ vmovaps(ymm0, mem(rdx)) // store c80:cf0
+ add(rdi, rdx) // c += cs_c;
+
+ // update c01:c71
+
+ vmovaps(mem(rcx), ymm0) // load c01:c71 into ymm0
+ vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+ vmulps(ymm7, ymm0, ymm0)
+ vmulps(ymm6, ymm2, ymm2)
+ vaddsubps(ymm2, ymm0, ymm0)
+ vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0
+ vmovaps(ymm0, mem(rcx)) // store c01:c71
+ add(rdi, rcx) // c += cs_c;
+
+ // update c81:cf1
+
+ vmovaps(mem(rdx), ymm0) // load c81:f1 into ymm0
+ vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+ vmulps(ymm7, ymm0, ymm0)
+ vmulps(ymm6, ymm2, ymm2)
+ vaddsubps(ymm2, ymm0, ymm0)
+ vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0
+ vmovaps(ymm0, mem(rdx)) // store c81:cf1
+ add(rdi, rdx) // c += cs_c;
+
+ // update c02:c72
+ vmovaps(mem(rcx), ymm0) // load c02:c72 into ymm0
+ vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+ vmulps(ymm7, ymm0, ymm0)
+ vmulps(ymm6, ymm2, ymm2)
+ vaddsubps(ymm2, ymm0, ymm0)
+ vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0
+ vmovaps(ymm0, mem(rcx)) // store c02:c72
+ add(rdi, rcx) // c += cs_c;
+
+ // update c82:cf2
+ vmovaps(mem(rdx), ymm0) // load c82:f2 into ymm0
+ vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+ vmulps(ymm7, ymm0, ymm0)
+ vmulps(ymm6, ymm2, ymm2)
+ vaddsubps(ymm2, ymm0, ymm0)
+ vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0
+ vmovaps(ymm0, mem(rdx)) // store c82:cf2
+ add(rdi, rdx) // c += cs_c;
+
+ // update c03:c73
+ vmovaps(mem(rcx), ymm0) // load c03:c73 into ymm0
+ vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+ vmulps(ymm7, ymm0, ymm0)
+ vmulps(ymm6, ymm2, ymm2)
+ vaddsubps(ymm2, ymm0, ymm0)
+ vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0
+ vmovaps(ymm0, mem(rcx)) // store c03:c73
+ add(rdi, rcx) // c += cs_c;
+
+ // update c83:cf3
+ vmovaps(mem(rdx), ymm0) // load c83:f3 into ymm0
+ vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
+ vmulps(ymm7, ymm0, ymm0)
+ vmulps(ymm6, ymm2, ymm2)
+ vaddsubps(ymm2, ymm0, ymm0)
+ vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0
+ vmovaps(ymm0, mem(rdx)) // store c83:cf3
+ add(rdi, rdx) // c += cs_c;
+
+ jmp(.CDONE) // jump to end.
+
+
+ label(.CBETAZERO)
+ // check if aligned/column-stored
+ and(bl, bh) // ZF = 0 if bl & bh == 1.
+ and(bh, al) // ZF = 0 if bh & al == 1.
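
With .CBETAZERO reached, the kernel re-tests the storage predicate computed earlier, completing a four-way epilogue dispatch. Its shape in plain control flow (a sketch; the branch bodies are the labeled sections of the assembly):

    #include <stdbool.h>

    /* shape of the epilogue dispatch; names match the asm labels */
    static void c_store_dispatch( bool beta_is_zero, bool colstored )
    {
        if ( beta_is_zero ) {
            if ( colstored ) { /* .CCOLSTORBZ: aligned vmovaps stores      */ }
            else             { /* .CGENSTORBZ: element-wise scatter stores */ }
        } else {
            if ( colstored ) { /* .CCOLSTORED: load C, beta-scale, store   */ }
            else             { /* .CGENSTORED: gather, beta-scale, scatter */ }
        }
    }
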
+ jne(.CCOLSTORBZ) // jump to column storage case
+
+
+ label(.CGENSTORBZ)
+ // update c00:c70
+ vextractf128(imm(1), ymm15, xmm2) // xmm2 := ymm15[4:7]
+ vmovlpd(xmm15, mem(rcx)) // store (c00,c10)
+ vmovhpd(xmm15, mem(rcx, rsi, 1)) // store (c20,c30)
+ vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50)
+ vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70)
+ add(rdi, rcx) // c += cs_c;
+
+ // update c80:cf0
+ vextractf128(imm(1), ymm14, xmm2) // xmm2 := ymm14[4:7]
+ vmovlpd(xmm14, mem(rdx)) // store (c80,c90)
+ vmovhpd(xmm14, mem(rdx, rsi, 1)) // store (ca0,cb0)
+ vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0)
+ vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0)
+ add(rdi, rdx) // c += cs_c;
+
+ // update c01:c71
+ vextractf128(imm(1), ymm13, xmm2) // xmm2 := ymm13[4:7]
+ vmovlpd(xmm13, mem(rcx)) // store (c01,c11)
+ vmovhpd(xmm13, mem(rcx, rsi, 1)) // store (c21,c31)
+ vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51)
+ vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71)
+ add(rdi, rcx) // c += cs_c;
+
+ // update c81:cf1
+ vextractf128(imm(1), ymm12, xmm2) // xmm2 := ymm12[4:7]
+ vmovlpd(xmm12, mem(rdx)) // store (c81,c91)
+ vmovhpd(xmm12, mem(rdx, rsi, 1)) // store (ca1,cb1)
+ vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1)
+ vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1)
+ add(rdi, rdx) // c += cs_c;
+
+ // update c02:c72
+ vextractf128(imm(1), ymm11, xmm2) // xmm2 := ymm11[4:7]
+ vmovlpd(xmm11, mem(rcx)) // store (c02,c12)
+ vmovhpd(xmm11, mem(rcx, rsi, 1)) // store (c22,c32)
+ vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52)
+ vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72)
+ add(rdi, rcx) // c += cs_c;
+
+ // update c82:cf2
+ vextractf128(imm(1), ymm10, xmm2) // xmm2 := ymm10[4:7]
+ vmovlpd(xmm10, mem(rdx)) // store (c82,c92)
+ vmovhpd(xmm10, mem(rdx, rsi, 1)) // store (ca2,cb2)
+ vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2)
+ vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2)
+ add(rdi, rdx) // c += cs_c;
+
+ // update c03:c73
+ vextractf128(imm(1), ymm9, xmm2) // xmm2 := ymm9[4:7]
+ vmovlpd(xmm9, mem(rcx)) // store (c03,c13)
+ vmovhpd(xmm9, mem(rcx, rsi, 1)) // store (c23,c33)
+ vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53)
+ vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73)
+ add(rdi, rcx) // c += cs_c;
+
+ // update c83:cf3
+ vextractf128(imm(1), ymm8, xmm2) // xmm2 := ymm8[4:7]
+ vmovlpd(xmm8, mem(rdx)) // store (c83,c93)
+ vmovhpd(xmm8, mem(rdx, rsi, 1)) // store (ca3,cb3)
+ vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3)
+ vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3)
+ add(rdi, rdx) // c += cs_c;
+
+
+ jmp(.CDONE) // jump to end.
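
As in the beta != 0 general path, each accumulator is split by vextractf128 and written out through four 8-byte vmovlpd/vmovhpd stores, one scomplex element per store, stepping by the row stride; rows 4 through 7 of each column go through rdx, which was set to c + 4*rs_c earlier. A scalar model (a sketch; scomplex_t is the illustrative struct from the earlier example):

    #include <stddef.h>

    /* one accumulator register holds four packed scomplex results; the
       general path scatters them one element at a time at stride rs_c */
    static void store_column_generic( scomplex_t* c, size_t rs_c,
                                      const scomplex_t ab[4] )
    {
        for ( size_t i = 0; i < 4; ++i )
            c[ i * rs_c ] = ab[ i ];   /* vmovlpd/vmovhpd of low/high halves */
    }
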
+ + + label(.CCOLSTORBZ) + + vmovaps(ymm15, mem(rcx)) // store c00:c70 + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm14, mem(rdx)) // store c80:cf0 + add(rdi, rdx) // c += cs_c; + + vmovaps(ymm13, mem(rcx)) // store c01:c71 + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm12, mem(rdx)) // store c81:cf1 + add(rdi, rdx) // c += cs_c; + + vmovaps(ymm11, mem(rcx)) // store c02:c72 + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm10, mem(rdx)) // store c82:cf2 + add(rdi, rdx) // c += cs_c; + + vmovaps(ymm9, mem(rcx)) // store c03:c73 + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm8, mem(rdx)) // store c83:cf3 + add(rdi, rdx) // c += cs_c; + + + + label(.CDONE) + : // output operands (none) : // input operands @@ -1857,28 +1860,28 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 } #define MADDSUBPD_TO_YMM \ - "vfmaddpd %%ymm13, %%ymm0, %%ymm4, %%ymm13\n\t"\ - "vfmaddpd %%ymm9, %%ymm0, %%ymm5, %%ymm9\n\t"\ - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t"\ - " \n\t"\ - "vfmaddpd %%ymm12, %%ymm1, %%ymm4, %%ymm12\n\t"\ - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"\ - "vfmaddpd %%ymm8, %%ymm1, %%ymm5, %%ymm8\n\t"\ - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"\ - " \n\t"\ - "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t"\ - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"\ - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"\ - "vaddsubpd %%ymm6, %%ymm15, %%ymm15 \n\t"\ - "vaddsubpd %%ymm7, %%ymm11, %%ymm11 \n\t"\ - " \n\t"\ + vfmaddpd(ymm13, ymm0, ymm4, ymm13)\ + vfmaddpd(ymm9, ymm0, ymm5, ymm9)\ + vpermilpd(imm(0x5), ymm0, ymm0)\ + \ + vfmaddpd(ymm12, ymm1, ymm4, ymm12)\ + vperm2f128(imm(0x3), ymm2, ymm2, ymm4)\ + vfmaddpd(ymm8, ymm1, ymm5, ymm8)\ + vperm2f128(imm(0x3), ymm3, ymm3, ymm5)\ + \ + vpermilpd(imm(0x5), ymm1, ymm1)\ + vmulpd(ymm0, ymm2, ymm6)\ + vmulpd(ymm0, ymm3, ymm7)\ + vaddsubpd(ymm6, ymm15, ymm15)\ + vaddsubpd(ymm7, ymm11, ymm11)\ + \ #define Z_ALPHA(i, j) \ - "vpermilpd $0x5, %%ymm"i", %%ymm"j" \n\t"\ - "vmulpd %%ymm7, %%ymm"i", %%ymm"i" \n\t"\ - "vmulpd %%ymm6, %%ymm"j", %%ymm"j" \n\t"\ - "vaddsubpd %%ymm"j", %%ymm"i", %%ymm"i" \n\t"\ - " \n\t" + vpermilpd(imm(0x5), ymm(i), ymm(j))\ + vmulpd(ymm7, ymm(i), ymm(i))\ + vmulpd(ymm6, ymm(j), ymm(j))\ + vaddsubpd(ymm(j), ymm(i), ymm(i))\ + void bli_zgemm_bulldozer_asm_4x4_fma4 ( @@ -1904,606 +1907,606 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 __asm__ volatile ( - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. - //"movq %10, %%r14 \n\t" // load address of a_next. 
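
Back in the Z_ALPHA macro defined above, the arguments are register numbers rather than register names, which is why its body says ymm(i) instead of a bare ymmN. That helper presumably builds the register token by pasting, along these lines (illustrative sketch, not the header's actual text):

    #define ymm(x) %%ymm##x   /* ymm(13) pastes to the token %%ymm13 */

    /* so Z_ALPHA(13, 1) would expand to the same four instructions as
       writing vpermilpd(imm(0x5), ymm13, ymm1), vmulpd(ymm7, ymm13, ymm13),
       vmulpd(ymm6, ymm1, ymm1), vaddsubpd(ymm1, ymm13, ymm13) by hand,
       replacing the old "%%ymm"i"" string splices */
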
- " \n\t" - "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading - "vmovddup 0 + 0 * 32(%%rbx), %%ymm2 \n\t" - "vmovddup 0 + 1 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(dcomplex) - "leaq (,%%rdi,2), %%rdi \n\t" - "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c; - " \n\t" - "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c - "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c - " \n\t" - "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .ZCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - ".ZLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" // iteration 0 - "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" - "vfmaddpd %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vfmaddpd %%ymm11, %%ymm0, %%ymm3, %%ymm11\n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" - "vfmaddpd %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t" - "vmovddup 8 + 0 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddpd %%ymm10, %%ymm1, %%ymm3, %%ymm10\n\t" - "vmovddup 8 + 1 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + //mov(%9, r15) // load address of b_next. + //mov(%10, r14) // load address of a_next. + + vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading + vmovddup(mem(rbx, 0+0*32), ymm2) + vmovddup(mem(rbx, 0+1*32), ymm3) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) + lea(mem(, rdi, 2), rdi) + lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; + + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c + prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c + + vxorpd(ymm8, ymm8, ymm8) + vxorpd(ymm9, ymm9, ymm9) + vxorpd(ymm10, ymm10, ymm10) + vxorpd(ymm11, ymm11, ymm11) + vxorpd(ymm12, ymm12, ymm12) + vxorpd(ymm13, ymm13, ymm13) + vxorpd(ymm14, ymm14, ymm14) + vxorpd(ymm15, ymm15, ymm15) + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.ZCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
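
One prologue detail worth a note: an x86 address computation can scale an index by at most 8, so converting cs_c from dcomplex elements to bytes (a factor of 16) takes the two chained lea instructions seen above. Equivalently (a sketch):

    #include <stdint.h>

    static uint64_t dcomplex_stride_bytes( uint64_t cs_c )
    {
        /* lea scale factors top out at 8, so x16 is done as x8 then x2 */
        return ( cs_c << 3 ) << 1;   /* == cs_c * sizeof(dcomplex) == cs_c * 16 */
    }
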
+ + label(.ZLOOPKITER) // MAIN LOOP + + // iteration 0 + vmovapd(mem(rax, 1*32), ymm1) + vfmaddpd(ymm15, ymm0, ymm2, ymm15) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vfmaddpd(ymm11, ymm0, ymm3, ymm11) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + + prefetch(0, mem(rax, 16*32)) + vfmaddpd(ymm14, ymm1, ymm2, ymm14) + vmovddup(mem(rbx, 8+0*32), ymm2) + vfmaddpd(ymm10, ymm1, ymm3, ymm10) + vmovddup(mem(rbx, 8+1*32), ymm3) + MADDSUBPD_TO_YMM - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 0 + 2 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 0 + 3 * 32(%%rbx), %%ymm3 \n\t" - "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" - "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" - "vfmaddpd %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vfmaddpd %%ymm11, %%ymm0, %%ymm3, %%ymm11\n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "prefetcht0 18 * 32(%%rax) \n\t" - "vfmaddpd %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t" - "vmovddup 8 + 2 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddpd %%ymm10, %%ymm1, %%ymm3, %%ymm10\n\t" - "vmovddup 8 + 3 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 0+2*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 0+3*32), ymm3) + vaddsubpd(ymm6, ymm14, ymm14) + vaddsubpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 2*32), ymm0) + vaddsubpd(ymm6, ymm13, ymm13) + vaddsubpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddsubpd(ymm6, ymm12, ymm12) + vaddsubpd(ymm7, ymm8, ymm8) + + // iteration 1 + vmovapd(mem(rax, 3*32), ymm1) + vfmaddpd(ymm15, ymm0, ymm2, ymm15) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vfmaddpd(ymm11, ymm0, ymm3, ymm11) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + + prefetch(0, mem(rax, 18*32)) + vfmaddpd(ymm14, ymm1, ymm2, ymm14) + vmovddup(mem(rbx, 8+2*32), ymm2) + vfmaddpd(ymm10, ymm1, ymm3, ymm10) + vmovddup(mem(rbx, 8+3*32), ymm3) + MADDSUBPD_TO_YMM - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 0 + 4 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 0 + 5 * 32(%%rbx), %%ymm3 \n\t" - "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 4 * 32(%%rax), %%ymm0 \n\t" - "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" // iteration 2 - "vmovapd 5 * 32(%%rax), %%ymm1 \n\t" - "vfmaddpd %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vfmaddpd %%ymm11, %%ymm0, %%ymm3, %%ymm11\n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "prefetcht0 20 * 32(%%rax) \n\t" - "vfmaddpd %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t" - "vmovddup 8 + 4 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddpd %%ymm10, %%ymm1, %%ymm3, 
%%ymm10\n\t" - "vmovddup 8 + 5 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 0+4*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 0+5*32), ymm3) + vaddsubpd(ymm6, ymm14, ymm14) + vaddsubpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 4*32), ymm0) + vaddsubpd(ymm6, ymm13, ymm13) + vaddsubpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddsubpd(ymm6, ymm12, ymm12) + vaddsubpd(ymm7, ymm8, ymm8) + + // iteration 2 + vmovapd(mem(rax, 5*32), ymm1) + vfmaddpd(ymm15, ymm0, ymm2, ymm15) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vfmaddpd(ymm11, ymm0, ymm3, ymm11) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + + prefetch(0, mem(rax, 20*32)) + vfmaddpd(ymm14, ymm1, ymm2, ymm14) + vmovddup(mem(rbx, 8+4*32), ymm2) + vfmaddpd(ymm10, ymm1, ymm3, ymm10) + vmovddup(mem(rbx, 8+5*32), ymm3) + MADDSUBPD_TO_YMM - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 0 + 6 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 0 + 7 * 32(%%rbx), %%ymm3 \n\t" - "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" - "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" - "vfmaddpd %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vfmaddpd %%ymm11, %%ymm0, %%ymm3, %%ymm11\n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "prefetcht0 22 * 32(%%rax) \n\t" - "vfmaddpd %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t" - "vmovddup 8 + 6 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddpd %%ymm10, %%ymm1, %%ymm3, %%ymm10\n\t" - "vmovddup 8 + 7 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 0+6*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 0+7*32), ymm3) + vaddsubpd(ymm6, ymm14, ymm14) + vaddsubpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 6*32), ymm0) + vaddsubpd(ymm6, ymm13, ymm13) + vaddsubpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddsubpd(ymm6, ymm12, ymm12) + vaddsubpd(ymm7, ymm8, ymm8) + + // iteration 3 + vmovapd(mem(rax, 7*32), ymm1) + vfmaddpd(ymm15, ymm0, ymm2, ymm15) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vfmaddpd(ymm11, ymm0, ymm3, ymm11) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + + prefetch(0, mem(rax, 22*32)) + vfmaddpd(ymm14, ymm1, ymm2, ymm14) + vmovddup(mem(rbx, 8+6*32), ymm2) + vfmaddpd(ymm10, ymm1, ymm3, ymm10) + vmovddup(mem(rbx, 8+7*32), ymm3) + MADDSUBPD_TO_YMM - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 0 + 8 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 0 + 9 * 32(%%rbx), %%ymm3 \n\t" - "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 8 * 32(%%rax), %%ymm0 \n\t" - "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddsubpd 
%%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - "addq $4 * 4 * 16, %%rbx \n\t" // b += 4*4 (unroll x nr) - "addq $4 * 4 * 16, %%rax \n\t" // a += 4*4 (unroll x mr) - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .ZLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - ".ZCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .ZPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".ZLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - " \n\t" // iteration 0 - "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" - "vfmaddpd %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vfmaddpd %%ymm11, %%ymm0, %%ymm3, %%ymm11\n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" - "vfmaddpd %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t" - "vmovddup 8 + 0 * 32(%%rbx), %%ymm2 \n\t" - "vfmaddpd %%ymm10, %%ymm1, %%ymm3, %%ymm10\n\t" - "vmovddup 8 + 1 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 0+8*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 0+9*32), ymm3) + vaddsubpd(ymm6, ymm14, ymm14) + vaddsubpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 8*32), ymm0) + vaddsubpd(ymm6, ymm13, ymm13) + vaddsubpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddsubpd(ymm6, ymm12, ymm12) + vaddsubpd(ymm7, ymm8, ymm8) + + add(imm(4*4*16), rbx) // b += 4*4 (unroll x nr) + add(imm(4*4*16), rax) // a += 4*4 (unroll x mr) + + dec(rsi) // i -= 1; + jne(.ZLOOPKITER) // iterate again if i != 0. + + + label(.ZCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.ZLOOPKLEFT) // EDGE LOOP + + // iteration 0 + vmovapd(mem(rax, 1*32), ymm1) + vfmaddpd(ymm15, ymm0, ymm2, ymm15) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vfmaddpd(ymm11, ymm0, ymm3, ymm11) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + + prefetch(0, mem(rax, 16*32)) + vfmaddpd(ymm14, ymm1, ymm2, ymm14) + vmovddup(mem(rbx, 8+0*32), ymm2) + vfmaddpd(ymm10, ymm1, ymm3, ymm10) + vmovddup(mem(rbx, 8+1*32), ymm3) + MADDSUBPD_TO_YMM - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 0 + 2 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 0 + 3 * 32(%%rbx), %%ymm3 \n\t" - "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" - "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - "addq $4 * 1 * 16, %%rax \n\t" // a += 4 (1 x mr) - "addq $4 * 1 * 16, %%rbx \n\t" // b += 4 (1 x nr) - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .ZLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - ".ZPOSTACCUM: \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab21 ab20 ab23 ab22 - " \n\t" // ab31 ) ab30 ) ab33 ) ab32 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43 - " \n\t" // ab50 ab51 ab52 ab53 - " \n\t" // ab61 ab60 ab63 ab62 - " \n\t" // ab71 ) ab70 ) ab73 ) ab72 ) - " \n\t" - "vmovapd %%ymm15, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm15, %%ymm13, %%ymm15 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm11, %%ymm9, %%ymm11 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm14, %%ymm12, %%ymm14 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm10, %%ymm8, %%ymm10 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab20 ab21 ab22 ab23 - " \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43 - " \n\t" // ab50 ab51 ab52 ab53 - " \n\t" // ab60 ab61 ab62 ab63 - " \n\t" // ab70 ) ab71 ) ab72 ) ab73 ) - " \n\t" - " \n\t" - " \n\t" // scale by alpha - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "vbroadcastsd (%%rax), %%ymm7 \n\t" // load alpha_r and duplicate - "vbroadcastsd 8(%%rax), %%ymm6 \n\t" // load alpha_i and duplicate - " \n\t" - Z_ALPHA("15", "3") - Z_ALPHA("14", "2") - Z_ALPHA("13", "1") - Z_ALPHA("12", "0") + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 0+2*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 0+3*32), ymm3) + vaddsubpd(ymm6, ymm14, ymm14) + vaddsubpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 2*32), ymm0) + vaddsubpd(ymm6, ymm13, ymm13) + vaddsubpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddsubpd(ymm6, ymm12, ymm12) + vaddsubpd(ymm7, ymm8, ymm8) + + + add(imm(4*1*16), rax) // a += 4 (1 x mr) + add(imm(4*1*16), rbx) // b += 4 (1 x nr) + + dec(rsi) // i -= 1; + jne(.ZLOOPKLEFT) // iterate again if i != 0. 
+ + + label(.ZPOSTACCUM) + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab21 ab20 ab23 ab22 + // ab31 ) ab30 ) ab33 ) ab32 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab40 ( ab41 ( ab42 ( ab43 + // ab50 ab51 ab52 ab53 + // ab61 ab60 ab63 ab62 + // ab71 ) ab70 ) ab73 ) ab72 ) + + vmovapd(ymm15, ymm7) + vperm2f128(imm(0x12), ymm15, ymm13, ymm15) + vperm2f128(imm(0x30), ymm7, ymm13, ymm13) + + vmovapd(ymm11, ymm7) + vperm2f128(imm(0x12), ymm11, ymm9, ymm11) + vperm2f128(imm(0x30), ymm7, ymm9, ymm9) + + vmovapd(ymm14, ymm7) + vperm2f128(imm(0x12), ymm14, ymm12, ymm14) + vperm2f128(imm(0x30), ymm7, ymm12, ymm12) + + vmovapd(ymm10, ymm7) + vperm2f128(imm(0x12), ymm10, ymm8, ymm10) + vperm2f128(imm(0x30), ymm7, ymm8, ymm8) + + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ) ab31 ) ab32 ) ab33 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab40 ( ab41 ( ab42 ( ab43 + // ab50 ab51 ab52 ab53 + // ab60 ab61 ab62 ab63 + // ab70 ) ab71 ) ab72 ) ab73 ) + + + // scale by alpha + + mov(%4, rax) // load address of alpha + vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate + vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate + + Z_ALPHA(15, 3) + Z_ALPHA(14, 2) + Z_ALPHA(13, 1) + Z_ALPHA(12, 0) - Z_ALPHA("11", "3") - Z_ALPHA("10", "2") - Z_ALPHA("9", "1") - Z_ALPHA("8", "0") - " \n\t" - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastsd (%%rbx), %%ymm7 \n\t" // load beta_r and duplicate - "vbroadcastsd 8(%%rbx), %%ymm6 \n\t" // load beta_i and duplicate - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(dcomplex) - "leaq (,%%rsi,2), %%rsi \n\t" - "leaq (%%rcx,%%rsi,2), %%rdx \n\t" // load address of c + 2*rs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 16*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (16*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomisd %%xmm0, %%xmm7 \n\t" // set ZF if beta_r == 0. - "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); - "vucomisd %%xmm0, %%xmm6 \n\t" // set ZF if beta_i == 0. - "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); - "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. - "jne .ZBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
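
Z_ALPHA is a file-local helper macro defined earlier in this kernel (outside this hunk); each invocation complex-scales one accumulator register by whatever scalar is currently broadcast in ymm7/ymm6 (alpha_r/alpha_i here, beta_r/beta_i later in the C update). Per element, the arithmetic is ordinary complex scaling:

    /* x := alpha * x for one dcomplex element, as Z_ALPHA applies it
       register-wide; names are illustrative. */
    static void zscal1( double *x_r, double *x_i,
                        double alpha_r, double alpha_i )
    {
        double r = alpha_r * *x_r - alpha_i * *x_i;
        double i = alpha_r * *x_i + alpha_i * *x_r;
        *x_r = r;
        *x_i = i;
    }
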
- "jne .ZCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".ZGENSTORED: \n\t" - " \n\t" // update c00:c30 - " \n\t" - "vmovupd (%%rcx), %%xmm0 \n\t" // load (c00,c10) into xmm0 - "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c20,c30) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rcx) \n\t" // store (c00,c10) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c20,c30) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c40:c70 - " \n\t" - "vmovupd (%%rdx), %%xmm0 \n\t" // load (c40,c50) into xmm0 - "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c60,c70) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rdx) \n\t" // store (c40,c50) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c60,c70) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c01:c31 - " \n\t" - "vmovupd (%%rcx), %%xmm0 \n\t" // load (c01,c11) into xmm0 - "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c21,c31) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rcx) \n\t" // store (c01,c11) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c21,c31) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c41:c71 - " \n\t" - "vmovupd (%%rdx), %%xmm0 \n\t" // load (c41,c51) into xmm0 - "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c61,c71) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rdx) \n\t" // store (c41,c51) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c61,c71) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c32 - " \n\t" - "vmovupd (%%rcx), %%xmm0 \n\t" // load (c02,c12) into xmm0 - "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c22,c32) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rcx) \n\t" // store (c02,c12) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c22,c32) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c42:c72 - " \n\t" - "vmovupd (%%rdx), %%xmm0 \n\t" // load (c42,c52) into xmm0 - "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c62,c72) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rdx) \n\t" // store (c42,c52) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c62,c72) - "addq %%rdi, %%rdx \n\t" // 
c += cs_c; - " \n\t" - " \n\t" // update c03:c33 - " \n\t" - "vmovupd (%%rcx), %%xmm0 \n\t" // load (c03,c13) into xmm0 - "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c23,c33) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rcx) \n\t" // store (c03,c13) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c23,c33) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c43:c73 - " \n\t" - "vmovupd (%%rdx), %%xmm0 \n\t" // load (c43,c53) into xmm0 - "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c63,c73) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rdx) \n\t" // store (c43,c53) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c63,c73) - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZCOLSTORED: \n\t" - " \n\t" // update c00:c30 - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c00:c30 into ymm0 - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c00:c30 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c40:c70 - " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c40:c70 into ymm0 - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c40:c70 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c01:c31 - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c01:c31 into ymm0 - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c01:c31 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c41:c71 - " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c41:c71 into ymm0 - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c41:c71 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c32 - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c02:c32 into ymm0 - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c02:c32 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c42:c72 - " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c42:c72 into ymm0 - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c42:c72 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c33 - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c03:c33 into ymm0 - Z_ALPHA("0", "2") // scale ymm0 by beta - "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c03:c33 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c43:c73 - " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c43:c73 into ymm0 - Z_ALPHA("0", "2") // scale ymm0 by 
beta - "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c43:c73 - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .ZCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".ZGENSTORBZ: \n\t" - " \n\t" // update c00:c30 - " \n\t" - "vextractf128 $1, %%ymm15, %%xmm2 \n\t" - "vmovupd %%xmm15, (%%rcx) \n\t" // store (c00,c10) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c20,c30) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c40:c70 - " \n\t" - "vextractf128 $1, %%ymm14, %%xmm2 \n\t" - "vmovupd %%xmm14, (%%rdx) \n\t" // store (c40,c50) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c60,c70) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c01:c31 - " \n\t" - "vextractf128 $1, %%ymm13, %%xmm2 \n\t" - "vmovupd %%xmm13, (%%rcx) \n\t" // store (c01,c11) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c21,c31) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c41:c71 - " \n\t" - "vextractf128 $1, %%ymm12, %%xmm2 \n\t" - "vmovupd %%xmm12, (%%rdx) \n\t" // store (c41,c51) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c61,c71) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c32 - " \n\t" - "vextractf128 $1, %%ymm11, %%xmm2 \n\t" - "vmovupd %%xmm11, (%%rcx) \n\t" // store (c02,c12) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c22,c32) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c42:c72 - " \n\t" - "vextractf128 $1, %%ymm10, %%xmm2 \n\t" - "vmovupd %%xmm10, (%%rdx) \n\t" // store (c42,c52) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c62,c72) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c33 - " \n\t" - "vextractf128 $1, %%ymm9, %%xmm2 \n\t" - "vmovupd %%xmm9, (%%rcx) \n\t" // store (c03,c13) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c23,c33) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c43:c73 - " \n\t" - "vextractf128 $1, %%ymm8, %%xmm2 \n\t" - "vmovupd %%xmm8, (%%rdx) \n\t" // store (c43,c53) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c63,c73) - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - ".ZCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm15, (%%rcx) \n\t" // store c00:c30 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovapd %%ymm14, (%%rdx) \n\t" // store c40:c70 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovapd %%ymm13, (%%rcx) \n\t" // store c01:c31 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovapd %%ymm12, (%%rdx) \n\t" // store c41:c71 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovapd %%ymm11, (%%rcx) \n\t" // store c02:c32 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovapd %%ymm10, (%%rdx) \n\t" // store c42:c72 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovapd %%ymm9, (%%rcx) \n\t" // store c03:c33 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovapd %%ymm8, (%%rdx) \n\t" // store c43:c73 - " \n\t" - " \n\t" - ".ZDONE: \n\t" - " \n\t" + Z_ALPHA(11, 3) + Z_ALPHA(10, 2) + Z_ALPHA(9, 1) + Z_ALPHA(8, 0) + + mov(%5, rbx) // load address of beta + vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate + vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) + lea(mem(, rsi, 2), rsi) + lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; + + + + // determine if + // c % 32 == 0, AND + // 16*cs_c % 32 == 0, AND + // rs_c == 1 + // ie: aligned, ldim aligned, and + // column-stored + + cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16. + sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); + test(imm(31), rcx) // set ZF if c & 32 is zero. + setz(bh) // bh = ( ZF == 0 ? 1 : 0 ); + test(imm(31), rdi) // set ZF if (16*cs_c) & 32 is zero. + setz(al) // al = ( ZF == 0 ? 1 : 0 ); + // and(bl,bh) followed by + // and(bh,al) will reveal result + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm7) // set ZF if beta_r == 0. + sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); + vucomisd(xmm0, xmm6) // set ZF if beta_i == 0. + sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); + and(r8b, r9b) // set ZF if r8b & r9b == 1. + jne(.ZBETAZERO) // if ZF = 0, jump to beta == 0 case + + + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. 
+ jne(.ZCOLSTORED) // jump to column storage case + + + + label(.ZGENSTORED) + // update c00:c30 + + vmovupd(mem(rcx), xmm0) // load (c00,c10) into xmm0 + vmovupd(mem(rcx, rsi, 1), xmm2) // load (c20,c30) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rcx)) // store (c00,c10) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30) + add(rdi, rcx) // c += cs_c; + + // update c40:c70 + + vmovupd(mem(rdx), xmm0) // load (c40,c50) into xmm0 + vmovupd(mem(rdx, rsi, 1), xmm2) // load (c60,c70) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rdx)) // store (c40,c50) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70) + add(rdi, rdx) // c += cs_c; + + // update c01:c31 + + vmovupd(mem(rcx), xmm0) // load (c01,c11) into xmm0 + vmovupd(mem(rcx, rsi, 1), xmm2) // load (c21,c31) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rcx)) // store (c01,c11) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31) + add(rdi, rcx) // c += cs_c; + + // update c41:c71 + + vmovupd(mem(rdx), xmm0) // load (c41,c51) into xmm0 + vmovupd(mem(rdx, rsi, 1), xmm2) // load (c61,c71) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rdx)) // store (c41,c51) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71) + add(rdi, rdx) // c += cs_c; + + // update c02:c32 + + vmovupd(mem(rcx), xmm0) // load (c02,c12) into xmm0 + vmovupd(mem(rcx, rsi, 1), xmm2) // load (c22,c32) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rcx)) // store (c02,c12) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32) + add(rdi, rcx) // c += cs_c; + + // update c42:c72 + + vmovupd(mem(rdx), xmm0) // load (c42,c52) into xmm0 + vmovupd(mem(rdx, rsi, 1), xmm2) // load (c62,c72) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rdx)) // store (c42,c52) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72) + add(rdi, rdx) // c += cs_c; + + // update c03:c33 + + vmovupd(mem(rcx), xmm0) // load (c03,c13) into xmm0 + vmovupd(mem(rcx, rsi, 1), xmm2) // load (c23,c33) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rcx)) // store (c03,c13) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33) + add(rdi, rcx) // c += cs_c; + + // update c43:c73 + + vmovupd(mem(rdx), xmm0) // load 
(c43,c53) into xmm0 + vmovupd(mem(rdx, rsi, 1), xmm2) // load (c63,c73) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rdx)) // store (c43,c53) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73) + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZCOLSTORED) + // update c00:c30 + + vmovapd(mem(rcx), ymm0) // load c00:c30 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx)) // store c00:c30 + add(rdi, rcx) // c += cs_c; + + // update c40:c70 + + vmovapd(mem(rdx), ymm0) // load c40:c70 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rdx)) // store c40:c70 + add(rdi, rdx) // c += cs_c; + + // update c01:c31 + + vmovapd(mem(rcx), ymm0) // load c01:c31 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx)) // store c01:c31 + add(rdi, rcx) // c += cs_c; + + // update c41:c71 + + vmovapd(mem(rdx), ymm0) // load c41:c71 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rdx)) // store c41:c71 + add(rdi, rdx) // c += cs_c; + + // update c02:c32 + + vmovapd(mem(rcx), ymm0) // load c02:c32 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx)) // store c02:c32 + add(rdi, rcx) // c += cs_c; + + // update c42:c72 + + vmovapd(mem(rdx), ymm0) // load c42:c72 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rdx)) // store c42:c72 + add(rdi, rdx) // c += cs_c; + + // update c03:c33 + + vmovapd(mem(rcx), ymm0) // load c03:c33 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx)) // store c03:c33 + add(rdi, rcx) // c += cs_c; + + // update c43:c73 + + vmovapd(mem(rdx), ymm0) // load c43:c73 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rdx)) // store c43:c73 + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZBETAZERO) + // check if aligned/column-stored + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. 
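
In the beta == 0 branch the kernel writes alpha-scaled results without ever loading C, again split between a strided general path (.ZGENSTORBZ, 128-bit halves via vextractf128/vmovupd) and an aligned column path (.ZCOLSTORBZ, whole-register vmovapd), selected just below. The net effect for this 4x4 complex microtile, sketched with an illustrative dcomplex type and element strides:

    typedef struct { double r, i; } dcomplex;

    /* beta == 0: C(i,j) = ab(i,j), where ab was already scaled by alpha
       above; never reading C makes uninitialized or NaN-filled output
       buffers safe, which is the reason for the special case. */
    static void store_bz( dcomplex *c, long rs_c, long cs_c,
                          const dcomplex ab[4][4] )
    {
        for ( long j = 0; j < 4; ++j )
            for ( long i = 0; i < 4; ++i )
                c[ i*rs_c + j*cs_c ] = ab[ i ][ j ];
    }
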
+ jne(.ZCOLSTORBZ) // jump to column storage case + + + + label(.ZGENSTORBZ) + // update c00:c30 + + vextractf128(imm(1), ymm15, xmm2) + vmovupd(xmm15, mem(rcx)) // store (c00,c10) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30) + add(rdi, rcx) // c += cs_c; + + // update c40:c70 + + vextractf128(imm(1), ymm14, xmm2) + vmovupd(xmm14, mem(rdx)) // store (c40,c50) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70) + add(rdi, rdx) // c += cs_c; + + // update c01:c31 + + vextractf128(imm(1), ymm13, xmm2) + vmovupd(xmm13, mem(rcx)) // store (c01,c11) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31) + add(rdi, rcx) // c += cs_c; + + // update c41:c71 + + vextractf128(imm(1), ymm12, xmm2) + vmovupd(xmm12, mem(rdx)) // store (c41,c51) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71) + add(rdi, rdx) // c += cs_c; + + // update c02:c32 + + vextractf128(imm(1), ymm11, xmm2) + vmovupd(xmm11, mem(rcx)) // store (c02,c12) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32) + add(rdi, rcx) // c += cs_c; + + // update c42:c72 + + vextractf128(imm(1), ymm10, xmm2) + vmovupd(xmm10, mem(rdx)) // store (c42,c52) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72) + add(rdi, rdx) // c += cs_c; + + // update c03:c33 + + vextractf128(imm(1), ymm9, xmm2) + vmovupd(xmm9, mem(rcx)) // store (c03,c13) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33) + add(rdi, rcx) // c += cs_c; + + // update c43:c73 + + vextractf128(imm(1), ymm8, xmm2) + vmovupd(xmm8, mem(rdx)) // store (c43,c53) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73) + + + jmp(.ZDONE) // jump to end. + + + label(.ZCOLSTORBZ) + + + vmovapd(ymm15, mem(rcx)) // store c00:c30 + add(rdi, rcx) // c += cs_c; + + vmovapd(ymm14, mem(rdx)) // store c40:c70 + add(rdi, rdx) // c += cs_c; + + vmovapd(ymm13, mem(rcx)) // store c01:c31 + add(rdi, rcx) // c += cs_c; + + vmovapd(ymm12, mem(rdx)) // store c41:c71 + add(rdi, rdx) // c += cs_c; + + vmovapd(ymm11, mem(rcx)) // store c02:c32 + add(rdi, rcx) // c += cs_c; + + vmovapd(ymm10, mem(rdx)) // store c42:c72 + add(rdi, rdx) // c += cs_c; + + vmovapd(ymm9, mem(rcx)) // store c03:c33 + add(rdi, rcx) // c += cs_c; + + vmovapd(ymm8, mem(rdx)) // store c43:c73 + + + label(.ZDONE) + : // output operands (none) : // input operands @@ -2528,3 +2531,4 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 "memory" ); } + diff --git a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c index e88978000..29ee9cdc9 100644 --- a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c +++ b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c @@ -32,9 +32,11 @@ */ -#include "bli_avx512_macros.h" #include "blis.h" +#define BLIS_ASM_SYNTAX_INTEL +#include "bli_x86_asm_macros.h" + #define LOADMUL8x8(a,o,s1,s3,s5,s7, \ z0,z1,z2,z3,z4,z5,z6,z7) \ \ @@ -125,156 +127,157 @@ void bli_dpackm_knl_asm_8xk const int64_t lda = lda_; const int64_t ldp = ldp_; - __asm__ volatile - ( - MOV(RSI, VAR(n)) - MOV(RAX, VAR(a)) - MOV(RBX, VAR(inca)) - MOV(RCX, VAR(lda)) - MOV(R14, VAR(p)) - MOV(RDI, VAR(ldp)) + BEGIN_ASM - TEST(RSI, RSI) + MOV(RSI, VAR(n)) + MOV(RAX, VAR(a)) + MOV(RBX, VAR(inca)) + MOV(RCX, VAR(lda)) + MOV(R14, VAR(p)) + MOV(RDI, VAR(ldp)) + + TEST(RSI, RSI) + JZ(PACK8_DONE) + + LEA(RBX, MEM(,RBX,8)) //inca in bytes + LEA(RCX, MEM(,RCX,8)) //lda in bytes + LEA(RDI, MEM(,RDI,8)) //ldp in bytes + LEA(R11, MEM(RDI,RDI,2)) //ldp*3 + LEA(R12, MEM(RDI,RDI,4)) //ldp*5 + LEA(R13, MEM(R11,RDI,4)) //ldp*7 + + VBROADCASTSD(ZMM(31), VAR(kappa)) + + CMP(RBX, IMM(8)) + JNE(PACK8_T) + + LABEL(PACK8_N) + + MOV(RDX, RSI) + AND(RDX, IMM(7)) 
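
The MOV/AND just above, together with the SAR that follows, split the column count for the unit-stride packing path into n/8 full 8x8 blocks and an n%8 tail (RDX keeps the low three bits, RSI is shifted down by three). The same remainder later drives the masked tail, where an opmask of (1 << r) - 1 covers the leftover columns before KMOVW loads it into k1. Sketched in C (function names illustrative):

    /* Split n for the 8-wide packing loops: full passes plus tail. */
    static void split_n( long n, long *full, unsigned *tail )
    {
        *tail = (unsigned)( n & 7 );   /* AND(RDX, IMM(7))  */
        *full = n >> 3;                /* SAR(RSI, IMM(3))  */
    }

    /* Tail opmask, one bit per active lane (MOV/SHLX/SUB/KMOVW). */
    static unsigned short tail_mask( unsigned r )   /* r = n % 8, 0 < r < 8 */
    {
        return (unsigned short)( ( 1u << r ) - 1u );
    }
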
+ SAR(RSI, IMM(3)) + JZ(PACK8_N_TAIL) + + LEA(R8, MEM(RCX,RCX,2)) //lda*3 + LEA(R9, MEM(RCX,RCX,4)) //lda*5 + LEA(R10, MEM(R8 ,RCX,4)) //lda*7 + + LABEL(PACK8_N_LOOP) + + LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) + STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7) + + LEA(RAX, MEM(RAX,RCX,8)) + LEA(R14, MEM(R14,RDI,8)) + + SUB(RSI, IMM(1)) + + JNZ(PACK8_N_LOOP) + + TEST(RDX, RDX) JZ(PACK8_DONE) - LEA(RBX, MEM(,RBX,8)) //inca in bytes - LEA(RCX, MEM(,RCX,8)) //lda in bytes - LEA(RDI, MEM(,RDI,8)) //ldp in bytes - LEA(R11, MEM(RDI,RDI,2)) //ldp*3 - LEA(R12, MEM(RDI,RDI,4)) //ldp*5 - LEA(R13, MEM(R11,RDI,4)) //ldp*7 + LABEL(PACK8_N_TAIL) - VBROADCASTSD(ZMM(31), VAR(kappa)) + VMULPD(ZMM(0), ZMM(31), MEM(RAX)) + VMOVUPD(MEM(R14), ZMM(0)) - CMP(RBX, IMM(8)) - JNE(PACK8_T) + LEA(RAX, MEM(RAX,RCX,1)) + LEA(R14, MEM(R14,RDI,1)) - LABEL(PACK8_N) + SUB(RDX, IMM(1)) - MOV(RDX, RSI) - AND(RDX, IMM(7)) - SAR(RSI, IMM(3)) - JZ(PACK8_N_TAIL) + JNZ(PACK8_N_TAIL) - LEA(R8, MEM(RCX,RCX,2)) //lda*3 - LEA(R9, MEM(RCX,RCX,4)) //lda*5 - LEA(R10, MEM(R8 ,RCX,4)) //lda*7 + JMP(PACK8_DONE) - LABEL(PACK8_N_LOOP) + LABEL(PACK8_T) - LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) - STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7) + CMP(RCX, IMM(8)) + JNE(PACK8_G) - LEA(RAX, MEM(RAX,RCX,8)) - LEA(R14, MEM(R14,RDI,8)) + LEA(R8, MEM(RBX,RBX,2)) //inca*3 + LEA(R9, MEM(RBX,RBX,4)) //inca*5 + LEA(R10, MEM(R8 ,RBX,4)) //inca*7 - SUB(RSI, IMM(1)) + MOV(RDX, RSI) + AND(RDX, IMM(7)) + SAR(RSI, IMM(3)) + JZ(PACK8_T_TAIL) - JNZ(PACK8_N_LOOP) + LABEL(PACK8_T_LOOP) - TEST(RDX, RDX) - JZ(PACK8_DONE) - - LABEL(PACK8_N_TAIL) - - VMULPD(ZMM(0), ZMM(31), MEM(RAX)) - VMOVUPD(MEM(R14), ZMM(0)) - - LEA(RAX, MEM(RAX,RCX,1)) - LEA(R14, MEM(R14,RDI,1)) - - SUB(RDX, IMM(1)) - - JNZ(PACK8_N_TAIL) - - JMP(PACK8_DONE) - - LABEL(PACK8_T) - - CMP(RCX, IMM(8)) - JNE(PACK8_G) - - LEA(R8, MEM(RBX,RBX,2)) //inca*3 - LEA(R9, MEM(RBX,RBX,4)) //inca*5 - LEA(R10, MEM(R8 ,RBX,4)) //inca*7 - - MOV(RDX, RSI) - AND(RDX, IMM(7)) - SAR(RSI, IMM(3)) - JZ(PACK8_T_TAIL) - - LABEL(PACK8_T_LOOP) - - LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) - TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, - 16,17,18,19,20,21,22,23) - STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) - - LEA(RAX, MEM(RAX,RCX,8)) - LEA(R14, MEM(R14,RDI,8)) - - SUB(RSI, IMM(1)) - - JNZ(PACK8_T_LOOP) - - TEST(RDX, RDX) - JZ(PACK8_DONE) - - LABEL(PACK8_T_TAIL) - - MOV(RSI, IMM(1)) - SHLX(RSI, RSI, RDX) - SUB(RSI, IMM(1)) - KMOV(K(1), ESI) //mask for n%8 elements - - LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1) + LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9,10,11,12,13,14,15) + 16,17,18,19,20,21,22,23) + STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) - VMOVUPD(MEM(R14 ), ZMM( 8)) - SUB(RDX, IMM(1)) - JZ(PACK8_DONE) - VMOVUPD(MEM(R14,RDI,1), ZMM( 9)) - SUB(RDX, IMM(1)) - JZ(PACK8_DONE) - VMOVUPD(MEM(R14,RDI,2), ZMM(10)) - SUB(RDX, IMM(1)) - JZ(PACK8_DONE) - VMOVUPD(MEM(R14,R11,1), ZMM(11)) - SUB(RDX, IMM(1)) - JZ(PACK8_DONE) - VMOVUPD(MEM(R14,RDI,4), ZMM(12)) - SUB(RDX, IMM(1)) - JZ(PACK8_DONE) - VMOVUPD(MEM(R14,R12,1), ZMM(13)) - SUB(RDX, IMM(1)) - JZ(PACK8_DONE) - VMOVUPD(MEM(R14,R11,2), ZMM(14)) + LEA(RAX, MEM(RAX,RCX,8)) + LEA(R14, MEM(R14,RDI,8)) - JMP(PACK8_DONE) + SUB(RSI, IMM(1)) - LABEL(PACK8_G) + JNZ(PACK8_T_LOOP) - VPBROADCASTD(ZMM(3), VAR(inca)) - MOV(RBX, VAR(offsetPtr)) - VPMULLD(YMM(0), YMM(3), MEM(RBX)) + TEST(RDX, RDX) + JZ(PACK8_DONE) - LABEL(PACK8_G_LOOP) + LABEL(PACK8_T_TAIL) - KXNORW(K(1), K(0), K(0)) - 
VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8)) - VMULPD(ZMM(3), ZMM(3), ZMM(31)) - VMOVUPD(MEM(R14), ZMM(3)) + MOV(RSI, IMM(1)) + SHLX(RSI, RSI, RDX) + SUB(RSI, IMM(1)) + KMOVW(K(1), ESI) //mask for n%8 elements - LEA(RAX, MEM(RAX,RCX,1)) - LEA(R14, MEM(R14,RDI,1)) + LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1) + TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9,10,11,12,13,14,15) - SUB(RSI, IMM(1)) + VMOVUPD(MEM(R14 ), ZMM( 8)) + SUB(RDX, IMM(1)) + JZ(PACK8_DONE) + VMOVUPD(MEM(R14,RDI,1), ZMM( 9)) + SUB(RDX, IMM(1)) + JZ(PACK8_DONE) + VMOVUPD(MEM(R14,RDI,2), ZMM(10)) + SUB(RDX, IMM(1)) + JZ(PACK8_DONE) + VMOVUPD(MEM(R14,R11,1), ZMM(11)) + SUB(RDX, IMM(1)) + JZ(PACK8_DONE) + VMOVUPD(MEM(R14,RDI,4), ZMM(12)) + SUB(RDX, IMM(1)) + JZ(PACK8_DONE) + VMOVUPD(MEM(R14,R12,1), ZMM(13)) + SUB(RDX, IMM(1)) + JZ(PACK8_DONE) + VMOVUPD(MEM(R14,R11,2), ZMM(14)) - JNZ(PACK8_G_LOOP) + JMP(PACK8_DONE) - LABEL(PACK8_DONE) + LABEL(PACK8_G) + VPBROADCASTD(ZMM(3), VAR(inca)) + MOV(RBX, VAR(offsetPtr)) + VPMULLD(YMM(0), YMM(3), MEM(RBX)) + + LABEL(PACK8_G_LOOP) + + KXNORW(K(1), K(0), K(0)) + VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8)) + VMULPD(ZMM(3), ZMM(3), ZMM(31)) + VMOVUPD(MEM(R14), ZMM(3)) + + LEA(RAX, MEM(RAX,RCX,1)) + LEA(R14, MEM(R14,RDI,1)) + + SUB(RSI, IMM(1)) + + JNZ(PACK8_G_LOOP) + + LABEL(PACK8_DONE) + + END_ASM( : //output operands : //input operands [n] "m" (n), @@ -294,7 +297,7 @@ void bli_dpackm_knl_asm_8xk "zmm30", "zmm31", "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" - ); + ) } void bli_dpackm_knl_asm_24xk @@ -441,7 +444,7 @@ void bli_dpackm_knl_asm_24xk MOV(R13, IMM(1)) SHLX(R13, R13, RSI) SUB(R13, IMM(1)) - KMOV(K(1), R13D) //mask for n%8 elements + KMOVW(K(1), R13D) //mask for n%8 elements LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1) LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1) diff --git a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c index 2a797ab36..961df18ef 100644 --- a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c +++ b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c @@ -32,10 +32,10 @@ */ -#include "bli_avx512_macros.h" #include "blis.h" -#include +#define BLIS_ASM_SYNTAX_INTEL +#include "bli_x86_asm_macros.h" #define LOADMUL8x8(a,o,s1,s3,s5,s7, \ z0,z1,z2,z3,z4,z5,z6,z7) \ diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c index cfe9d1a32..196372c1a 100644 --- a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c +++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c @@ -35,7 +35,8 @@ #include "blis.h" #include -#include "bli_avx512_macros.h" +#define BLIS_ASM_SYNTAX_INTEL +#include "bli_x86_asm_macros.h" #define UNROLL_K 32 @@ -212,8 +213,8 @@ void bli_dgemm_knl_asm_24x8 int tlooph, tloopl, blooph, bloopl; #endif - __asm__ volatile - ( + BEGIN_ASM + #ifdef MONITORS RDTSC MOV(VAR(topl), EAX) @@ -380,7 +381,7 @@ void bli_dgemm_knl_asm_24x8 JNZ(MAIN_LOOP) LABEL(REM_1) - SAR1(RDI) + SAR(RDI) JNC(REM_2) SUBITER(0,1,0,RAX) @@ -389,7 +390,7 @@ void bli_dgemm_knl_asm_24x8 ADD(RBX, IMM( 8*8)) LABEL(REM_2) - SAR1(RDI) + SAR(RDI) JNC(REM_4) SUBITER(0,1,0,RAX) @@ -398,7 +399,7 @@ void bli_dgemm_knl_asm_24x8 ADD(RBX, IMM(2* 8*8)) LABEL(REM_4) - SAR1(RDI) + SAR(RDI) JNC(REM_8) SUBITER(0,1,0,RAX) @@ -409,7 +410,7 @@ void bli_dgemm_knl_asm_24x8 ADD(RBX, IMM(4* 8*8)) LABEL(REM_8) - SAR1(RDI) + SAR(RDI) JNC(REM_16) SUBITER(0,1,0,RAX ) @@ -424,7 +425,7 @@ void bli_dgemm_knl_asm_24x8 ADD(RBX, IMM(8* 8*8)) LABEL(REM_16) - SAR1(RDI) + SAR(RDI) 
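
The REM_1 through REM_16 blocks above dispatch the k % 32 remainder in power-of-two batches: each one-operand SAR shifts the lowest remainder bit into the carry flag, and JNC skips that batch when the bit is clear (the old SAR1/SAL1 macro spellings become plain one-operand SAR/SAL in this patch). Equivalent C:

    /* Dispatch the k % 32 leftover rank-1 updates in chunks of
       1, 2, 4, 8, and 16 subiterations. */
    static void remainder_dispatch( unsigned k_rem )
    {
        if ( k_rem & 1  ) { /* 1 subiteration   */ }
        if ( k_rem & 2  ) { /* 2 subiterations  */ }
        if ( k_rem & 4  ) { /* 4 subiterations  */ }
        if ( k_rem & 8  ) { /* 8 subiterations  */ }
        if ( k_rem & 16 ) { /* 16 subiterations */ }
    }
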
JNC(AFTER_LOOP) SUBITER( 0,1,0,RAX ) @@ -570,7 +571,7 @@ void bli_dgemm_knl_asm_24x8 JNE(SCATTEREDUPDATE) VMOVQ(RDX, XMM(1)) - SAL1(RDX) //shift out sign bit + SAL(RDX) //shift out sign bit JZ(COLSTORBZ) UPDATE_C_FOUR_ROWS( 8, 9,10,11) @@ -602,7 +603,7 @@ void bli_dgemm_knl_asm_24x8 VPMULLD(ZMM(2), ZMM(3), ZMM(2)) VMOVQ(RDX, XMM(1)) - SAL1(RDX) //shift out sign bit + SAL(RDX) //shift out sign bit JZ(SCATTERBZ) UPDATE_C_ROW_SCATTERED( 8) @@ -666,6 +667,8 @@ void bli_dgemm_knl_asm_24x8 MOV(VAR(botl), EAX) MOV(VAR(both), EDX) #endif + + END_ASM( : // output operands #ifdef MONITORS [topl] "=m" (topl), @@ -696,7 +699,7 @@ void bli_dgemm_knl_asm_24x8 "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" - ); + ) #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c index 4042c9ab8..41e4b12aa 100644 --- a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c +++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c @@ -35,7 +35,8 @@ #include "blis.h" #include -#include "bli_avx512_macros.h" +#define BLIS_ASM_SYNTAX_INTEL +#include "bli_x86_asm_macros.h" #define UNROLL_K 32 @@ -377,7 +378,7 @@ void bli_sgemm_knl_asm_24x16 JNZ(MAIN_LOOP) LABEL(REM_1) - SAR1(RDI) + SAR(RDI) JNC(REM_2) SUBITER(0,1,0,RAX) @@ -386,7 +387,7 @@ void bli_sgemm_knl_asm_24x16 ADD(RBX, IMM(16*4)) LABEL(REM_2) - SAR1(RDI) + SAR(RDI) JNC(REM_4) SUBITER(0,1,0,RAX) @@ -395,7 +396,7 @@ void bli_sgemm_knl_asm_24x16 ADD(RBX, IMM(2*16*4)) LABEL(REM_4) - SAR1(RDI) + SAR(RDI) JNC(REM_8) SUBITER(0,1,0,RAX) @@ -406,7 +407,7 @@ void bli_sgemm_knl_asm_24x16 ADD(RBX, IMM(4*16*4)) LABEL(REM_8) - SAR1(RDI) + SAR(RDI) JNC(REM_16) SUBITER(0,1,0,RAX ) @@ -421,7 +422,7 @@ void bli_sgemm_knl_asm_24x16 ADD(RBX, IMM(8*16*4)) LABEL(REM_16) - SAR1(RDI) + SAR(RDI) JNC(AFTER_LOOP) SUBITER( 0,1,0,RAX ) @@ -567,7 +568,7 @@ void bli_sgemm_knl_asm_24x16 JNE(SCATTEREDUPDATE) VMOVD(EDX, XMM(1)) - SAL1(EDX) //shift out sign bit + SAL(EDX) //shift out sign bit JZ(COLSTORBZ) UPDATE_C_FOUR_ROWS( 8, 9,10,11) @@ -599,7 +600,7 @@ void bli_sgemm_knl_asm_24x16 VPMULLD(ZMM(2), ZMM(3), ZMM(2)) VMOVD(EDX, XMM(1)) - SAL1(EDX) //shift out sign bit + SAL(EDX) //shift out sign bit JZ(SCATTERBZ) UPDATE_C_ROW_SCATTERED( 8) diff --git a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c index 09c375810..98486a2e8 100644 --- a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c @@ -34,6 +34,9 @@ #include "blis.h" +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + void bli_sgemm_penryn_asm_8x4 ( dim_t k0, @@ -58,765 +61,765 @@ void bli_sgemm_penryn_asm_8x4 __asm__ volatile ( - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - "movq %9, %%r9 \n\t" // load address of b_next. - " \n\t" - "subq $-8 * 16, %%rax \n\t" // increment pointers to allow byte - "subq $-8 * 16, %%rbx \n\t" // offsets in the unrolled iterations. - " \n\t" - "movaps -8 * 16(%%rax), %%xmm0 \n\t" // initialize loop by pre-loading elements - "movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b. 
- "movaps -8 * 16(%%rbx), %%xmm2 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float) - "movq %%rdi, %%r12 \n\t" // make a copy of cs_c (in bytes) - "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c; - " \n\t" - "prefetcht2 0 * 4(%%r9) \n\t" // prefetch b_next - " \n\t" - "xorps %%xmm3, %%xmm3 \n\t" - "xorps %%xmm4, %%xmm4 \n\t" - "xorps %%xmm5, %%xmm5 \n\t" - "xorps %%xmm6, %%xmm6 \n\t" - " \n\t" - "prefetcht2 6 * 4(%%rcx) \n\t" // prefetch c + 0*cs_c - "xorps %%xmm8, %%xmm8 \n\t" - "xorps %%xmm9, %%xmm9 \n\t" - "prefetcht2 6 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "xorps %%xmm10, %%xmm10 \n\t" - "xorps %%xmm11, %%xmm11 \n\t" - "prefetcht2 6 * 4(%%r10) \n\t" // prefetch c + 2*cs_c - "xorps %%xmm12, %%xmm12 \n\t" - "xorps %%xmm13, %%xmm13 \n\t" - "prefetcht2 6 * 4(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c - "xorps %%xmm14, %%xmm14 \n\t" - "xorps %%xmm15, %%xmm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".SLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - "prefetcht0 (4*35+1) * 8(%%rax) \n\t" - " \n\t" - "addps %%xmm6, %%xmm10 \n\t" // iteration 0 - "addps %%xmm3, %%xmm14 \n\t" - "movaps %%xmm2, %%xmm3 \n\t" - "pshufd $0x39, %%xmm2, %%xmm7 \n\t" - "mulps %%xmm0, %%xmm2 \n\t" - "mulps %%xmm1, %%xmm3 \n\t" - " \n\t" - "addps %%xmm4, %%xmm11 \n\t" - "addps %%xmm5, %%xmm15 \n\t" - "movaps %%xmm7, %%xmm5 \n\t" - "pshufd $0x39, %%xmm7, %%xmm6 \n\t" - "mulps %%xmm0, %%xmm7 \n\t" - "mulps %%xmm1, %%xmm5 \n\t" - " \n\t" - "addps %%xmm2, %%xmm8 \n\t" - "movaps -7 * 16(%%rbx), %%xmm2 \n\t" - "addps %%xmm3, %%xmm12 \n\t" - "movaps %%xmm6, %%xmm3 \n\t" - "pshufd $0x39, %%xmm6, %%xmm4 \n\t" - "mulps %%xmm0, %%xmm6 \n\t" - "mulps %%xmm1, %%xmm3 \n\t" - " \n\t" - "addps %%xmm7, %%xmm9 \n\t" - "addps %%xmm5, %%xmm13 \n\t" - "movaps %%xmm4, %%xmm5 \n\t" - "mulps %%xmm0, %%xmm4 \n\t" - "movaps -6 * 16(%%rax), %%xmm0 \n\t" - "mulps %%xmm1, %%xmm5 \n\t" - "movaps -5 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - "addps %%xmm6, %%xmm10 \n\t" // iteration 1 - "addps %%xmm3, %%xmm14 \n\t" - "movaps %%xmm2, %%xmm3 \n\t" - "pshufd $0x39, %%xmm2, %%xmm7 \n\t" - "mulps %%xmm0, %%xmm2 \n\t" - "mulps %%xmm1, %%xmm3 \n\t" - " \n\t" - "addps %%xmm4, %%xmm11 \n\t" - "addps %%xmm5, %%xmm15 \n\t" - "movaps %%xmm7, %%xmm5 \n\t" - "pshufd $0x39, %%xmm7, %%xmm6 \n\t" - "mulps %%xmm0, %%xmm7 \n\t" - "mulps %%xmm1, %%xmm5 \n\t" - " \n\t" - "addps %%xmm2, %%xmm8 \n\t" - "movaps -6 * 16(%%rbx), %%xmm2 \n\t" - "addps %%xmm3, %%xmm12 \n\t" - "movaps %%xmm6, %%xmm3 \n\t" - "pshufd $0x39, %%xmm6, %%xmm4 \n\t" - "mulps %%xmm0, %%xmm6 \n\t" - "mulps %%xmm1, %%xmm3 \n\t" - " \n\t" - "addps %%xmm7, %%xmm9 \n\t" - "addps %%xmm5, %%xmm13 \n\t" - "movaps %%xmm4, %%xmm5 \n\t" - "mulps %%xmm0, %%xmm4 \n\t" - "movaps -4 * 16(%%rax), %%xmm0 \n\t" - "mulps %%xmm1, %%xmm5 \n\t" - "movaps -3 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - "addps %%xmm6, %%xmm10 \n\t" // iteration 2 - "addps %%xmm3, %%xmm14 \n\t" - "movaps %%xmm2, %%xmm3 \n\t" - "pshufd $0x39, %%xmm2, %%xmm7 \n\t" - "mulps %%xmm0, %%xmm2 \n\t" - "mulps %%xmm1, %%xmm3 \n\t" - " \n\t" - "addps %%xmm4, %%xmm11 \n\t" - "addps %%xmm5, %%xmm15 \n\t" - "movaps %%xmm7, %%xmm5 \n\t" - "pshufd $0x39, %%xmm7, %%xmm6 \n\t" - "mulps %%xmm0, %%xmm7 \n\t" - "mulps %%xmm1, 
%%xmm5 \n\t" - " \n\t" - "addps %%xmm2, %%xmm8 \n\t" - "movaps -5 * 16(%%rbx), %%xmm2 \n\t" - "addps %%xmm3, %%xmm12 \n\t" - "movaps %%xmm6, %%xmm3 \n\t" - "pshufd $0x39, %%xmm6, %%xmm4 \n\t" - "mulps %%xmm0, %%xmm6 \n\t" - "mulps %%xmm1, %%xmm3 \n\t" - " \n\t" - "addps %%xmm7, %%xmm9 \n\t" - "addps %%xmm5, %%xmm13 \n\t" - "movaps %%xmm4, %%xmm5 \n\t" - "mulps %%xmm0, %%xmm4 \n\t" - "movaps -2 * 16(%%rax), %%xmm0 \n\t" - "mulps %%xmm1, %%xmm5 \n\t" - "movaps -1 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - "addps %%xmm6, %%xmm10 \n\t" // iteration 3 - "addps %%xmm3, %%xmm14 \n\t" - "movaps %%xmm2, %%xmm3 \n\t" - "pshufd $0x39, %%xmm2, %%xmm7 \n\t" - "mulps %%xmm0, %%xmm2 \n\t" - "mulps %%xmm1, %%xmm3 \n\t" - " \n\t" - "subq $-4 * 8 * 4, %%rax \n\t" // a += 4*8 (unroll x mr) - " \n\t" - "addps %%xmm4, %%xmm11 \n\t" - "addps %%xmm5, %%xmm15 \n\t" - "movaps %%xmm7, %%xmm5 \n\t" - "pshufd $0x39, %%xmm7, %%xmm6 \n\t" - "mulps %%xmm0, %%xmm7 \n\t" - "mulps %%xmm1, %%xmm5 \n\t" - " \n\t" - "subq $-4 * 4 * 4, %%r9 \n\t" // b_next += 4*4 (unroll x nr) - " \n\t" - "addps %%xmm2, %%xmm8 \n\t" - "movaps -4 * 16(%%rbx), %%xmm2 \n\t" - "addps %%xmm3, %%xmm12 \n\t" - "movaps %%xmm6, %%xmm3 \n\t" - "pshufd $0x39, %%xmm6, %%xmm4 \n\t" - "mulps %%xmm0, %%xmm6 \n\t" - "mulps %%xmm1, %%xmm3 \n\t" - " \n\t" - "subq $-4 * 4 * 4, %%rbx \n\t" // b += 4*4 (unroll x nr) - " \n\t" - "addps %%xmm7, %%xmm9 \n\t" - "addps %%xmm5, %%xmm13 \n\t" - "movaps %%xmm4, %%xmm5 \n\t" - "mulps %%xmm0, %%xmm4 \n\t" - "movaps -8 * 16(%%rax), %%xmm0 \n\t" - "mulps %%xmm1, %%xmm5 \n\t" - "movaps -7 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - "prefetcht2 0 * 4(%%r9) \n\t" // prefetch b_next[0] - "prefetcht2 16 * 4(%%r9) \n\t" // prefetch b_next[16] - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".SCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".SLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "addps %%xmm6, %%xmm10 \n\t" // iteration 0 - "addps %%xmm3, %%xmm14 \n\t" - "movaps %%xmm2, %%xmm3 \n\t" - "pshufd $0x39, %%xmm2, %%xmm7 \n\t" - "mulps %%xmm0, %%xmm2 \n\t" - "mulps %%xmm1, %%xmm3 \n\t" - " \n\t" - "addps %%xmm4, %%xmm11 \n\t" - "addps %%xmm5, %%xmm15 \n\t" - "movaps %%xmm7, %%xmm5 \n\t" - "pshufd $0x39, %%xmm7, %%xmm6 \n\t" - "mulps %%xmm0, %%xmm7 \n\t" - "mulps %%xmm1, %%xmm5 \n\t" - " \n\t" - "addps %%xmm2, %%xmm8 \n\t" - "movaps -7 * 16(%%rbx), %%xmm2 \n\t" - "addps %%xmm3, %%xmm12 \n\t" - "movaps %%xmm6, %%xmm3 \n\t" - "pshufd $0x39, %%xmm6, %%xmm4 \n\t" - "mulps %%xmm0, %%xmm6 \n\t" - "mulps %%xmm1, %%xmm3 \n\t" - " \n\t" - "addps %%xmm7, %%xmm9 \n\t" - "addps %%xmm5, %%xmm13 \n\t" - "movaps %%xmm4, %%xmm5 \n\t" - "mulps %%xmm0, %%xmm4 \n\t" - "movaps -6 * 16(%%rax), %%xmm0 \n\t" - "mulps %%xmm1, %%xmm5 \n\t" - "movaps -5 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - "subq $-1 * 8 * 4, %%rax \n\t" // a += 8 (1 x mr) - "subq $-1 * 4 * 4, %%rbx \n\t" // b += 4 (1 x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".SPOSTACCUM: \n\t" - " \n\t" - "addps %%xmm6, %%xmm10 \n\t" - "addps %%xmm3, %%xmm14 \n\t" - "addps %%xmm4, %%xmm11 \n\t" - "addps %%xmm5, %%xmm15 \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "movss (%%rax), %%xmm6 \n\t" // load alpha to bottom 4 bytes of xmm6 - "movss (%%rbx), %%xmm7 \n\t" // load beta to bottom 4 bytes of xmm7 - "pshufd $0x00, %%xmm6, %%xmm6 \n\t" // populate xmm6 with four alphas - "pshufd $0x00, %%xmm7, %%xmm7 \n\t" // populate xmm7 with four betas - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "movq %%rsi, %%r8 \n\t" // make a copy of rs_c - " \n\t" - "leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float) - "leaq (%%rsi,%%rsi,2), %%r11 \n\t" // r11 = 3*(rs_c * sizeof(float)) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; - " \n\t" - " \n\t" // xmm8: xmm9: xmm10: xmm11: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab11 ab12 ab13 ab10 - " \n\t" // ab22 ab23 ab20 ab21 - " \n\t" // ab33 ) ab30 ) ab31 ) ab32 ) - " \n\t" // - " \n\t" // xmm12: xmm13: xmm14: xmm15: - " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43 - " \n\t" // ab51 ab52 ab53 ab50 - " \n\t" // ab62 ab63 ab60 ab61 - " \n\t" // ab73 ) ab70 ) ab71 ) ab72 ) - "movaps %%xmm9, %%xmm4 \n\t" - "shufps $0xd8, %%xmm8, %%xmm9 \n\t" - "shufps $0xd8, %%xmm11, %%xmm8 \n\t" - "shufps $0xd8, %%xmm10, %%xmm11\n\t" - "shufps $0xd8, %%xmm4, %%xmm10\n\t" - " \n\t" - "movaps %%xmm8, %%xmm4 \n\t" - "shufps $0xd8, %%xmm10, %%xmm8 \n\t" - "shufps $0xd8, %%xmm4, %%xmm10 \n\t" - "movaps %%xmm9, %%xmm5 \n\t" - "shufps $0xd8, %%xmm11, %%xmm9 \n\t" - "shufps $0xd8, %%xmm5, %%xmm11 \n\t" - " \n\t" - "movaps %%xmm13, %%xmm4 \n\t" - "shufps $0xd8, %%xmm12, %%xmm13\n\t" - "shufps $0xd8, %%xmm15, %%xmm12\n\t" - "shufps $0xd8, %%xmm14, %%xmm15\n\t" - "shufps $0xd8, %%xmm4, %%xmm14\n\t" - " \n\t" - "movaps %%xmm12, %%xmm4 \n\t" - "shufps $0xd8, %%xmm14, %%xmm12\n\t" - "shufps $0xd8, %%xmm4, %%xmm14 \n\t" - "movaps %%xmm13, %%xmm5 \n\t" - "shufps $0xd8, %%xmm15, %%xmm13\n\t" - "shufps $0xd8, %%xmm5, %%xmm15 \n\t" - " \n\t" // xmm8: xmm9: xmm10: xmm11: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab20 ab21 ab22 ab23 - " \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) - " \n\t" // - " \n\t" // xmm12: xmm13: xmm14: xmm15: - " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43 - " \n\t" // ab50 ab51 ab52 ab53 - " \n\t" // ab60 ab61 ab62 ab63 - " \n\t" // ab70 ) ab71 ) ab72 ) ab73 ) - " \n\t" - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 16 == 0, AND - " \n\t" // 8*cs_c % 16 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $1, %%r8 \n\t" // set ZF if rs_c == 1. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $15, %%rcx \n\t" // set ZF if c & 16 is zero. - "setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 ); - "testq $15, %%r12 \n\t" // set ZF if (4*cs_c) & 16 is zero. - "setz %%al \n\t" // al = ( ZF == 1 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero. - "ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0. - "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
- "jne .SCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".SGENSTORED: \n\t" - " \n\t" - "movlps (%%rcx ), %%xmm0 \n\t" // load c00 ~ c30 - "movhps (%%rcx,%%rsi,1), %%xmm0 \n\t" - "movlps (%%rcx,%%rsi,2), %%xmm1 \n\t" - "movhps (%%rcx,%%r11 ), %%xmm1 \n\t" - "shufps $0x88, %%xmm1, %%xmm0 \n\t" - " \n\t" - "mulps %%xmm6, %%xmm8 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm8, %%xmm0 \n\t" // add the gemm result, - " \n\t" - "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rcx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - " \n\t" - "movlps (%%rdx ), %%xmm0 \n\t" // load c40 ~ c70 - "movhps (%%rdx,%%rsi,1), %%xmm0 \n\t" - "movlps (%%rdx,%%rsi,2), %%xmm1 \n\t" - "movhps (%%rdx,%%r11 ), %%xmm1 \n\t" - "shufps $0x88, %%xmm1, %%xmm0 \n\t" - " \n\t" - "mulps %%xmm6, %%xmm12 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm12, %%xmm0 \n\t" // add the gemm result, - " \n\t" - "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rdx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "movlps (%%rcx ), %%xmm0 \n\t" // load c01 ~ c31 - "movhps (%%rcx,%%rsi,1), %%xmm0 \n\t" - "movlps (%%rcx,%%rsi,2), %%xmm1 \n\t" - "movhps (%%rcx,%%r11 ), %%xmm1 \n\t" - "shufps $0x88, %%xmm1, %%xmm0 \n\t" - " \n\t" - "mulps %%xmm6, %%xmm9 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm9, %%xmm0 \n\t" // add the gemm result, - " \n\t" - "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rcx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - " \n\t" - "movlps (%%rdx ), %%xmm0 \n\t" // load c41 ~ c71 - "movhps (%%rdx,%%rsi,1), %%xmm0 \n\t" - "movlps (%%rdx,%%rsi,2), %%xmm1 \n\t" - "movhps (%%rdx,%%r11 ), %%xmm1 \n\t" - "shufps $0x88, %%xmm1, %%xmm0 \n\t" - " \n\t" - "mulps %%xmm6, %%xmm13 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm13, %%xmm0 \n\t" // add the gemm result, - " \n\t" - "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rdx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "movlps (%%rcx ), %%xmm0 \n\t" // load c02 ~ c32 - "movhps (%%rcx,%%rsi,1), %%xmm0 \n\t" - "movlps (%%rcx,%%rsi,2), %%xmm1 \n\t" - "movhps (%%rcx,%%r11 ), %%xmm1 \n\t" - "shufps $0x88, %%xmm1, %%xmm0 \n\t" - " \n\t" - "mulps %%xmm6, %%xmm10 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm10, %%xmm0 \n\t" // add the gemm result, - " \n\t" - "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. 
- "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rcx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - " \n\t" - "movlps (%%rdx ), %%xmm0 \n\t" // load c42 ~ c72 - "movhps (%%rdx,%%rsi,1), %%xmm0 \n\t" - "movlps (%%rdx,%%rsi,2), %%xmm1 \n\t" - "movhps (%%rdx,%%r11 ), %%xmm1 \n\t" - "shufps $0x88, %%xmm1, %%xmm0 \n\t" - " \n\t" - "mulps %%xmm6, %%xmm14 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm14, %%xmm0 \n\t" // add the gemm result, - " \n\t" - "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rdx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "movlps (%%rcx ), %%xmm0 \n\t" // load c03 ~ c33 - "movhps (%%rcx,%%rsi,1), %%xmm0 \n\t" - "movlps (%%rcx,%%rsi,2), %%xmm1 \n\t" - "movhps (%%rcx,%%r11 ), %%xmm1 \n\t" - "shufps $0x88, %%xmm1, %%xmm0 \n\t" - " \n\t" - "mulps %%xmm6, %%xmm11 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm11, %%xmm0 \n\t" // add the gemm result, - " \n\t" - "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rcx,%%r11 ) \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movlps (%%rdx ), %%xmm0 \n\t" // load c43 ~ c73 - "movhps (%%rdx,%%rsi,1), %%xmm0 \n\t" - "movlps (%%rdx,%%rsi,2), %%xmm1 \n\t" - "movhps (%%rdx,%%r11 ), %%xmm1 \n\t" - "shufps $0x88, %%xmm1, %%xmm0 \n\t" - " \n\t" - "mulps %%xmm6, %%xmm15 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm15, %%xmm0 \n\t" // add the gemm result, - " \n\t" - "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rdx,%%r11 ) \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORED: \n\t" - " \n\t" - "movaps (%%rcx), %%xmm0 \n\t" // load c00 ~ c30, - "mulps %%xmm6, %%xmm8 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm8, %%xmm0 \n\t" // add the gemm result, - "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" - " \n\t" - "movaps (%%rdx), %%xmm1 \n\t" // load c40 ~ c70, - "mulps %%xmm6, %%xmm12 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm1 \n\t" // scale by beta, - "addps %%xmm12, %%xmm1 \n\t" // add the gemm result, - "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "movaps (%%rcx), %%xmm0 \n\t" // load c01 ~ c31, - "mulps %%xmm6, %%xmm9 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm9, %%xmm0 \n\t" // add the gemm result, - "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. 
- "addq %%rdi, %%rcx \n\t" - " \n\t" - "movaps (%%rdx), %%xmm1 \n\t" // load c41 ~ c71, - "mulps %%xmm6, %%xmm13 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm1 \n\t" // scale by beta, - "addps %%xmm13, %%xmm1 \n\t" // add the gemm result, - "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "movaps (%%rcx), %%xmm0 \n\t" // load c02 ~ c32, - "mulps %%xmm6, %%xmm10 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm10, %%xmm0 \n\t" // add the gemm result, - "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" - " \n\t" - "movaps (%%rdx), %%xmm1 \n\t" // load c42 ~ c72, - "mulps %%xmm6, %%xmm14 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm1 \n\t" // scale by beta, - "addps %%xmm14, %%xmm1 \n\t" // add the gemm result, - "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "movaps (%%rcx), %%xmm0 \n\t" // load c03 ~ c33, - "mulps %%xmm6, %%xmm11 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm0 \n\t" // scale by beta, - "addps %%xmm11, %%xmm0 \n\t" // add the gemm result, - "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. - " \n\t" - " \n\t" - "movaps (%%rdx), %%xmm1 \n\t" // load c43 ~ c73, - "mulps %%xmm6, %%xmm15 \n\t" // scale by alpha, - "mulps %%xmm7, %%xmm1 \n\t" // scale by beta, - "addps %%xmm15, %%xmm1 \n\t" // add the gemm result, - "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .SCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".SGENSTORBZ: \n\t" - " \n\t" - "mulps %%xmm6, %%xmm8 \n\t" // scale by alpha, - "movaps %%xmm8, %%xmm0 \n\t" - " \n\t" - "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rcx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - " \n\t" - "mulps %%xmm6, %%xmm12 \n\t" // scale by alpha, - "movaps %%xmm12, %%xmm0 \n\t" - " \n\t" - "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rdx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "mulps %%xmm6, %%xmm9 \n\t" // scale by alpha, - "movaps %%xmm9, %%xmm0 \n\t" - " \n\t" - "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rcx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - " \n\t" - "mulps %%xmm6, %%xmm13 \n\t" // scale by alpha, - "movaps %%xmm13, %%xmm0 \n\t" - " \n\t" - "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. 
- "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rdx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "mulps %%xmm6, %%xmm10 \n\t" // scale by alpha, - "movaps %%xmm10, %%xmm0 \n\t" - " \n\t" - "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rcx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - " \n\t" - "mulps %%xmm6, %%xmm14 \n\t" // scale by alpha, - "movaps %%xmm14, %%xmm0 \n\t" - " \n\t" - "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rdx,%%r11 ) \n\t" - " \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "mulps %%xmm6, %%xmm11 \n\t" // scale by alpha, - "movaps %%xmm11, %%xmm0 \n\t" - " \n\t" - "movss %%xmm0, (%%rcx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rcx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rcx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rcx,%%r11 ) \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "mulps %%xmm6, %%xmm15 \n\t" // scale by alpha, - "movaps %%xmm15, %%xmm0 \n\t" - " \n\t" - "movss %%xmm0, (%%rdx ) \n\t" // and store back to memory. - "pshufd $0x39, %%xmm0, %%xmm1 \n\t" - "movss %%xmm1, (%%rdx,%%rsi,1) \n\t" - "pshufd $0x39, %%xmm1, %%xmm2 \n\t" - "movss %%xmm2, (%%rdx,%%rsi,2) \n\t" - "pshufd $0x39, %%xmm2, %%xmm3 \n\t" - "movss %%xmm3, (%%rdx,%%r11 ) \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORBZ: \n\t" - " \n\t" - " \n\t" // skip loading c00 ~ c30, - "mulps %%xmm6, %%xmm8 \n\t" // scale by alpha, - "movaps %%xmm8, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" - " \n\t" // skip loading c40 ~ c70, - "mulps %%xmm6, %%xmm12 \n\t" // scale by alpha, - "movaps %%xmm12, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" // skip loading c01 ~ c31, - "mulps %%xmm6, %%xmm9 \n\t" // scale by alpha, - "movaps %%xmm9, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" - " \n\t" // skip loading c41 ~ c71, - "mulps %%xmm6, %%xmm13 \n\t" // scale by alpha, - "movaps %%xmm13, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" // skip loading c02 ~ c32, - "mulps %%xmm6, %%xmm10 \n\t" // scale by alpha, - "movaps %%xmm10, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" - " \n\t" // skip loading c42 ~ c72, - "mulps %%xmm6, %%xmm14 \n\t" // scale by alpha, - "movaps %%xmm14, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" // skip loading c03 ~ c33, - "mulps %%xmm6, %%xmm11 \n\t" // scale by alpha, - "movaps %%xmm11, (%%rcx) \n\t" // and store back to memory. - " \n\t" - " \n\t" // skip loading c43 ~ c73, - "mulps %%xmm6, %%xmm15 \n\t" // scale by alpha, - "movaps %%xmm15, (%%rdx) \n\t" // and store back to memory. 
- " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SDONE: \n\t" - " \n\t" + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + mov(%9, r9) // load address of b_next. + + sub(imm(0-8*16), rax) // increment pointers to allow byte + sub(imm(0-8*16), rbx) // offsets in the unrolled iterations. + + movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements + movaps(mem(rax, -7*16), xmm1) // of a and b. + movaps(mem(rbx, -8*16), xmm2) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) + mov(rdi, r12) // make a copy of cs_c (in bytes) + lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; + + prefetch(2, mem(r9, 0*4)) // prefetch b_next + + xorps(xmm3, xmm3) + xorps(xmm4, xmm4) + xorps(xmm5, xmm5) + xorps(xmm6, xmm6) + + prefetch(2, mem(rcx, 6*4)) // prefetch c + 0*cs_c + xorps(xmm8, xmm8) + xorps(xmm9, xmm9) + prefetch(2, mem(rcx, rdi, 1, 6*4)) // prefetch c + 1*cs_c + xorps(xmm10, xmm10) + xorps(xmm11, xmm11) + prefetch(2, mem(r10, 6*4)) // prefetch c + 2*cs_c + xorps(xmm12, xmm12) + xorps(xmm13, xmm13) + prefetch(2, mem(r10, rdi, 1, 6*4)) // prefetch c + 3*cs_c + xorps(xmm14, xmm14) + xorps(xmm15, xmm15) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.SLOOPKITER) // MAIN LOOP + + prefetch(0, mem(4*35+1)*8(rax)) + + addps(xmm6, xmm10) // iteration 0 + addps(xmm3, xmm14) + movaps(xmm2, xmm3) + pshufd(imm(0x39), xmm2, xmm7) + mulps(xmm0, xmm2) + mulps(xmm1, xmm3) + + addps(xmm4, xmm11) + addps(xmm5, xmm15) + movaps(xmm7, xmm5) + pshufd(imm(0x39), xmm7, xmm6) + mulps(xmm0, xmm7) + mulps(xmm1, xmm5) + + addps(xmm2, xmm8) + movaps(mem(rbx, -7*16), xmm2) + addps(xmm3, xmm12) + movaps(xmm6, xmm3) + pshufd(imm(0x39), xmm6, xmm4) + mulps(xmm0, xmm6) + mulps(xmm1, xmm3) + + addps(xmm7, xmm9) + addps(xmm5, xmm13) + movaps(xmm4, xmm5) + mulps(xmm0, xmm4) + movaps(mem(rax, -6*16), xmm0) + mulps(xmm1, xmm5) + movaps(mem(rax, -5*16), xmm1) + + + addps(xmm6, xmm10) // iteration 1 + addps(xmm3, xmm14) + movaps(xmm2, xmm3) + pshufd(imm(0x39), xmm2, xmm7) + mulps(xmm0, xmm2) + mulps(xmm1, xmm3) + + addps(xmm4, xmm11) + addps(xmm5, xmm15) + movaps(xmm7, xmm5) + pshufd(imm(0x39), xmm7, xmm6) + mulps(xmm0, xmm7) + mulps(xmm1, xmm5) + + addps(xmm2, xmm8) + movaps(mem(rbx, -6*16), xmm2) + addps(xmm3, xmm12) + movaps(xmm6, xmm3) + pshufd(imm(0x39), xmm6, xmm4) + mulps(xmm0, xmm6) + mulps(xmm1, xmm3) + + addps(xmm7, xmm9) + addps(xmm5, xmm13) + movaps(xmm4, xmm5) + mulps(xmm0, xmm4) + movaps(mem(rax, -4*16), xmm0) + mulps(xmm1, xmm5) + movaps(mem(rax, -3*16), xmm1) + + + addps(xmm6, xmm10) // iteration 2 + addps(xmm3, xmm14) + movaps(xmm2, xmm3) + pshufd(imm(0x39), xmm2, xmm7) + mulps(xmm0, xmm2) + mulps(xmm1, xmm3) + + addps(xmm4, xmm11) + addps(xmm5, xmm15) + movaps(xmm7, xmm5) + pshufd(imm(0x39), xmm7, xmm6) + mulps(xmm0, xmm7) + mulps(xmm1, xmm5) + + addps(xmm2, xmm8) + movaps(mem(rbx, -5*16), xmm2) + addps(xmm3, xmm12) + movaps(xmm6, xmm3) + pshufd(imm(0x39), xmm6, xmm4) + mulps(xmm0, xmm6) + mulps(xmm1, xmm3) + + addps(xmm7, xmm9) + addps(xmm5, xmm13) + movaps(xmm4, xmm5) + mulps(xmm0, xmm4) + movaps(mem(rax, -2*16), xmm0) + mulps(xmm1, xmm5) + movaps(mem(rax, -1*16), xmm1) + + + addps(xmm6, xmm10) // iteration 3 + addps(xmm3, xmm14) + movaps(xmm2, xmm3) + pshufd(imm(0x39), xmm2, xmm7) + mulps(xmm0, xmm2) + mulps(xmm1, xmm3) + + 
sub(imm(0-4*8*4), rax) // a += 4*8 (unroll x mr) + + addps(xmm4, xmm11) + addps(xmm5, xmm15) + movaps(xmm7, xmm5) + pshufd(imm(0x39), xmm7, xmm6) + mulps(xmm0, xmm7) + mulps(xmm1, xmm5) + + sub(imm(0-4*4*4), r9) // b_next += 4*4 (unroll x nr) + + addps(xmm2, xmm8) + movaps(mem(rbx, -4*16), xmm2) + addps(xmm3, xmm12) + movaps(xmm6, xmm3) + pshufd(imm(0x39), xmm6, xmm4) + mulps(xmm0, xmm6) + mulps(xmm1, xmm3) + + sub(imm(0-4*4*4), rbx) // b += 4*4 (unroll x nr) + + addps(xmm7, xmm9) + addps(xmm5, xmm13) + movaps(xmm4, xmm5) + mulps(xmm0, xmm4) + movaps(mem(rax, -8*16), xmm0) + mulps(xmm1, xmm5) + movaps(mem(rax, -7*16), xmm1) + + prefetch(2, mem(r9, 0*4)) // prefetch b_next[0] + prefetch(2, mem(r9, 16*4)) // prefetch b_next[16] + + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + + + label(.SCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.SLOOPKLEFT) // EDGE LOOP + + addps(xmm6, xmm10) // iteration 0 + addps(xmm3, xmm14) + movaps(xmm2, xmm3) + pshufd(imm(0x39), xmm2, xmm7) + mulps(xmm0, xmm2) + mulps(xmm1, xmm3) + + addps(xmm4, xmm11) + addps(xmm5, xmm15) + movaps(xmm7, xmm5) + pshufd(imm(0x39), xmm7, xmm6) + mulps(xmm0, xmm7) + mulps(xmm1, xmm5) + + addps(xmm2, xmm8) + movaps(mem(rbx, -7*16), xmm2) + addps(xmm3, xmm12) + movaps(xmm6, xmm3) + pshufd(imm(0x39), xmm6, xmm4) + mulps(xmm0, xmm6) + mulps(xmm1, xmm3) + + addps(xmm7, xmm9) + addps(xmm5, xmm13) + movaps(xmm4, xmm5) + mulps(xmm0, xmm4) + movaps(mem(rax, -6*16), xmm0) + mulps(xmm1, xmm5) + movaps(mem(rax, -5*16), xmm1) + + sub(imm(0-1*8*4), rax) // a += 8 (1 x mr) + sub(imm(0-1*4*4), rbx) // b += 4 (1 x nr) + + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. 
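// A note on the pointer-update idiom above: the kernel advances a and b with
// sub(imm(0-N), reg) rather than add(imm(N), reg). Subtracting the negated
// constant performs the same addition, but constants such as 0-8*16 == -128
// still fit x86's sign-extended 8-bit immediate encoding, whereas +128 would
// force a 32-bit immediate and a longer instruction; that encoding benefit is
// presumably the motivation. A standalone C sketch of the arithmetic (values
// made up for illustration):

#include <assert.h>

int main(void)
{
	long a = 1000;
	a -= 0 - 4*8*4;    // identical to a += 128, i.e. one unrolled a-step
	assert( a == 1128 );
	return 0;
}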
+ + + + label(.SPOSTACCUM) + + addps(xmm6, xmm10) + addps(xmm3, xmm14) + addps(xmm4, xmm11) + addps(xmm5, xmm15) + + + mov(%4, rax) // load address of alpha + mov(%5, rbx) // load address of beta + movss(mem(rax), xmm6) // load alpha to bottom 4 bytes of xmm6 + movss(mem(rbx), xmm7) // load beta to bottom 4 bytes of xmm7 + pshufd(imm(0x00), xmm6, xmm6) // populate xmm6 with four alphas + pshufd(imm(0x00), xmm7, xmm7) // populate xmm7 with four betas + + + mov(%7, rsi) // load rs_c + mov(rsi, r8) // make a copy of rs_c + + lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) + lea(mem(rsi, rsi, 2), r11) // r11 = 3*(rs_c * sizeof(float)) + + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; + + // xmm8: xmm9: xmm10: xmm11: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab11 ab12 ab13 ab10 + // ab22 ab23 ab20 ab21 + // ab33 ) ab30 ) ab31 ) ab32 ) + // + // xmm12: xmm13: xmm14: xmm15: + // ( ab40 ( ab41 ( ab42 ( ab43 + // ab51 ab52 ab53 ab50 + // ab62 ab63 ab60 ab61 + // ab73 ) ab70 ) ab71 ) ab72 ) + movaps(xmm9, xmm4) + shufps(imm(0xd8), xmm8, xmm9) + shufps(imm(0xd8), xmm11, xmm8) + shufps(imm(0xd8), xmm10, xmm11) + shufps(imm(0xd8), xmm4, xmm10) + + movaps(xmm8, xmm4) + shufps(imm(0xd8), xmm10, xmm8) + shufps(imm(0xd8), xmm4, xmm10) + movaps(xmm9, xmm5) + shufps(imm(0xd8), xmm11, xmm9) + shufps(imm(0xd8), xmm5, xmm11) + + movaps(xmm13, xmm4) + shufps(imm(0xd8), xmm12, xmm13) + shufps(imm(0xd8), xmm15, xmm12) + shufps(imm(0xd8), xmm14, xmm15) + shufps(imm(0xd8), xmm4, xmm14) + + movaps(xmm12, xmm4) + shufps(imm(0xd8), xmm14, xmm12) + shufps(imm(0xd8), xmm4, xmm14) + movaps(xmm13, xmm5) + shufps(imm(0xd8), xmm15, xmm13) + shufps(imm(0xd8), xmm5, xmm15) + // xmm8: xmm9: xmm10: xmm11: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ) ab31 ) ab32 ) ab33 ) + // + // xmm12: xmm13: xmm14: xmm15: + // ( ab40 ( ab41 ( ab42 ( ab43 + // ab50 ab51 ab52 ab53 + // ab60 ab61 ab62 ab63 + // ab70 ) ab71 ) ab72 ) ab73 ) + + + + // determine if + // c % 16 == 0, AND + // 8*cs_c % 16 == 0, AND + // rs_c == 1 + // ie: aligned, ldim aligned, and + // column-stored + + cmp(imm(1), r8) // set ZF if rs_c == 1. + sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); + test(imm(15), rcx) // set ZF if c & 16 is zero. + setz(bh) // bh = ( ZF == 1 ? 1 : 0 ); + test(imm(15), r12) // set ZF if (4*cs_c) & 16 is zero. + setz(al) // al = ( ZF == 1 ? 1 : 0 ); + // and(bl,bh) followed by + // and(bh,al) will reveal result + + // now avoid loading C if beta == 0 + + xorpd(xmm0, xmm0) // set xmm0 to zero. + ucomisd(xmm0, xmm7) // check if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. + jne(.SCOLSTORED) // jump to column storage case + + + + label(.SGENSTORED) + + movlps(mem(rcx), xmm0) // load c00 ~ c30 + movhps(mem(rcx, rsi, 1), xmm0) + movlps(mem(rcx, rsi, 2), xmm1) + movhps(mem(rcx, r11, 1), xmm1) + shufps(imm(0x88), xmm1, xmm0) + + mulps(xmm6, xmm8) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm8, xmm0) // add the gemm result, + + movss(xmm0, mem(rcx)) // and store back to memory. 
+ pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rcx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rcx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rcx, r11, 1)) + + add(rdi, rcx) + + + movlps(mem(rdx), xmm0) // load c40 ~ c70 + movhps(mem(rdx, rsi, 1), xmm0) + movlps(mem(rdx, rsi, 2), xmm1) + movhps(mem(rdx, r11, 1), xmm1) + shufps(imm(0x88), xmm1, xmm0) + + mulps(xmm6, xmm12) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm12, xmm0) // add the gemm result, + + movss(xmm0, mem(rdx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rdx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rdx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rdx, r11, 1)) + + add(rdi, rdx) + + + movlps(mem(rcx), xmm0) // load c01 ~ c31 + movhps(mem(rcx, rsi, 1), xmm0) + movlps(mem(rcx, rsi, 2), xmm1) + movhps(mem(rcx, r11, 1), xmm1) + shufps(imm(0x88), xmm1, xmm0) + + mulps(xmm6, xmm9) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm9, xmm0) // add the gemm result, + + movss(xmm0, mem(rcx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rcx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rcx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rcx, r11, 1)) + + add(rdi, rcx) + + + movlps(mem(rdx), xmm0) // load c41 ~ c71 + movhps(mem(rdx, rsi, 1), xmm0) + movlps(mem(rdx, rsi, 2), xmm1) + movhps(mem(rdx, r11, 1), xmm1) + shufps(imm(0x88), xmm1, xmm0) + + mulps(xmm6, xmm13) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm13, xmm0) // add the gemm result, + + movss(xmm0, mem(rdx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rdx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rdx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rdx, r11, 1)) + + add(rdi, rdx) + + + movlps(mem(rcx), xmm0) // load c02 ~ c32 + movhps(mem(rcx, rsi, 1), xmm0) + movlps(mem(rcx, rsi, 2), xmm1) + movhps(mem(rcx, r11, 1), xmm1) + shufps(imm(0x88), xmm1, xmm0) + + mulps(xmm6, xmm10) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm10, xmm0) // add the gemm result, + + movss(xmm0, mem(rcx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rcx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rcx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rcx, r11, 1)) + + add(rdi, rcx) + + + movlps(mem(rdx), xmm0) // load c42 ~ c72 + movhps(mem(rdx, rsi, 1), xmm0) + movlps(mem(rdx, rsi, 2), xmm1) + movhps(mem(rdx, r11, 1), xmm1) + shufps(imm(0x88), xmm1, xmm0) + + mulps(xmm6, xmm14) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm14, xmm0) // add the gemm result, + + movss(xmm0, mem(rdx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rdx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rdx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rdx, r11, 1)) + + add(rdi, rdx) + + + movlps(mem(rcx), xmm0) // load c03 ~ c33 + movhps(mem(rcx, rsi, 1), xmm0) + movlps(mem(rcx, rsi, 2), xmm1) + movhps(mem(rcx, r11, 1), xmm1) + shufps(imm(0x88), xmm1, xmm0) + + mulps(xmm6, xmm11) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm11, xmm0) // add the gemm result, + + movss(xmm0, mem(rcx)) // and store back to memory. 
+ pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rcx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rcx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rcx, r11, 1)) + + + + + movlps(mem(rdx), xmm0) // load c43 ~ c73 + movhps(mem(rdx, rsi, 1), xmm0) + movlps(mem(rdx, rsi, 2), xmm1) + movhps(mem(rdx, r11, 1), xmm1) + shufps(imm(0x88), xmm1, xmm0) + + mulps(xmm6, xmm15) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm15, xmm0) // add the gemm result, + + movss(xmm0, mem(rdx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rdx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rdx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rdx, r11, 1)) + + + + + jmp(.SDONE) // jump to end. + + + + label(.SCOLSTORED) + + movaps(mem(rcx), xmm0) // load c00 ~ c30, + mulps(xmm6, xmm8) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm8, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c40 ~ c70, + mulps(xmm6, xmm12) // scale by alpha, + mulps(xmm7, xmm1) // scale by beta, + addps(xmm12, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c01 ~ c31, + mulps(xmm6, xmm9) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm9, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c41 ~ c71, + mulps(xmm6, xmm13) // scale by alpha, + mulps(xmm7, xmm1) // scale by beta, + addps(xmm13, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c02 ~ c32, + mulps(xmm6, xmm10) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm10, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c42 ~ c72, + mulps(xmm6, xmm14) // scale by alpha, + mulps(xmm7, xmm1) // scale by beta, + addps(xmm14, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c03 ~ c33, + mulps(xmm6, xmm11) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm11, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + + + movaps(mem(rdx), xmm1) // load c43 ~ c73, + mulps(xmm6, xmm15) // scale by alpha, + mulps(xmm7, xmm1) // scale by beta, + addps(xmm15, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + + jmp(.SDONE) // jump to end. + + + + + label(.SBETAZERO) + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. + jne(.SCOLSTORBZ) // jump to column storage case + + + + label(.SGENSTORBZ) + + mulps(xmm6, xmm8) // scale by alpha, + movaps(xmm8, xmm0) + + movss(xmm0, mem(rcx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rcx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rcx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rcx, r11, 1)) + + add(rdi, rcx) + + + mulps(xmm6, xmm12) // scale by alpha, + movaps(xmm12, xmm0) + + movss(xmm0, mem(rdx)) // and store back to memory. 
+ pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rdx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rdx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rdx, r11, 1)) + + add(rdi, rdx) + + + mulps(xmm6, xmm9) // scale by alpha, + movaps(xmm9, xmm0) + + movss(xmm0, mem(rcx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rcx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rcx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rcx, r11, 1)) + + add(rdi, rcx) + + + mulps(xmm6, xmm13) // scale by alpha, + movaps(xmm13, xmm0) + + movss(xmm0, mem(rdx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rdx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rdx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rdx, r11, 1)) + + add(rdi, rdx) + + + mulps(xmm6, xmm10) // scale by alpha, + movaps(xmm10, xmm0) + + movss(xmm0, mem(rcx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rcx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rcx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rcx, r11, 1)) + + add(rdi, rcx) + + + mulps(xmm6, xmm14) // scale by alpha, + movaps(xmm14, xmm0) + + movss(xmm0, mem(rdx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rdx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rdx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rdx, r11, 1)) + + add(rdi, rdx) + + + mulps(xmm6, xmm11) // scale by alpha, + movaps(xmm11, xmm0) + + movss(xmm0, mem(rcx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rcx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rcx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rcx, r11, 1)) + + + + + mulps(xmm6, xmm15) // scale by alpha, + movaps(xmm15, xmm0) + + movss(xmm0, mem(rdx)) // and store back to memory. + pshufd(imm(0x39), xmm0, xmm1) + movss(xmm1, mem(rdx, rsi, 1)) + pshufd(imm(0x39), xmm1, xmm2) + movss(xmm2, mem(rdx, rsi, 2)) + pshufd(imm(0x39), xmm2, xmm3) + movss(xmm3, mem(rdx, r11, 1)) + + + + + jmp(.SDONE) // jump to end. + + + + label(.SCOLSTORBZ) + + // skip loading c00 ~ c30, + mulps(xmm6, xmm8) // scale by alpha, + movaps(xmm8, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c40 ~ c70, + mulps(xmm6, xmm12) // scale by alpha, + movaps(xmm12, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + // skip loading c01 ~ c31, + mulps(xmm6, xmm9) // scale by alpha, + movaps(xmm9, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c41 ~ c71, + mulps(xmm6, xmm13) // scale by alpha, + movaps(xmm13, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + // skip loading c02 ~ c32, + mulps(xmm6, xmm10) // scale by alpha, + movaps(xmm10, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c42 ~ c72, + mulps(xmm6, xmm14) // scale by alpha, + movaps(xmm14, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + // skip loading c03 ~ c33, + mulps(xmm6, xmm11) // scale by alpha, + movaps(xmm11, mem(rcx)) // and store back to memory. + + // skip loading c43 ~ c73, + mulps(xmm6, xmm15) // scale by alpha, + movaps(xmm15, mem(rdx)) // and store back to memory. + + + + + + + + + label(.SDONE) + : // output operands (none) : // input operands @@ -864,605 +867,605 @@ void bli_dgemm_penryn_asm_4x4 __asm__ volatile ( - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. 
- "movq %3, %%rbx \n\t" // load address of b. - "movq %9, %%r9 \n\t" // load address of b_next. - "movq %10, %%r11 \n\t" // load address of a_next. - " \n\t" - "subq $-8 * 16, %%rax \n\t" // increment pointers to allow byte - "subq $-8 * 16, %%rbx \n\t" // offsets in the unrolled iterations. - " \n\t" - "movaps -8 * 16(%%rax), %%xmm0 \n\t" // initialize loop by pre-loading elements - "movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b. - "movaps -8 * 16(%%rbx), %%xmm2 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double) - "movq %%rdi, %%r12 \n\t" // make a copy of cs_c (in bytes) - "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c; - " \n\t" - "prefetcht2 0 * 8(%%r9) \n\t" // prefetch b_next - " \n\t" - "xorpd %%xmm3, %%xmm3 \n\t" - "xorpd %%xmm4, %%xmm4 \n\t" - "xorpd %%xmm5, %%xmm5 \n\t" - "xorpd %%xmm6, %%xmm6 \n\t" - " \n\t" - "prefetcht2 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "xorpd %%xmm8, %%xmm8 \n\t" - "xorpd %%xmm9, %%xmm9 \n\t" - "prefetcht2 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "xorpd %%xmm10, %%xmm10 \n\t" - "xorpd %%xmm11, %%xmm11 \n\t" - "prefetcht2 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c - "xorpd %%xmm12, %%xmm12 \n\t" - "xorpd %%xmm13, %%xmm13 \n\t" - "prefetcht2 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c - "xorpd %%xmm14, %%xmm14 \n\t" - "xorpd %%xmm15, %%xmm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - "prefetcht0 (4*35+1) * 8(%%rax) \n\t" - //"prefetcht0 (8*97+4) * 8(%%rax) \n\t" - " \n\t" - //"prefetcht0 67*4 * 8(%%r11) \n\t" // prefetch a_next[0] - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 0 - "movaps -7 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -6 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -6 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -5 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 1 - "movaps -5 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -4 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 
\n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -4 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -3 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - "prefetcht0 (4*37+1) * 8(%%rax) \n\t" - //"prefetcht0 (8*97+12)* 8(%%rax) \n\t" - " \n\t" - //"prefetcht0 69*4 * 8(%%r11) \n\t" // prefetch a_next[8] - //"subq $-4 * 4 * 8, %%r11 \n\t" // a_next += 4*4 (unroll x mr) - " \n\t" - " \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 2 - "movaps -3 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -2 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -2 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -1 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 3 - "movaps -1 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "subq $-4 * 4 * 8, %%rax \n\t" // a += 4*4 (unroll x mr) - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "subq $-4 * 4 * 8, %%r9 \n\t" // b_next += 4*4 (unroll x nr) - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps 0 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "subq $-4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -8 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -7 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - "prefetcht2 0 * 8(%%r9) \n\t" // prefetch b_next[0] - "prefetcht2 8 * 8(%%r9) \n\t" // prefetch b_next[8] - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - //"prefetcht2 -8 * 8(%%r9) \n\t" // prefetch b_next[-8] - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. 
- " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 0 - "movaps -7 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -6 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -6 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -5 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - "subq $-4 * 1 * 8, %%rax \n\t" // a += 4 (1 x mr) - "subq $-4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "movddup (%%rax), %%xmm6 \n\t" // load alpha and duplicate - "movddup (%%rbx), %%xmm7 \n\t" // load beta and duplicate - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "movq %%rsi, %%r8 \n\t" // make a copy of rs_c - " \n\t" - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double) - " \n\t" - "leaq (%%rcx,%%rsi,2), %%rdx \n\t" // load address of c + 2*rs_c; - " \n\t" - " \n\t" // xmm8: xmm9: xmm10: xmm11: - " \n\t" // ( ab01 ( ab00 ( ab03 ( ab02 - " \n\t" // ab10 ) ab11 ) ab12 ) ab13 ) - " \n\t" // - " \n\t" // xmm12: xmm13: xmm14: xmm15: - " \n\t" // ( ab21 ( ab20 ( ab23 ( ab22 - " \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) - "movaps %%xmm8, %%xmm0 \n\t" - "movsd %%xmm9, %%xmm8 \n\t" - "movsd %%xmm0, %%xmm9 \n\t" - " \n\t" - "movaps %%xmm10, %%xmm0 \n\t" - "movsd %%xmm11, %%xmm10 \n\t" - "movsd %%xmm0, %%xmm11 \n\t" - " \n\t" - "movaps %%xmm12, %%xmm0 \n\t" - "movsd %%xmm13, %%xmm12 \n\t" - "movsd %%xmm0, %%xmm13 \n\t" - " \n\t" - "movaps %%xmm14, %%xmm0 \n\t" - "movsd %%xmm15, %%xmm14 \n\t" - "movsd %%xmm0, %%xmm15 \n\t" - " \n\t" // xmm8: xmm9: xmm10: xmm11: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ) ab11 ) ab12 ) ab13 ) - " \n\t" // - " \n\t" // xmm12: xmm13: xmm14: xmm15: - " \n\t" // ( ab20 ( ab21 ( ab22 ( ab23 - " \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) - " \n\t" - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 16 == 0, AND - " \n\t" // 8*cs_c % 16 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $1, %%r8 \n\t" // set ZF if rs_c == 1. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $15, %%rcx \n\t" // set ZF if c & 16 is zero. - "setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 ); - "testq $15, %%r12 \n\t" // set ZF if (8*cs_c) & 16 is zero. - "setz %%al \n\t" // al = ( ZF == 1 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero. 
- "ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0. - "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORED: \n\t" - " \n\t" - "movlpd (%%rcx), %%xmm0 \n\t" // load c00 and c10, - "movhpd (%%rcx,%%rsi), %%xmm0 \n\t" - "mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm0 \n\t" // scale by beta, - "addpd %%xmm8, %%xmm0 \n\t" // add the gemm result, - "movlpd %%xmm0, (%%rcx) \n\t" // and store back to memory. - "movhpd %%xmm0, (%%rcx,%%rsi) \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - "movlpd (%%rdx), %%xmm1 \n\t" // load c20 and c30, - "movhpd (%%rdx,%%rsi), %%xmm1 \n\t" - "mulpd %%xmm6, %%xmm12 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm1 \n\t" // scale by beta, - "addpd %%xmm12, %%xmm1 \n\t" // add the gemm result, - "movlpd %%xmm1, (%%rdx) \n\t" // and store back to memory. - "movhpd %%xmm1, (%%rdx,%%rsi) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "movlpd (%%rcx), %%xmm0 \n\t" // load c01 and c11, - "movhpd (%%rcx,%%rsi), %%xmm0 \n\t" - "mulpd %%xmm6, %%xmm9 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm0 \n\t" // scale by beta, - "addpd %%xmm9, %%xmm0 \n\t" // add the gemm result, - "movlpd %%xmm0, (%%rcx) \n\t" // and store back to memory. - "movhpd %%xmm0, (%%rcx,%%rsi) \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - "movlpd (%%rdx), %%xmm1 \n\t" // load c21 and c31, - "movhpd (%%rdx,%%rsi), %%xmm1 \n\t" - "mulpd %%xmm6, %%xmm13 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm1 \n\t" // scale by beta, - "addpd %%xmm13, %%xmm1 \n\t" // add the gemm result, - "movlpd %%xmm1, (%%rdx) \n\t" // and store back to memory. - "movhpd %%xmm1, (%%rdx,%%rsi) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "movlpd (%%rcx), %%xmm0 \n\t" // load c02 and c12, - "movhpd (%%rcx,%%rsi), %%xmm0 \n\t" - "mulpd %%xmm6, %%xmm10 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm0 \n\t" // scale by beta, - "addpd %%xmm10, %%xmm0 \n\t" // add the gemm result, - "movlpd %%xmm0, (%%rcx) \n\t" // and store back to memory. - "movhpd %%xmm0, (%%rcx,%%rsi) \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - "movlpd (%%rdx), %%xmm1 \n\t" // load c22 and c32, - "movhpd (%%rdx,%%rsi), %%xmm1 \n\t" - "mulpd %%xmm6, %%xmm14 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm1 \n\t" // scale by beta, - "addpd %%xmm14, %%xmm1 \n\t" // add the gemm result, - "movlpd %%xmm1, (%%rdx) \n\t" // and store back to memory. - "movhpd %%xmm1, (%%rdx,%%rsi) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "movlpd (%%rcx), %%xmm0 \n\t" // load c03 and c13, - "movhpd (%%rcx,%%rsi), %%xmm0 \n\t" - "mulpd %%xmm6, %%xmm11 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm0 \n\t" // scale by beta, - "addpd %%xmm11, %%xmm0 \n\t" // add the gemm result, - "movlpd %%xmm0, (%%rcx) \n\t" // and store back to memory. - "movhpd %%xmm0, (%%rcx,%%rsi) \n\t" - " \n\t" - " \n\t" - "movlpd (%%rdx), %%xmm1 \n\t" // load c23 and c33, - "movhpd (%%rdx,%%rsi), %%xmm1 \n\t" - "mulpd %%xmm6, %%xmm15 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm1 \n\t" // scale by beta, - "addpd %%xmm15, %%xmm1 \n\t" // add the gemm result, - "movlpd %%xmm1, (%%rdx) \n\t" // and store back to memory. - "movhpd %%xmm1, (%%rdx,%%rsi) \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORED: \n\t" - " \n\t" - "movaps (%%rcx), %%xmm0 \n\t" // load c00 and c10, - "mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm0 \n\t" // scale by beta, - "addpd %%xmm8, %%xmm0 \n\t" // add the gemm result, - "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" - " \n\t" - "movaps (%%rdx), %%xmm1 \n\t" // load c20 and c30, - "mulpd %%xmm6, %%xmm12 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm1 \n\t" // scale by beta, - "addpd %%xmm12, %%xmm1 \n\t" // add the gemm result, - "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "movaps (%%rcx), %%xmm0 \n\t" // load c01 and c11, - "mulpd %%xmm6, %%xmm9 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm0 \n\t" // scale by beta, - "addpd %%xmm9, %%xmm0 \n\t" // add the gemm result, - "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" - " \n\t" - "movaps (%%rdx), %%xmm1 \n\t" // load c21 and c31, - "mulpd %%xmm6, %%xmm13 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm1 \n\t" // scale by beta, - "addpd %%xmm13, %%xmm1 \n\t" // add the gemm result, - "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "movaps (%%rcx), %%xmm0 \n\t" // load c02 and c12, - "mulpd %%xmm6, %%xmm10 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm0 \n\t" // scale by beta, - "addpd %%xmm10, %%xmm0 \n\t" // add the gemm result, - "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" - " \n\t" - "movaps (%%rdx), %%xmm1 \n\t" // load c22 and c32, - "mulpd %%xmm6, %%xmm14 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm1 \n\t" // scale by beta, - "addpd %%xmm14, %%xmm1 \n\t" // add the gemm result, - "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "movaps (%%rcx), %%xmm0 \n\t" // load c03 and c13, - "mulpd %%xmm6, %%xmm11 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm0 \n\t" // scale by beta, - "addpd %%xmm11, %%xmm0 \n\t" // add the gemm result, - "movaps %%xmm0, (%%rcx) \n\t" // and store back to memory. - " \n\t" - " \n\t" - "movaps (%%rdx), %%xmm1 \n\t" // load c23 and c33, - "mulpd %%xmm6, %%xmm15 \n\t" // scale by alpha, - "mulpd %%xmm7, %%xmm1 \n\t" // scale by beta, - "addpd %%xmm15, %%xmm1 \n\t" // add the gemm result, - "movaps %%xmm1, (%%rdx) \n\t" // and store back to memory. - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORBZ: \n\t" - " \n\t" // skip loading c00 and c10, - "mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha, - "movlpd %%xmm8, (%%rcx) \n\t" // and store back to memory. - "movhpd %%xmm8, (%%rcx,%%rsi) \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" // skip loading c20 and c30, - "mulpd %%xmm6, %%xmm12 \n\t" // scale by alpha, - "movlpd %%xmm12, (%%rdx) \n\t" // and store back to memory. - "movhpd %%xmm12, (%%rdx,%%rsi) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" // skip loading c01 and c11, - "mulpd %%xmm6, %%xmm9 \n\t" // scale by alpha, - "movlpd %%xmm9, (%%rcx) \n\t" // and store back to memory. 
- "movhpd %%xmm9, (%%rcx,%%rsi) \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" // skip loading c21 and c31, - "mulpd %%xmm6, %%xmm13 \n\t" // scale by alpha, - "movlpd %%xmm13, (%%rdx) \n\t" // and store back to memory. - "movhpd %%xmm13, (%%rdx,%%rsi) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" // skip loading c02 and c12, - "mulpd %%xmm6, %%xmm10 \n\t" // scale by alpha, - "movlpd %%xmm10, (%%rcx) \n\t" // and store back to memory. - "movhpd %%xmm10, (%%rcx,%%rsi) \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" // skip loading c22 and c32, - "mulpd %%xmm6, %%xmm14 \n\t" // scale by alpha, - "movlpd %%xmm14, (%%rdx) \n\t" // and store back to memory. - "movhpd %%xmm14, (%%rdx,%%rsi) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" // skip loading c03 and c13, - "mulpd %%xmm6, %%xmm11 \n\t" // scale by alpha, - "movlpd %%xmm11, (%%rcx) \n\t" // and store back to memory. - "movhpd %%xmm11, (%%rcx,%%rsi) \n\t" - " \n\t" - " \n\t" // skip loading c23 and c33, - "mulpd %%xmm6, %%xmm15 \n\t" // scale by alpha, - "movlpd %%xmm15, (%%rdx) \n\t" // and store back to memory. - "movhpd %%xmm15, (%%rdx,%%rsi) \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORBZ: \n\t" - " \n\t" - " \n\t" // skip loading c00 and c10, - "mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha, - "movaps %%xmm8, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" - " \n\t" // skip loading c20 and c30, - "mulpd %%xmm6, %%xmm12 \n\t" // scale by alpha, - "movaps %%xmm12, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" // skip loading c01 and c11, - "mulpd %%xmm6, %%xmm9 \n\t" // scale by alpha, - "movaps %%xmm9, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" - " \n\t" // skip loading c21 and c31, - "mulpd %%xmm6, %%xmm13 \n\t" // scale by alpha, - "movaps %%xmm13, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" // skip loading c02 and c12, - "mulpd %%xmm6, %%xmm10 \n\t" // scale by alpha, - "movaps %%xmm10, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" - " \n\t" // skip loading c22 and c32, - "mulpd %%xmm6, %%xmm14 \n\t" // scale by alpha, - "movaps %%xmm14, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" // skip loading c03 and c13, - "mulpd %%xmm6, %%xmm11 \n\t" // scale by alpha, - "movaps %%xmm11, (%%rcx) \n\t" // and store back to memory. - " \n\t" - " \n\t" // skip loading c23 and c33, - "mulpd %%xmm6, %%xmm15 \n\t" // scale by alpha, - "movaps %%xmm15, (%%rdx) \n\t" // and store back to memory. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DDONE: \n\t" - " \n\t" + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + mov(%9, r9) // load address of b_next. + mov(%10, r11) // load address of a_next. + + sub(imm(0-8*16), rax) // increment pointers to allow byte + sub(imm(0-8*16), rbx) // offsets in the unrolled iterations. + + movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements + movaps(mem(rax, -7*16), xmm1) // of a and b. 
+	movaps(mem(rbx, -8*16), xmm2)
+
+	mov(%6, rcx) // load address of c
+	mov(%8, rdi) // load cs_c
+	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
+	mov(rdi, r12) // make a copy of cs_c (in bytes)
+	lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
+
+	prefetch(2, mem(r9, 0*8)) // prefetch b_next
+
+	xorpd(xmm3, xmm3)
+	xorpd(xmm4, xmm4)
+	xorpd(xmm5, xmm5)
+	xorpd(xmm6, xmm6)
+
+	prefetch(2, mem(rcx, 3*8)) // prefetch c + 0*cs_c
+	xorpd(xmm8, xmm8)
+	xorpd(xmm9, xmm9)
+	prefetch(2, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c
+	xorpd(xmm10, xmm10)
+	xorpd(xmm11, xmm11)
+	prefetch(2, mem(r10, 3*8)) // prefetch c + 2*cs_c
+	xorpd(xmm12, xmm12)
+	xorpd(xmm13, xmm13)
+	prefetch(2, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c
+	xorpd(xmm14, xmm14)
+	xorpd(xmm15, xmm15)
+
+
+
+	mov(%0, rsi) // i = k_iter;
+	test(rsi, rsi) // check i via logical AND.
+	je(.DCONSIDKLEFT) // if i == 0, jump to code that
+	                  // contains the k_left loop.
+
+
+	label(.DLOOPKITER) // MAIN LOOP
+
+	prefetch(0, mem(rax, (4*35+1)*8))
+	//prefetch(0, mem(rax, (8*97+4)*8))
+
+	//prefetch(0, mem(r11, 67*4*8)) // prefetch a_next[0]
+
+	addpd(xmm3, xmm11) // iteration 0
+	movaps(mem(rbx, -7*16), xmm3)
+	addpd(xmm4, xmm15)
+	movaps(xmm2, xmm4)
+	pshufd(imm(0x4e), xmm2, xmm7)
+	mulpd(xmm0, xmm2)
+	mulpd(xmm1, xmm4)
+
+	addpd(xmm5, xmm10)
+	addpd(xmm6, xmm14)
+	movaps(xmm7, xmm6)
+	mulpd(xmm0, xmm7)
+	mulpd(xmm1, xmm6)
+
+	addpd(xmm2, xmm9)
+	movaps(mem(rbx, -6*16), xmm2)
+	addpd(xmm4, xmm13)
+	movaps(xmm3, xmm4)
+	pshufd(imm(0x4e), xmm3, xmm5)
+	mulpd(xmm0, xmm3)
+	mulpd(xmm1, xmm4)
+
+	addpd(xmm7, xmm8)
+	addpd(xmm6, xmm12)
+	movaps(xmm5, xmm6)
+	mulpd(xmm0, xmm5)
+	movaps(mem(rax, -6*16), xmm0)
+	mulpd(xmm1, xmm6)
+	movaps(mem(rax, -5*16), xmm1)
+
+
+
+	addpd(xmm3, xmm11) // iteration 1
+	movaps(mem(rbx, -5*16), xmm3)
+	addpd(xmm4, xmm15)
+	movaps(xmm2, xmm4)
+	pshufd(imm(0x4e), xmm2, xmm7)
+	mulpd(xmm0, xmm2)
+	mulpd(xmm1, xmm4)
+
+	addpd(xmm5, xmm10)
+	addpd(xmm6, xmm14)
+	movaps(xmm7, xmm6)
+	mulpd(xmm0, xmm7)
+	mulpd(xmm1, xmm6)
+
+	addpd(xmm2, xmm9)
+	movaps(mem(rbx, -4*16), xmm2)
+	addpd(xmm4, xmm13)
+	movaps(xmm3, xmm4)
+	pshufd(imm(0x4e), xmm3, xmm5)
+	mulpd(xmm0, xmm3)
+	mulpd(xmm1, xmm4)
+
+	addpd(xmm7, xmm8)
+	addpd(xmm6, xmm12)
+	movaps(xmm5, xmm6)
+	mulpd(xmm0, xmm5)
+	movaps(mem(rax, -4*16), xmm0)
+	mulpd(xmm1, xmm6)
+	movaps(mem(rax, -3*16), xmm1)
+
+
+	prefetch(0, mem(rax, (4*37+1)*8))
+	//prefetch(0, mem(rax, (8*97+12)*8))
+
+	//prefetch(0, mem(r11, 69*4*8)) // prefetch a_next[8]
+	//sub(imm(-4*4*8), r11) // a_next += 4*4 (unroll x mr)
+
+
+
+	addpd(xmm3, xmm11) // iteration 2
+	movaps(mem(rbx, -3*16), xmm3)
+	addpd(xmm4, xmm15)
+	movaps(xmm2, xmm4)
+	pshufd(imm(0x4e), xmm2, xmm7)
+	mulpd(xmm0, xmm2)
+	mulpd(xmm1, xmm4)
+
+	addpd(xmm5, xmm10)
+	addpd(xmm6, xmm14)
+	movaps(xmm7, xmm6)
+	mulpd(xmm0, xmm7)
+	mulpd(xmm1, xmm6)
+
+	addpd(xmm2, xmm9)
+	movaps(mem(rbx, -2*16), xmm2)
+	addpd(xmm4, xmm13)
+	movaps(xmm3, xmm4)
+	pshufd(imm(0x4e), xmm3, xmm5)
+	mulpd(xmm0, xmm3)
+	mulpd(xmm1, xmm4)
+
+
+	addpd(xmm7, xmm8)
+	addpd(xmm6, xmm12)
+	movaps(xmm5, xmm6)
+	mulpd(xmm0, xmm5)
+	movaps(mem(rax, -2*16), xmm0)
+	mulpd(xmm1, xmm6)
+	movaps(mem(rax, -1*16), xmm1)
+
+
+
+	addpd(xmm3, xmm11) // iteration 3
+	movaps(mem(rbx, -1*16), xmm3)
+	addpd(xmm4, xmm15)
+	movaps(xmm2, xmm4)
+	pshufd(imm(0x4e), xmm2, xmm7)
+	mulpd(xmm0, xmm2)
+	mulpd(xmm1, xmm4)
+
+	sub(imm(0-4*4*8), rax) // a += 4*4 (unroll x mr)
+
+	addpd(xmm5, xmm10)
+	addpd(xmm6, xmm14)
+	movaps(xmm7, xmm6)
+	mulpd(xmm0, xmm7)
+	mulpd(xmm1, xmm6)
+
+
sub(imm(0-4*4*8), r9) // b_next += 4*4 (unroll x nr) + + addpd(xmm2, xmm9) + movaps(mem(rbx, 0*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + sub(imm(0-4*4*8), rbx) // b += 4*4 (unroll x nr) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -8*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -7*16), xmm1) + + prefetch(2, mem(r9, 0*8)) // prefetch b_next[0] + prefetch(2, mem(r9, 8*8)) // prefetch b_next[8] + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + //prefetch(2, mem(r9, -8*8)) // prefetch b_next[-8] + + + + label(.DCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + + addpd(xmm3, xmm11) // iteration 0 + movaps(mem(rbx, -7*16), xmm3) + addpd(xmm4, xmm15) + movaps(xmm2, xmm4) + pshufd(imm(0x4e), xmm2, xmm7) + mulpd(xmm0, xmm2) + mulpd(xmm1, xmm4) + + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + movaps(xmm7, xmm6) + mulpd(xmm0, xmm7) + mulpd(xmm1, xmm6) + + addpd(xmm2, xmm9) + movaps(mem(rbx, -6*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -6*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -5*16), xmm1) + + + sub(imm(0-4*1*8), rax) // a += 4 (1 x mr) + sub(imm(0-4*1*8), rbx) // b += 4 (1 x nr) + + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + addpd(xmm3, xmm11) + addpd(xmm4, xmm15) + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + + + mov(%4, rax) // load address of alpha + mov(%5, rbx) // load address of beta + movddup(mem(rax), xmm6) // load alpha and duplicate + movddup(mem(rbx), xmm7) // load beta and duplicate + + + mov(%7, rsi) // load rs_c + mov(rsi, r8) // make a copy of rs_c + + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + + lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; + + // xmm8: xmm9: xmm10: xmm11: + // ( ab01 ( ab00 ( ab03 ( ab02 + // ab10 ) ab11 ) ab12 ) ab13 ) + // + // xmm12: xmm13: xmm14: xmm15: + // ( ab21 ( ab20 ( ab23 ( ab22 + // ab30 ) ab31 ) ab32 ) ab33 ) + movaps(xmm8, xmm0) + movsd(xmm9, xmm8) + movsd(xmm0, xmm9) + + movaps(xmm10, xmm0) + movsd(xmm11, xmm10) + movsd(xmm0, xmm11) + + movaps(xmm12, xmm0) + movsd(xmm13, xmm12) + movsd(xmm0, xmm13) + + movaps(xmm14, xmm0) + movsd(xmm15, xmm14) + movsd(xmm0, xmm15) + // xmm8: xmm9: xmm10: xmm11: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ) ab11 ) ab12 ) ab13 ) + // + // xmm12: xmm13: xmm14: xmm15: + // ( ab20 ( ab21 ( ab22 ( ab23 + // ab30 ) ab31 ) ab32 ) ab33 ) + + + + // determine if + // c % 16 == 0, AND + // 8*cs_c % 16 == 0, AND + // rs_c == 1 + // ie: aligned, ldim aligned, and + // column-stored + + cmp(imm(1), r8) // set ZF if rs_c == 1. + sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); + test(imm(15), rcx) // set ZF if c & 16 is zero. + setz(bh) // bh = ( ZF == 1 ? 1 : 0 ); + test(imm(15), r12) // set ZF if (8*cs_c) & 16 is zero. + setz(al) // al = ( ZF == 1 ? 1 : 0 ); + // and(bl,bh) followed by + // and(bh,al) will reveal result + + // now avoid loading C if beta == 0 + + xorpd(xmm0, xmm0) // set xmm0 to zero. + ucomisd(xmm0, xmm7) // check if beta == 0. 
+ je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. + jne(.DCOLSTORED) // jump to column storage case + + + + label(.DGENSTORED) + + movlpd(mem(rcx), xmm0) // load c00 and c10, + movhpd(mem(rcx, rsi, 1), xmm0) + mulpd(xmm6, xmm8) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm8, xmm0) // add the gemm result, + movlpd(xmm0, mem(rcx)) // and store back to memory. + movhpd(xmm0, mem(rcx, rsi, 1)) + add(rdi, rcx) + + movlpd(mem(rdx), xmm1) // load c20 and c30, + movhpd(mem(rdx, rsi, 1), xmm1) + mulpd(xmm6, xmm12) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm12, xmm1) // add the gemm result, + movlpd(xmm1, mem(rdx)) // and store back to memory. + movhpd(xmm1, mem(rdx, rsi, 1)) + add(rdi, rdx) + + + + movlpd(mem(rcx), xmm0) // load c01 and c11, + movhpd(mem(rcx, rsi, 1), xmm0) + mulpd(xmm6, xmm9) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm9, xmm0) // add the gemm result, + movlpd(xmm0, mem(rcx)) // and store back to memory. + movhpd(xmm0, mem(rcx, rsi, 1)) + add(rdi, rcx) + + movlpd(mem(rdx), xmm1) // load c21 and c31, + movhpd(mem(rdx, rsi, 1), xmm1) + mulpd(xmm6, xmm13) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm13, xmm1) // add the gemm result, + movlpd(xmm1, mem(rdx)) // and store back to memory. + movhpd(xmm1, mem(rdx, rsi, 1)) + add(rdi, rdx) + + + + movlpd(mem(rcx), xmm0) // load c02 and c12, + movhpd(mem(rcx, rsi, 1), xmm0) + mulpd(xmm6, xmm10) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm10, xmm0) // add the gemm result, + movlpd(xmm0, mem(rcx)) // and store back to memory. + movhpd(xmm0, mem(rcx, rsi, 1)) + add(rdi, rcx) + + movlpd(mem(rdx), xmm1) // load c22 and c32, + movhpd(mem(rdx, rsi, 1), xmm1) + mulpd(xmm6, xmm14) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm14, xmm1) // add the gemm result, + movlpd(xmm1, mem(rdx)) // and store back to memory. + movhpd(xmm1, mem(rdx, rsi, 1)) + add(rdi, rdx) + + + + movlpd(mem(rcx), xmm0) // load c03 and c13, + movhpd(mem(rcx, rsi, 1), xmm0) + mulpd(xmm6, xmm11) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm11, xmm0) // add the gemm result, + movlpd(xmm0, mem(rcx)) // and store back to memory. + movhpd(xmm0, mem(rcx, rsi, 1)) + + + movlpd(mem(rdx), xmm1) // load c23 and c33, + movhpd(mem(rdx, rsi, 1), xmm1) + mulpd(xmm6, xmm15) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm15, xmm1) // add the gemm result, + movlpd(xmm1, mem(rdx)) // and store back to memory. + movhpd(xmm1, mem(rdx, rsi, 1)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + movaps(mem(rcx), xmm0) // load c00 and c10, + mulpd(xmm6, xmm8) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm8, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c20 and c30, + mulpd(xmm6, xmm12) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm12, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c01 and c11, + mulpd(xmm6, xmm9) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm9, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. 
+ add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c21 and c31, + mulpd(xmm6, xmm13) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm13, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c02 and c12, + mulpd(xmm6, xmm10) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm10, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c22 and c32, + mulpd(xmm6, xmm14) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm14, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c03 and c13, + mulpd(xmm6, xmm11) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm11, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + + + movaps(mem(rdx), xmm1) // load c23 and c33, + mulpd(xmm6, xmm15) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm15, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + + jmp(.DDONE) // jump to end. + + + + + label(.DBETAZERO) + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. + jne(.DCOLSTORBZ) // jump to column storage case + + + + label(.DGENSTORBZ) + // skip loading c00 and c10, + mulpd(xmm6, xmm8) // scale by alpha, + movlpd(xmm8, mem(rcx)) // and store back to memory. + movhpd(xmm8, mem(rcx, rsi, 1)) + add(rdi, rcx) + // skip loading c20 and c30, + mulpd(xmm6, xmm12) // scale by alpha, + movlpd(xmm12, mem(rdx)) // and store back to memory. + movhpd(xmm12, mem(rdx, rsi, 1)) + add(rdi, rdx) + + + // skip loading c01 and c11, + mulpd(xmm6, xmm9) // scale by alpha, + movlpd(xmm9, mem(rcx)) // and store back to memory. + movhpd(xmm9, mem(rcx, rsi, 1)) + add(rdi, rcx) + // skip loading c21 and c31, + mulpd(xmm6, xmm13) // scale by alpha, + movlpd(xmm13, mem(rdx)) // and store back to memory. + movhpd(xmm13, mem(rdx, rsi, 1)) + add(rdi, rdx) + + + // skip loading c02 and c12, + mulpd(xmm6, xmm10) // scale by alpha, + movlpd(xmm10, mem(rcx)) // and store back to memory. + movhpd(xmm10, mem(rcx, rsi, 1)) + add(rdi, rcx) + // skip loading c22 and c32, + mulpd(xmm6, xmm14) // scale by alpha, + movlpd(xmm14, mem(rdx)) // and store back to memory. + movhpd(xmm14, mem(rdx, rsi, 1)) + add(rdi, rdx) + + + // skip loading c03 and c13, + mulpd(xmm6, xmm11) // scale by alpha, + movlpd(xmm11, mem(rcx)) // and store back to memory. + movhpd(xmm11, mem(rcx, rsi, 1)) + + // skip loading c23 and c33, + mulpd(xmm6, xmm15) // scale by alpha, + movlpd(xmm15, mem(rdx)) // and store back to memory. + movhpd(xmm15, mem(rdx, rsi, 1)) + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + // skip loading c00 and c10, + mulpd(xmm6, xmm8) // scale by alpha, + movaps(xmm8, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c20 and c30, + mulpd(xmm6, xmm12) // scale by alpha, + movaps(xmm12, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + // skip loading c01 and c11, + mulpd(xmm6, xmm9) // scale by alpha, + movaps(xmm9, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c21 and c31, + mulpd(xmm6, xmm13) // scale by alpha, + movaps(xmm13, mem(rdx)) // and store back to memory. 
+ add(rdi, rdx) + + + // skip loading c02 and c12, + mulpd(xmm6, xmm10) // scale by alpha, + movaps(xmm10, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c22 and c32, + mulpd(xmm6, xmm14) // scale by alpha, + movaps(xmm14, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + // skip loading c03 and c13, + mulpd(xmm6, xmm11) // scale by alpha, + movaps(xmm11, mem(rcx)) // and store back to memory. + + // skip loading c23 and c33, + mulpd(xmm6, xmm15) // scale by alpha, + movaps(xmm15, mem(rdx)) // and store back to memory. + + + + + + + + + label(.DDONE) + : // output operands (none) : // input operands @@ -1487,3 +1490,4 @@ void bli_dgemm_penryn_asm_4x4 ); } + diff --git a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c index 847b712ba..992d17967 100644 --- a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c @@ -34,6 +34,9 @@ #include "blis.h" +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + #if 0 void bli_sgemmtrsm_l_penryn_asm_8x4 ( @@ -75,446 +78,446 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 __asm__ volatile ( - " \n\t" - "movq %2, %%rax \n\t" // load address of a10. - "movq %4, %%rbx \n\t" // load address of b01. - //"movq %10, %%r9 \n\t" // load address of b_next. - " \n\t" - "subq $-8 * 16, %%rax \n\t" // increment pointers to allow byte - "subq $-8 * 16, %%rbx \n\t" // offsets in the unrolled iterations. - " \n\t" - "movaps -8 * 16(%%rax), %%xmm0 \n\t" // initialize loop by pre-loading elements - "movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b. - "movaps -8 * 16(%%rbx), %%xmm2 \n\t" - " \n\t" - //"movq %6, %%rcx \n\t" // load address of c11 - //"movq %9, %%rdi \n\t" // load cs_c - //"leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double) - //"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // load address of c + 2*cs_c; - " \n\t" - //"prefetcht2 0 * 8(%%r9) \n\t" // prefetch b_next - " \n\t" - "xorpd %%xmm3, %%xmm3 \n\t" - "xorpd %%xmm4, %%xmm4 \n\t" - "xorpd %%xmm5, %%xmm5 \n\t" - "xorpd %%xmm6, %%xmm6 \n\t" - " \n\t" - //"prefetcht2 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "xorpd %%xmm8, %%xmm8 \n\t" - "movaps %%xmm8, %%xmm9 \n\t" - //"prefetcht2 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "movaps %%xmm8, %%xmm10 \n\t" - "movaps %%xmm8, %%xmm11 \n\t" - //"prefetcht2 3 * 8(%%rdx) \n\t" // prefetch c + 2*cs_c - "movaps %%xmm8, %%xmm12 \n\t" - "movaps %%xmm8, %%xmm13 \n\t" - //"prefetcht2 3 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 3*cs_c - "movaps %%xmm8, %%xmm14 \n\t" - "movaps %%xmm8, %%xmm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. 
- " \n\t" - " \n\t" - ".LOOPKITER: \n\t" // MAIN LOOP - " \n\t" - //"prefetcht0 1264(%%rax) \n\t" - "prefetcht0 (4*35+1) * 8(%%rax) \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 0 - "movaps -7 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -6 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -6 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -5 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 1 - "movaps -5 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -4 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -4 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -3 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - //"prefetcht0 1328(%%rax) \n\t" - "prefetcht0 (4*37+1) * 8(%%rax) \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 2 - "movaps -3 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -2 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -2 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -1 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 3 - "movaps -1 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "subq $-4 * 4 * 8, %%rax \n\t" // a += 4*4 (unroll x mr) - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - //"subq $-4 * 4 * 8, %%r9 \n\t" // b_next += 4*4 (unroll x nr) - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps 0 * 
16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "subq $-4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -8 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -7 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - //"prefetcht2 0 * 8(%%r9) \n\t" // prefetch b_next[0] - //"prefetcht2 8 * 8(%%r9) \n\t" // prefetch b_next[8] - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .LOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".CONSIDERKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .POSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".LOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 0 - "movaps -7 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -6 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -6 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -5 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - "subq $-4 * 1 * 8, %%rax \n\t" // a += 4 (1 x mr) - "subq $-4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .LOOPKLEFT \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".POSTACCUM: \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of b11. 
- " \n\t" - " \n\t" // xmm8: xmm9: xmm10: xmm11: - " \n\t" // ( ab01 ( ab00 ( ab03 ( ab02 - " \n\t" // ab10 ) ab11 ) ab12 ) ab13 ) - " \n\t" // - " \n\t" // xmm12: xmm13: xmm14: xmm15: - " \n\t" // ( ab21 ( ab20 ( ab23 ( ab22 - " \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) - "movaps %%xmm9, %%xmm0 \n\t" - "movaps %%xmm8, %%xmm1 \n\t" - "unpcklpd %%xmm8, %%xmm0 \n\t" - "unpckhpd %%xmm9, %%xmm1 \n\t" - " \n\t" - "movaps %%xmm11, %%xmm4 \n\t" - "movaps %%xmm10, %%xmm5 \n\t" - "unpcklpd %%xmm10, %%xmm4 \n\t" - "unpckhpd %%xmm11, %%xmm5 \n\t" - " \n\t" - "movaps %%xmm13, %%xmm2 \n\t" - "movaps %%xmm12, %%xmm3 \n\t" - "unpcklpd %%xmm12, %%xmm2 \n\t" - "unpckhpd %%xmm13, %%xmm3 \n\t" - " \n\t" - "movaps %%xmm15, %%xmm6 \n\t" - "movaps %%xmm14, %%xmm7 \n\t" - "unpcklpd %%xmm14, %%xmm6 \n\t" - "unpckhpd %%xmm15, %%xmm7 \n\t" - " \n\t" - " \n\t" // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) - " \n\t" // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) - " \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) - " \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) - " \n\t" - "movq %9, %%rax \n\t" // load address of alpha - "movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate - " \n\t" - "movaps 0 * 16(%%rbx), %%xmm8 \n\t" - "movaps 1 * 16(%%rbx), %%xmm12 \n\t" - "mulpd %%xmm15, %%xmm8 \n\t" // xmm8 = alpha * ( beta00 beta01 ) - "mulpd %%xmm15, %%xmm12 \n\t" // xmm12 = alpha * ( beta02 beta03 ) - "movaps 2 * 16(%%rbx), %%xmm9 \n\t" - "movaps 3 * 16(%%rbx), %%xmm13 \n\t" - "mulpd %%xmm15, %%xmm9 \n\t" // xmm9 = alpha * ( beta10 beta11 ) - "mulpd %%xmm15, %%xmm13 \n\t" // xmm13 = alpha * ( beta12 beta13 ) - "movaps 4 * 16(%%rbx), %%xmm10 \n\t" - "movaps 5 * 16(%%rbx), %%xmm14 \n\t" - "mulpd %%xmm15, %%xmm10 \n\t" // xmm10 = alpha * ( beta20 beta21 ) - "mulpd %%xmm15, %%xmm14 \n\t" // xmm14 = alpha * ( beta22 beta23 ) - "movaps 6 * 16(%%rbx), %%xmm11 \n\t" - "mulpd %%xmm15, %%xmm11 \n\t" // xmm11 = alpha * ( beta30 beta31 ) - "mulpd 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = alpha * ( beta32 beta33 ) - " \n\t" - " \n\t" // (Now scaled by alpha:) - " \n\t" // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) - " \n\t" // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) - " \n\t" // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) - " \n\t" // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) - " \n\t" - "subpd %%xmm0, %%xmm8 \n\t" // xmm8 -= xmm0 - "subpd %%xmm1, %%xmm9 \n\t" // xmm9 -= xmm1 - "subpd %%xmm2, %%xmm10 \n\t" // xmm10 -= xmm2 - "subpd %%xmm3, %%xmm11 \n\t" // xmm11 -= xmm3 - "subpd %%xmm4, %%xmm12 \n\t" // xmm12 -= xmm4 - "subpd %%xmm5, %%xmm13 \n\t" // xmm13 -= xmm5 - "subpd %%xmm6, %%xmm14 \n\t" // xmm14 -= xmm6 - "subpd %%xmm7, %%xmm15 \n\t" // xmm15 -= xmm7 - " \n\t" - " \n\t" - " \n\t" - ".TRSM: \n\t" - " \n\t" - " \n\t" - "movq %3, %%rax \n\t" // load address of a11 - "movq %6, %%rcx \n\t" // load address of c11 - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "movq %8, %%rdi \n\t" // load cs_c - "salq $3, %%rsi \n\t" // rs_c *= sizeof( double ) - "salq $3, %%rdi \n\t" // cs_c *= sizeof( double ) - " \n\t" - "leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 0 - " \n\t" - "movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00) - " \n\t" - "mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00); - "mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00); - " \n\t" - "movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8 - "movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12 - "movlpd %%xmm8, (%%rcx) \n\t" // store 
( gamma00 ) = xmm8[0] - "movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1] - "movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0] - "movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1] - "addq %%rsi, %%rcx \n\t" // c11 += rs_c - "addq %%rsi, %%rdx \n\t" // c11_2 += rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 1 - " \n\t" - "movddup (1+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha10 - "movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11) - " \n\t" - "movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 - "mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha10 * ( beta00 beta01 ) - "mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha10 * ( beta02 beta03 ) - "subpd %%xmm0, %%xmm9 \n\t" // xmm9 -= xmm0 - "subpd %%xmm4, %%xmm13 \n\t" // xmm13 -= xmm4 - "mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11); - "mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11); - " \n\t" - "movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9 - "movaps %%xmm13, 3 * 16(%%rbx) \n\t" // store ( beta12 beta13 ) = xmm13 - "movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0] - "movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1] - "movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0] - "movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1] - "addq %%rsi, %%rcx \n\t" // c11 += rs_c - "addq %%rsi, %%rdx \n\t" // c11_2 += rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 2 - " \n\t" - "movddup (2+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha20 - "movddup (2+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha21 - "movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22) - " \n\t" - "movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 - "movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 - "mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha20 * ( beta00 beta01 ) - "mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha20 * ( beta02 beta03 ) - "mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha21 * ( beta10 beta11 ) - "mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha21 * ( beta12 beta13 ) - "addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1; - "addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5; - "subpd %%xmm0, %%xmm10 \n\t" // xmm10 -= xmm0 - "subpd %%xmm4, %%xmm14 \n\t" // xmm14 -= xmm4 - "mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22); - "mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22); - " \n\t" - "movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10 - "movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14 - "movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0] - "movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1] - "movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0] - "movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1] - "addq %%rsi, %%rcx \n\t" // c11 += rs_c - "addq %%rsi, %%rdx \n\t" // c11_2 += rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 3 - " \n\t" - "movddup (3+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha30 - "movddup (3+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha31 - "movddup (3+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha32 - "movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33) - " \n\t" - "movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 - "movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 - "movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 - "mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha30 * ( beta00 beta01 ) - "mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha30 * ( beta02 beta03 ) - "mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha31 * ( beta10 
beta11 ) - "mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha31 * ( beta12 beta13 ) - "mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha32 * ( beta20 beta21 ) - "mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha32 * ( beta22 beta23 ) - "addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1; - "addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5; - "addpd %%xmm2, %%xmm0 \n\t" // xmm0 += xmm2; - "addpd %%xmm6, %%xmm4 \n\t" // xmm4 += xmm6; - "subpd %%xmm0, %%xmm11 \n\t" // xmm11 -= xmm0 - "subpd %%xmm4, %%xmm15 \n\t" // xmm15 -= xmm4 - "mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33); - "mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33); - " \n\t" - "movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11 - "movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15 - "movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0] - "movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1] - "movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0] - "movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = xmm15[1] - " \n\t" - " \n\t" - " \n\t" + + mov(%2, rax) // load address of a10. + mov(%4, rbx) // load address of b01. + //mov(%10, r9) // load address of b_next. + + sub(imm(0-8*16), rax) // increment pointers to allow byte + sub(imm(0-8*16), rbx) // offsets in the unrolled iterations. + + movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements + movaps(mem(rax, -7*16), xmm1) // of a and b. + movaps(mem(rbx, -8*16), xmm2) + + //mov(%6, rcx) // load address of c11 + //mov(%9, rdi) // load cs_c + //lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*cs_c; + + //prefetch(2, mem(r9, 0*8)) // prefetch b_next + + xorpd(xmm3, xmm3) + xorpd(xmm4, xmm4) + xorpd(xmm5, xmm5) + xorpd(xmm6, xmm6) + + //prefetch(2, mem(rcx, 3*8)) // prefetch c + 0*cs_c + xorpd(xmm8, xmm8) + movaps(xmm8, xmm9) + //prefetch(2, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c + movaps(xmm8, xmm10) + movaps(xmm8, xmm11) + //prefetch(2, mem(rdx, 3*8)) // prefetch c + 2*cs_c + movaps(xmm8, xmm12) + movaps(xmm8, xmm13) + //prefetch(2, mem(rdx, rdi, 1, 3*8)) // prefetch c + 3*cs_c + movaps(xmm8, xmm14) + movaps(xmm8, xmm15) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.CONSIDERKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
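+
+	// Editorial note (not part of the original kernel): the main loop
+	// below is software-pipelined. xmm3-xmm6 are zeroed above because
+	// each iteration begins by accumulating the products computed by the
+	// *previous* iteration before issuing new multiplies; the final set
+	// of products is flushed after both loops, at .POSTACCUM. pshufd
+	// with imm(0x4e) swaps the two doubles of a b-pair so that each pair
+	// of a elements meets both orderings of b, yielding the full 4x4
+	// outer product. A scalar model of one k-iteration (illustration
+	// only, not part of the kernel):
+	//
+	//   for ( int i = 0; i < 4; ++i )
+	//     for ( int j = 0; j < 4; ++j )
+	//       ab[ i ][ j ] += a[ i ] * b[ j ];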
+ + + label(.LOOPKITER) // MAIN LOOP + + //prefetch(0, mem(rax, 1264)) + prefetch(0, mem(4*35+1)*8(rax)) + + addpd(xmm3, xmm11) // iteration 0 + movaps(mem(rbx, -7*16), xmm3) + addpd(xmm4, xmm15) + movaps(xmm2, xmm4) + pshufd(imm(0x4e), xmm2, xmm7) + mulpd(xmm0, xmm2) + mulpd(xmm1, xmm4) + + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + movaps(xmm7, xmm6) + mulpd(xmm0, xmm7) + mulpd(xmm1, xmm6) + + addpd(xmm2, xmm9) + movaps(mem(rbx, -6*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -6*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -5*16), xmm1) + + + addpd(xmm3, xmm11) // iteration 1 + movaps(mem(rbx, -5*16), xmm3) + addpd(xmm4, xmm15) + movaps(xmm2, xmm4) + pshufd(imm(0x4e), xmm2, xmm7) + mulpd(xmm0, xmm2) + mulpd(xmm1, xmm4) + + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + movaps(xmm7, xmm6) + mulpd(xmm0, xmm7) + mulpd(xmm1, xmm6) + + addpd(xmm2, xmm9) + movaps(mem(rbx, -4*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -4*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -3*16), xmm1) + + //prefetch(0, mem(rax, 1328)) + prefetch(0, mem(4*37+1)*8(rax)) + + addpd(xmm3, xmm11) // iteration 2 + movaps(mem(rbx, -3*16), xmm3) + addpd(xmm4, xmm15) + movaps(xmm2, xmm4) + pshufd(imm(0x4e), xmm2, xmm7) + mulpd(xmm0, xmm2) + mulpd(xmm1, xmm4) + + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + movaps(xmm7, xmm6) + mulpd(xmm0, xmm7) + mulpd(xmm1, xmm6) + + addpd(xmm2, xmm9) + movaps(mem(rbx, -2*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -2*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -1*16), xmm1) + + + addpd(xmm3, xmm11) // iteration 3 + movaps(mem(rbx, -1*16), xmm3) + addpd(xmm4, xmm15) + movaps(xmm2, xmm4) + pshufd(imm(0x4e), xmm2, xmm7) + mulpd(xmm0, xmm2) + mulpd(xmm1, xmm4) + + sub(imm(0-4*4*8), rax) // a += 4*4 (unroll x mr) + + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + movaps(xmm7, xmm6) + mulpd(xmm0, xmm7) + mulpd(xmm1, xmm6) + + //sub(imm(-4*4*8), r9) // b_next += 4*4 (unroll x nr) + + addpd(xmm2, xmm9) + movaps(mem(rbx, 0*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + sub(imm(0-4*4*8), rbx) // b += 4*4 (unroll x nr) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -8*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -7*16), xmm1) + + //prefetch(2, mem(r9, 0*8)) // prefetch b_next[0] + //prefetch(2, mem(r9, 8*8)) // prefetch b_next[8] + + + dec(rsi) // i -= 1; + jne(.LOOPKITER) // iterate again if i != 0. + + + + label(.CONSIDERKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.POSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
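+
+	// Editorial note: the main loop above is unrolled 4x, so it executes
+	// k_iter (= k / 4 in the caller) times; the edge loop below performs
+	// the remaining k_left (= k % 4) rank-1 updates one at a time, which
+	// is why a and b advance by only 4*1*8 bytes per pass here.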
+ + + label(.LOOPKLEFT) // EDGE LOOP + + addpd(xmm3, xmm11) // iteration 0 + movaps(mem(rbx, -7*16), xmm3) + addpd(xmm4, xmm15) + movaps(xmm2, xmm4) + pshufd(imm(0x4e), xmm2, xmm7) + mulpd(xmm0, xmm2) + mulpd(xmm1, xmm4) + + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + movaps(xmm7, xmm6) + mulpd(xmm0, xmm7) + mulpd(xmm1, xmm6) + + addpd(xmm2, xmm9) + movaps(mem(rbx, -6*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -6*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -5*16), xmm1) + + + sub(imm(0-4*1*8), rax) // a += 4 (1 x mr) + sub(imm(0-4*1*8), rbx) // b += 4 (1 x nr) + + + dec(rsi) // i -= 1; + jne(.LOOPKLEFT) // iterate again if i != 0. + + + + label(.POSTACCUM) + + addpd(xmm3, xmm11) + addpd(xmm4, xmm15) + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + + + + mov(%5, rbx) // load address of b11. + + // xmm8: xmm9: xmm10: xmm11: + // ( ab01 ( ab00 ( ab03 ( ab02 + // ab10 ) ab11 ) ab12 ) ab13 ) + // + // xmm12: xmm13: xmm14: xmm15: + // ( ab21 ( ab20 ( ab23 ( ab22 + // ab30 ) ab31 ) ab32 ) ab33 ) + movaps(xmm9, xmm0) + movaps(xmm8, xmm1) + unpcklpd(xmm8, xmm0) + unpckhpd(xmm9, xmm1) + + movaps(xmm11, xmm4) + movaps(xmm10, xmm5) + unpcklpd(xmm10, xmm4) + unpckhpd(xmm11, xmm5) + + movaps(xmm13, xmm2) + movaps(xmm12, xmm3) + unpcklpd(xmm12, xmm2) + unpckhpd(xmm13, xmm3) + + movaps(xmm15, xmm6) + movaps(xmm14, xmm7) + unpcklpd(xmm14, xmm6) + unpckhpd(xmm15, xmm7) + + // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) + // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) + // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) + // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) + + mov(%9, rax) // load address of alpha + movddup(mem(rax), xmm15) // load alpha and duplicate + + movaps(mem(rbx, 0*16), xmm8) + movaps(mem(rbx, 1*16), xmm12) + mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 ) + mulpd(xmm15, xmm12) // xmm12 = alpha * ( beta02 beta03 ) + movaps(mem(rbx, 2*16), xmm9) + movaps(mem(rbx, 3*16), xmm13) + mulpd(xmm15, xmm9) // xmm9 = alpha * ( beta10 beta11 ) + mulpd(xmm15, xmm13) // xmm13 = alpha * ( beta12 beta13 ) + movaps(mem(rbx, 4*16), xmm10) + movaps(mem(rbx, 5*16), xmm14) + mulpd(xmm15, xmm10) // xmm10 = alpha * ( beta20 beta21 ) + mulpd(xmm15, xmm14) // xmm14 = alpha * ( beta22 beta23 ) + movaps(mem(rbx, 6*16), xmm11) + mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 ) + mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 ) + + // (Now scaled by alpha:) + // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) + // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) + // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) + // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) + + subpd(xmm0, xmm8) // xmm8 -= xmm0 + subpd(xmm1, xmm9) // xmm9 -= xmm1 + subpd(xmm2, xmm10) // xmm10 -= xmm2 + subpd(xmm3, xmm11) // xmm11 -= xmm3 + subpd(xmm4, xmm12) // xmm12 -= xmm4 + subpd(xmm5, xmm13) // xmm13 -= xmm5 + subpd(xmm6, xmm14) // xmm14 -= xmm6 + subpd(xmm7, xmm15) // xmm15 -= xmm7 + + + + label(.TRSM) + + + mov(%3, rax) // load address of a11 + mov(%6, rcx) // load address of c11 + + mov(%7, rsi) // load rs_c + mov(%8, rdi) // load cs_c + sal(imm(3), rsi) // rs_c *= sizeof( double ) + sal(imm(3), rdi) // cs_c *= sizeof( double ) + + lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c + + + + // iteration 0 + + movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) + + mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); + mulpd(xmm0, xmm12) // 
xmm12 *= (1/alpha00); + + movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 + movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 + movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] + movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1] + movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0] + movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] + add(rsi, rcx) // c11 += rs_c + add(rsi, rdx) // c11_2 += rs_c + + + + // iteration 1 + + movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10 + movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) + + movaps(xmm0, xmm4) // xmm4 = xmm0 + mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 ) + mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 ) + subpd(xmm0, xmm9) // xmm9 -= xmm0 + subpd(xmm4, xmm13) // xmm13 -= xmm4 + mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11); + mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11); + + movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 + movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 + movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] + movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1] + movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0] + movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] + add(rsi, rcx) // c11 += rs_c + add(rsi, rdx) // c11_2 += rs_c + + + + // iteration 2 + + movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20 + movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21 + movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) + + movaps(xmm0, xmm4) // xmm4 = xmm0 + movaps(xmm1, xmm5) // xmm5 = xmm1 + mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 ) + mulpd(xmm12, xmm4) // xmm4 = alpha20 * ( beta02 beta03 ) + mulpd(xmm9, xmm1) // xmm1 = alpha21 * ( beta10 beta11 ) + mulpd(xmm13, xmm5) // xmm5 = alpha21 * ( beta12 beta13 ) + addpd(xmm1, xmm0) // xmm0 += xmm1; + addpd(xmm5, xmm4) // xmm4 += xmm5; + subpd(xmm0, xmm10) // xmm10 -= xmm0 + subpd(xmm4, xmm14) // xmm14 -= xmm4 + mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22); + mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22); + + movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 + movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 + movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] + movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1] + movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0] + movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] + add(rsi, rcx) // c11 += rs_c + add(rsi, rdx) // c11_2 += rs_c + + + + // iteration 3 + + movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30 + movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31 + movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32 + movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) + + movaps(xmm0, xmm4) // xmm4 = xmm0 + movaps(xmm1, xmm5) // xmm5 = xmm1 + movaps(xmm2, xmm6) // xmm6 = xmm2 + mulpd(xmm8, xmm0) // xmm0 = alpha30 * ( beta00 beta01 ) + mulpd(xmm12, xmm4) // xmm4 = alpha30 * ( beta02 beta03 ) + mulpd(xmm9, xmm1) // xmm1 = alpha31 * ( beta10 beta11 ) + mulpd(xmm13, xmm5) // xmm5 = alpha31 * ( beta12 beta13 ) + mulpd(xmm10, xmm2) // xmm2 = alpha32 * ( beta20 beta21 ) + mulpd(xmm14, xmm6) // xmm6 = alpha32 * ( beta22 beta23 ) + addpd(xmm1, xmm0) // xmm0 += xmm1; + addpd(xmm5, xmm4) // xmm4 += xmm5; + addpd(xmm2, xmm0) // xmm0 += xmm2; + addpd(xmm6, xmm4) // xmm4 += xmm6; + subpd(xmm0, xmm11) // xmm11 -= xmm0 + subpd(xmm4, xmm15) // xmm15 -= xmm4 + mulpd(xmm3, xmm11) // xmm11 *= 
(1/alpha33); + mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); + + movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 + movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 + movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] + movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1] + movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0] + movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] + + + : // output operands (none) : // input operands @@ -540,3 +543,4 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 } + diff --git a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c index 5541829c5..c0b94269a 100644 --- a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c @@ -34,6 +34,9 @@ #include "blis.h" +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + #if 0 void bli_sgemmtrsm_u_penryn_asm_8x4 ( @@ -75,432 +78,432 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 __asm__ volatile ( - " \n\t" - "movq %2, %%rax \n\t" // load address of a12. - "movq %4, %%rbx \n\t" // load address of b21. - //"movq %10, %%r9 \n\t" // load address of b_next. - " \n\t" - "addq $8 * 16, %%rax \n\t" // increment pointers to allow byte - "addq $8 * 16, %%rbx \n\t" // offsets in the unrolled iterations. - " \n\t" - "movaps -8 * 16(%%rax), %%xmm0 \n\t" // initialize loop by pre-loading elements - "movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b. - "movaps -8 * 16(%%rbx), %%xmm2 \n\t" - " \n\t" - "xorpd %%xmm3, %%xmm3 \n\t" - "xorpd %%xmm4, %%xmm4 \n\t" - "xorpd %%xmm5, %%xmm5 \n\t" - "xorpd %%xmm6, %%xmm6 \n\t" - " \n\t" - "xorpd %%xmm8, %%xmm8 \n\t" - "movaps %%xmm8, %%xmm9 \n\t" - "movaps %%xmm8, %%xmm10 \n\t" - "movaps %%xmm8, %%xmm11 \n\t" - "movaps %%xmm8, %%xmm12 \n\t" - "movaps %%xmm8, %%xmm13 \n\t" - "movaps %%xmm8, %%xmm14 \n\t" - "movaps %%xmm8, %%xmm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. 
- " \n\t" - " \n\t" - ".LOOPKITER: \n\t" // MAIN LOOP - " \n\t" - "prefetcht0 1264(%%rax) \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 0 - "movaps -7 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -6 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -6 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -5 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 1 - "movaps -5 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -4 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -4 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -3 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - "prefetcht0 1328(%%rax) \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 2 - "movaps -3 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -2 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -2 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -1 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 3 - "movaps -1 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addq $4 * 4 * 8, %%rax \n\t" // a += 4*4 (unroll x mr) - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps 0 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd 
%%xmm1, %%xmm4 \n\t" - " \n\t" - "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -8 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -7 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .LOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".CONSIDERKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .POSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".LOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" // iteration 0 - "movaps -7 * 16(%%rbx), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "movaps %%xmm2, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm2, %%xmm7 \n\t" - "mulpd %%xmm0, %%xmm2 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - "movaps %%xmm7, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm7 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - " \n\t" - "addpd %%xmm2, %%xmm9 \n\t" - "movaps -6 * 16(%%rbx), %%xmm2 \n\t" - "addpd %%xmm4, %%xmm13 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "pshufd $0x4e, %%xmm3, %%xmm5 \n\t" - "mulpd %%xmm0, %%xmm3 \n\t" - "mulpd %%xmm1, %%xmm4 \n\t" - " \n\t" - "addpd %%xmm7, %%xmm8 \n\t" - "addpd %%xmm6, %%xmm12 \n\t" - "movaps %%xmm5, %%xmm6 \n\t" - "mulpd %%xmm0, %%xmm5 \n\t" - "movaps -6 * 16(%%rax), %%xmm0 \n\t" - "mulpd %%xmm1, %%xmm6 \n\t" - "movaps -5 * 16(%%rax), %%xmm1 \n\t" - " \n\t" - " \n\t" - "addq $4 * 1 * 8, %%rax \n\t" // a += 4 (1 x mr) - "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .LOOPKLEFT \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".POSTACCUM: \n\t" - " \n\t" - "addpd %%xmm3, %%xmm11 \n\t" - "addpd %%xmm4, %%xmm15 \n\t" - "addpd %%xmm5, %%xmm10 \n\t" - "addpd %%xmm6, %%xmm14 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of b11. 
- " \n\t" - " \n\t" // xmm8: xmm9: xmm10: xmm11: - " \n\t" // ( ab01 ( ab00 ( ab03 ( ab02 - " \n\t" // ab10 ) ab11 ) ab12 ) ab13 ) - " \n\t" // - " \n\t" // xmm12: xmm13: xmm14: xmm15: - " \n\t" // ( ab21 ( ab20 ( ab23 ( ab22 - " \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) - "movaps %%xmm9, %%xmm0 \n\t" - "movaps %%xmm8, %%xmm1 \n\t" - "unpcklpd %%xmm8, %%xmm0 \n\t" - "unpckhpd %%xmm9, %%xmm1 \n\t" - " \n\t" - "movaps %%xmm11, %%xmm4 \n\t" - "movaps %%xmm10, %%xmm5 \n\t" - "unpcklpd %%xmm10, %%xmm4 \n\t" - "unpckhpd %%xmm11, %%xmm5 \n\t" - " \n\t" - "movaps %%xmm13, %%xmm2 \n\t" - "movaps %%xmm12, %%xmm3 \n\t" - "unpcklpd %%xmm12, %%xmm2 \n\t" - "unpckhpd %%xmm13, %%xmm3 \n\t" - " \n\t" - "movaps %%xmm15, %%xmm6 \n\t" - "movaps %%xmm14, %%xmm7 \n\t" - "unpcklpd %%xmm14, %%xmm6 \n\t" - "unpckhpd %%xmm15, %%xmm7 \n\t" - " \n\t" - " \n\t" // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) - " \n\t" // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) - " \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) - " \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) - " \n\t" - "movq %9, %%rax \n\t" // load address of alpha - "movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate - " \n\t" - "movaps 0 * 16(%%rbx), %%xmm8 \n\t" - "movaps 1 * 16(%%rbx), %%xmm12 \n\t" - "mulpd %%xmm15, %%xmm8 \n\t" // xmm8 = alpha * ( beta00 beta01 ) - "mulpd %%xmm15, %%xmm12 \n\t" // xmm12 = alpha * ( beta02 beta03 ) - "movaps 2 * 16(%%rbx), %%xmm9 \n\t" - "movaps 3 * 16(%%rbx), %%xmm13 \n\t" - "mulpd %%xmm15, %%xmm9 \n\t" // xmm9 = alpha * ( beta10 beta11 ) - "mulpd %%xmm15, %%xmm13 \n\t" // xmm13 = alpha * ( beta12 beta13 ) - "movaps 4 * 16(%%rbx), %%xmm10 \n\t" - "movaps 5 * 16(%%rbx), %%xmm14 \n\t" - "mulpd %%xmm15, %%xmm10 \n\t" // xmm10 = alpha * ( beta20 beta21 ) - "mulpd %%xmm15, %%xmm14 \n\t" // xmm14 = alpha * ( beta22 beta23 ) - "movaps 6 * 16(%%rbx), %%xmm11 \n\t" - "mulpd %%xmm15, %%xmm11 \n\t" // xmm11 = alpha * ( beta30 beta31 ) - "mulpd 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = alpha * ( beta32 beta33 ) - " \n\t" - " \n\t" // (Now scaled by alpha:) - " \n\t" // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) - " \n\t" // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) - " \n\t" // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) - " \n\t" // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) - " \n\t" - "subpd %%xmm0, %%xmm8 \n\t" // xmm8 -= xmm0 - "subpd %%xmm1, %%xmm9 \n\t" // xmm9 -= xmm1 - "subpd %%xmm2, %%xmm10 \n\t" // xmm10 -= xmm2 - "subpd %%xmm3, %%xmm11 \n\t" // xmm11 -= xmm3 - "subpd %%xmm4, %%xmm12 \n\t" // xmm12 -= xmm4 - "subpd %%xmm5, %%xmm13 \n\t" // xmm13 -= xmm5 - "subpd %%xmm6, %%xmm14 \n\t" // xmm14 -= xmm6 - "subpd %%xmm7, %%xmm15 \n\t" // xmm15 -= xmm7 - " \n\t" - " \n\t" - " \n\t" - ".TRSM: \n\t" - " \n\t" - " \n\t" - "movq %3, %%rax \n\t" // load address of a11 - "movq %6, %%rcx \n\t" // load address of c11 - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "movq %8, %%rdi \n\t" // load cs_c - "salq $3, %%rsi \n\t" // rs_c *= sizeof( double ) - "salq $3, %%rdi \n\t" // cs_c *= sizeof( double ) - " \n\t" - "addq %%rsi, %%rcx \n\t" // c11 += (4-1)*rs_c - "addq %%rsi, %%rcx \n\t" - "addq %%rsi, %%rcx \n\t" - "leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 0 - " \n\t" - "movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33) - " \n\t" - "mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33); - "mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33); - " \n\t" - "movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11 - 
"movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15 - "movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0] - "movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1] - "movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0] - "movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = xmm15[1] - "subq %%rsi, %%rcx \n\t" // c11 -= rs_c - "subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 1 - " \n\t" - "movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22) - "movddup (2+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha23 - " \n\t" - "movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 - "mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha23 * ( beta30 beta31 ) - "mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha23 * ( beta32 beta33 ) - "subpd %%xmm3, %%xmm10 \n\t" // xmm10 -= xmm3 - "subpd %%xmm7, %%xmm14 \n\t" // xmm14 -= xmm7 - "mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22); - "mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22); - " \n\t" - "movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10 - "movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14 - "movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0] - "movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1] - "movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0] - "movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1] - "subq %%rsi, %%rcx \n\t" // c11 -= rs_c - "subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 2 - " \n\t" - "movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11) - "movddup (1+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha12 - "movddup (1+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha13 - " \n\t" - "movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 - "movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 - "mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha12 * ( beta20 beta21 ) - "mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha12 * ( beta22 beta23 ) - "mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha13 * ( beta30 beta31 ) - "mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha13 * ( beta32 beta33 ) - "addpd %%xmm3, %%xmm2 \n\t" // xmm2 += xmm3; - "addpd %%xmm7, %%xmm6 \n\t" // xmm6 += xmm7; - "subpd %%xmm2, %%xmm9 \n\t" // xmm9 -= xmm2 - "subpd %%xmm6, %%xmm13 \n\t" // xmm13 -= xmm6 - "mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11); - "mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11); - " \n\t" - "movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9 - "movaps %%xmm13, 3 * 16(%%rbx) \n\t" // store ( beta12 beta13 ) = xmm13 - "movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0] - "movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1] - "movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0] - "movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1] - "subq %%rsi, %%rcx \n\t" // c11 -= rs_c - "subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 3 - " \n\t" - "movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00) - "movddup (0+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha01 - "movddup (0+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha02 - "movddup (0+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha03 - " \n\t" - "movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 - "movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 - "movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 - "mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha01 * ( beta10 beta11 ) - "mulpd 
%%xmm13, %%xmm5 \n\t" // xmm5 = alpha01 * ( beta12 beta13 ) - "mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha02 * ( beta20 beta21 ) - "mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha02 * ( beta22 beta23 ) - "mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha03 * ( beta30 beta31 ) - "mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha03 * ( beta32 beta33 ) - "addpd %%xmm2, %%xmm1 \n\t" // xmm1 += xmm2; - "addpd %%xmm6, %%xmm5 \n\t" // xmm5 += xmm6; - "addpd %%xmm3, %%xmm1 \n\t" // xmm1 += xmm3; - "addpd %%xmm7, %%xmm5 \n\t" // xmm5 += xmm7; - "subpd %%xmm1, %%xmm8 \n\t" // xmm8 -= xmm1 - "subpd %%xmm5, %%xmm12 \n\t" // xmm12 -= xmm5 - "mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00); - "mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00); - " \n\t" - "movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8 - "movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12 - "movlpd %%xmm8, (%%rcx) \n\t" // store ( gamma00 ) = xmm8[0] - "movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1] - "movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0] - "movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1] - " \n\t" - " \n\t" - " \n\t" + + mov(%2, rax) // load address of a12. + mov(%4, rbx) // load address of b21. + //mov(%10, r9) // load address of b_next. + + add(imm(8*16), rax) // increment pointers to allow byte + add(imm(8*16), rbx) // offsets in the unrolled iterations. + + movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements + movaps(mem(rax, -7*16), xmm1) // of a and b. + movaps(mem(rbx, -8*16), xmm2) + + xorpd(xmm3, xmm3) + xorpd(xmm4, xmm4) + xorpd(xmm5, xmm5) + xorpd(xmm6, xmm6) + + xorpd(xmm8, xmm8) + movaps(xmm8, xmm9) + movaps(xmm8, xmm10) + movaps(xmm8, xmm11) + movaps(xmm8, xmm12) + movaps(xmm8, xmm13) + movaps(xmm8, xmm14) + movaps(xmm8, xmm15) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.CONSIDERKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
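+
+	// Editorial note: this kernel advances its pointers with plain
+	// add(imm(8*16), ...) / add(imm(4*4*8), ...) rather than the
+	// sub(imm(0-...)) idiom used in the lower-triangular kernel. Both
+	// move the pointer forward by the same 128 bytes; the
+	// subtract-a-negative form merely lets -128 be encoded as a
+	// sign-extended 8-bit immediate, whereas +128 requires a full 32-bit
+	// immediate, so the two differ only in code size, not in behavior.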
+ + + label(.LOOPKITER) // MAIN LOOP + + prefetch(0, mem(rax, 1264)) + + addpd(xmm3, xmm11) // iteration 0 + movaps(mem(rbx, -7*16), xmm3) + addpd(xmm4, xmm15) + movaps(xmm2, xmm4) + pshufd(imm(0x4e), xmm2, xmm7) + mulpd(xmm0, xmm2) + mulpd(xmm1, xmm4) + + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + movaps(xmm7, xmm6) + mulpd(xmm0, xmm7) + mulpd(xmm1, xmm6) + + addpd(xmm2, xmm9) + movaps(mem(rbx, -6*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -6*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -5*16), xmm1) + + + addpd(xmm3, xmm11) // iteration 1 + movaps(mem(rbx, -5*16), xmm3) + addpd(xmm4, xmm15) + movaps(xmm2, xmm4) + pshufd(imm(0x4e), xmm2, xmm7) + mulpd(xmm0, xmm2) + mulpd(xmm1, xmm4) + + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + movaps(xmm7, xmm6) + mulpd(xmm0, xmm7) + mulpd(xmm1, xmm6) + + addpd(xmm2, xmm9) + movaps(mem(rbx, -4*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -4*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -3*16), xmm1) + + prefetch(0, mem(rax, 1328)) + + addpd(xmm3, xmm11) // iteration 2 + movaps(mem(rbx, -3*16), xmm3) + addpd(xmm4, xmm15) + movaps(xmm2, xmm4) + pshufd(imm(0x4e), xmm2, xmm7) + mulpd(xmm0, xmm2) + mulpd(xmm1, xmm4) + + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + movaps(xmm7, xmm6) + mulpd(xmm0, xmm7) + mulpd(xmm1, xmm6) + + addpd(xmm2, xmm9) + movaps(mem(rbx, -2*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -2*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -1*16), xmm1) + + + addpd(xmm3, xmm11) // iteration 3 + movaps(mem(rbx, -1*16), xmm3) + addpd(xmm4, xmm15) + movaps(xmm2, xmm4) + pshufd(imm(0x4e), xmm2, xmm7) + mulpd(xmm0, xmm2) + mulpd(xmm1, xmm4) + + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + movaps(xmm7, xmm6) + mulpd(xmm0, xmm7) + mulpd(xmm1, xmm6) + + add(imm(4*4*8), rax) // a += 4*4 (unroll x mr) + + addpd(xmm2, xmm9) + movaps(mem(rbx, 0*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -8*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -7*16), xmm1) + + + + dec(rsi) // i -= 1; + jne(.LOOPKITER) // iterate again if i != 0. + + + + label(.CONSIDERKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.POSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
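+
+	// Editorial note: as in the lower-triangular kernel, the edge loop
+	// below performs the k_left leftover rank-1 updates. The substantive
+	// difference appears after .POSTACCUM: because a11 is upper
+	// triangular, the back-substitution in .TRSM runs bottom-up, so c11
+	// is first advanced by (4-1)*rs_c and then walked backward with
+	// sub(rsi, rcx) / sub(rsi, rdx) after each iteration.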
+ + + label(.LOOPKLEFT) // EDGE LOOP + + addpd(xmm3, xmm11) // iteration 0 + movaps(mem(rbx, -7*16), xmm3) + addpd(xmm4, xmm15) + movaps(xmm2, xmm4) + pshufd(imm(0x4e), xmm2, xmm7) + mulpd(xmm0, xmm2) + mulpd(xmm1, xmm4) + + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + movaps(xmm7, xmm6) + mulpd(xmm0, xmm7) + mulpd(xmm1, xmm6) + + addpd(xmm2, xmm9) + movaps(mem(rbx, -6*16), xmm2) + addpd(xmm4, xmm13) + movaps(xmm3, xmm4) + pshufd(imm(0x4e), xmm3, xmm5) + mulpd(xmm0, xmm3) + mulpd(xmm1, xmm4) + + addpd(xmm7, xmm8) + addpd(xmm6, xmm12) + movaps(xmm5, xmm6) + mulpd(xmm0, xmm5) + movaps(mem(rax, -6*16), xmm0) + mulpd(xmm1, xmm6) + movaps(mem(rax, -5*16), xmm1) + + + add(imm(4*1*8), rax) // a += 4 (1 x mr) + add(imm(4*1*8), rbx) // b += 4 (1 x nr) + + + dec(rsi) // i -= 1; + jne(.LOOPKLEFT) // iterate again if i != 0. + + + + label(.POSTACCUM) + + addpd(xmm3, xmm11) + addpd(xmm4, xmm15) + addpd(xmm5, xmm10) + addpd(xmm6, xmm14) + + + + mov(%5, rbx) // load address of b11. + + // xmm8: xmm9: xmm10: xmm11: + // ( ab01 ( ab00 ( ab03 ( ab02 + // ab10 ) ab11 ) ab12 ) ab13 ) + // + // xmm12: xmm13: xmm14: xmm15: + // ( ab21 ( ab20 ( ab23 ( ab22 + // ab30 ) ab31 ) ab32 ) ab33 ) + movaps(xmm9, xmm0) + movaps(xmm8, xmm1) + unpcklpd(xmm8, xmm0) + unpckhpd(xmm9, xmm1) + + movaps(xmm11, xmm4) + movaps(xmm10, xmm5) + unpcklpd(xmm10, xmm4) + unpckhpd(xmm11, xmm5) + + movaps(xmm13, xmm2) + movaps(xmm12, xmm3) + unpcklpd(xmm12, xmm2) + unpckhpd(xmm13, xmm3) + + movaps(xmm15, xmm6) + movaps(xmm14, xmm7) + unpcklpd(xmm14, xmm6) + unpckhpd(xmm15, xmm7) + + // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) + // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) + // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) + // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) + + mov(%9, rax) // load address of alpha + movddup(mem(rax), xmm15) // load alpha and duplicate + + movaps(mem(rbx, 0*16), xmm8) + movaps(mem(rbx, 1*16), xmm12) + mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 ) + mulpd(xmm15, xmm12) // xmm12 = alpha * ( beta02 beta03 ) + movaps(mem(rbx, 2*16), xmm9) + movaps(mem(rbx, 3*16), xmm13) + mulpd(xmm15, xmm9) // xmm9 = alpha * ( beta10 beta11 ) + mulpd(xmm15, xmm13) // xmm13 = alpha * ( beta12 beta13 ) + movaps(mem(rbx, 4*16), xmm10) + movaps(mem(rbx, 5*16), xmm14) + mulpd(xmm15, xmm10) // xmm10 = alpha * ( beta20 beta21 ) + mulpd(xmm15, xmm14) // xmm14 = alpha * ( beta22 beta23 ) + movaps(mem(rbx, 6*16), xmm11) + mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 ) + mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 ) + + // (Now scaled by alpha:) + // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) + // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) + // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) + // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) + + subpd(xmm0, xmm8) // xmm8 -= xmm0 + subpd(xmm1, xmm9) // xmm9 -= xmm1 + subpd(xmm2, xmm10) // xmm10 -= xmm2 + subpd(xmm3, xmm11) // xmm11 -= xmm3 + subpd(xmm4, xmm12) // xmm12 -= xmm4 + subpd(xmm5, xmm13) // xmm13 -= xmm5 + subpd(xmm6, xmm14) // xmm14 -= xmm6 + subpd(xmm7, xmm15) // xmm15 -= xmm7 + + + + label(.TRSM) + + + mov(%3, rax) // load address of a11 + mov(%6, rcx) // load address of c11 + + mov(%7, rsi) // load rs_c + mov(%8, rdi) // load cs_c + sal(imm(3), rsi) // rs_c *= sizeof( double ) + sal(imm(3), rdi) // cs_c *= sizeof( double ) + + add(rsi, rcx) // c11 += (4-1)*rs_c + add(rsi, rcx) + add(rsi, rcx) + lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c; + + + + // iteration 0 + + movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) + + 
mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33); + mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); + + movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 + movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 + movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] + movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1] + movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0] + movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] + sub(rsi, rcx) // c11 -= rs_c + sub(rsi, rdx) // c11_2 -= rs_c + + + + // iteration 1 + + movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) + movddup(mem(2+3*4)*8(rax), xmm3) // load xmm3 = alpha23 + + movaps(xmm3, xmm7) // xmm7 = xmm3 + mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 ) + mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 ) + subpd(xmm3, xmm10) // xmm10 -= xmm3 + subpd(xmm7, xmm14) // xmm14 -= xmm7 + mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22); + mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22); + + movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 + movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 + movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] + movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1] + movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0] + movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] + sub(rsi, rcx) // c11 -= rs_c + sub(rsi, rdx) // c11_2 -= rs_c + + + + // iteration 2 + + movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) + movddup(mem(1+2*4)*8(rax), xmm2) // load xmm2 = alpha12 + movddup(mem(1+3*4)*8(rax), xmm3) // load xmm3 = alpha13 + + movaps(xmm2, xmm6) // xmm6 = xmm2 + movaps(xmm3, xmm7) // xmm7 = xmm3 + mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 ) + mulpd(xmm14, xmm6) // xmm6 = alpha12 * ( beta22 beta23 ) + mulpd(xmm11, xmm3) // xmm3 = alpha13 * ( beta30 beta31 ) + mulpd(xmm15, xmm7) // xmm7 = alpha13 * ( beta32 beta33 ) + addpd(xmm3, xmm2) // xmm2 += xmm3; + addpd(xmm7, xmm6) // xmm6 += xmm7; + subpd(xmm2, xmm9) // xmm9 -= xmm2 + subpd(xmm6, xmm13) // xmm13 -= xmm6 + mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11); + mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11); + + movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 + movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 + movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] + movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1] + movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0] + movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] + sub(rsi, rcx) // c11 -= rs_c + sub(rsi, rdx) // c11_2 -= rs_c + + + + // iteration 3 + + movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) + movddup(mem(0+1*4)*8(rax), xmm1) // load xmm1 = alpha01 + movddup(mem(0+2*4)*8(rax), xmm2) // load xmm2 = alpha02 + movddup(mem(0+3*4)*8(rax), xmm3) // load xmm3 = alpha03 + + movaps(xmm1, xmm5) // xmm5 = xmm1 + movaps(xmm2, xmm6) // xmm6 = xmm2 + movaps(xmm3, xmm7) // xmm7 = xmm3 + mulpd(xmm9, xmm1) // xmm1 = alpha01 * ( beta10 beta11 ) + mulpd(xmm13, xmm5) // xmm5 = alpha01 * ( beta12 beta13 ) + mulpd(xmm10, xmm2) // xmm2 = alpha02 * ( beta20 beta21 ) + mulpd(xmm14, xmm6) // xmm6 = alpha02 * ( beta22 beta23 ) + mulpd(xmm11, xmm3) // xmm3 = alpha03 * ( beta30 beta31 ) + mulpd(xmm15, xmm7) // xmm7 = alpha03 * ( beta32 beta33 ) + addpd(xmm2, xmm1) // xmm1 += xmm2; + addpd(xmm6, xmm5) // xmm5 += xmm6; + addpd(xmm3, xmm1) // xmm1 += xmm3; + addpd(xmm7, xmm5) // xmm5 += xmm7; + subpd(xmm1, xmm8) // xmm8 -= 
xmm1 + subpd(xmm5, xmm12) // xmm12 -= xmm5 + mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); + mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00); + + movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 + movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 + movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] + movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1] + movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0] + movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] + + + : // output operands (none) : // input operands @@ -526,3 +529,4 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 } + diff --git a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c index cc6b70808..ab8c846bb 100644 --- a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c @@ -34,6 +34,9 @@ #include "blis.h" +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + #if 0 void bli_strsm_l_penryn_asm_8x4 ( @@ -63,138 +66,138 @@ void bli_dtrsm_l_penryn_asm_4x4 __asm__ volatile ( - " \n\t" - "movq %1, %%rbx \n\t" // load address of b11. - " \n\t" - "movaps 0 * 16(%%rbx), %%xmm8 \n\t" // xmm8 = ( beta00 beta01 ) - "movaps 1 * 16(%%rbx), %%xmm12 \n\t" // xmm9 = ( beta02 beta03 ) - "movaps 2 * 16(%%rbx), %%xmm9 \n\t" // xmm10 = ( beta10 beta11 ) - "movaps 3 * 16(%%rbx), %%xmm13 \n\t" // xmm11 = ( beta12 beta13 ) - "movaps 4 * 16(%%rbx), %%xmm10 \n\t" // xmm12 = ( beta20 beta21 ) - "movaps 5 * 16(%%rbx), %%xmm14 \n\t" // xmm13 = ( beta22 beta23 ) - "movaps 6 * 16(%%rbx), %%xmm11 \n\t" // xmm14 = ( beta30 beta31 ) - "movaps 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = ( beta32 beta33 ) - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rax \n\t" // load address of a11 - "movq %2, %%rcx \n\t" // load address of c11 - " \n\t" - "movq %3, %%rsi \n\t" // load rs_c - "movq %4, %%rdi \n\t" // load cs_c - "salq $3, %%rsi \n\t" // rs_c *= sizeof( double ) - "salq $3, %%rdi \n\t" // cs_c *= sizeof( double ) - " \n\t" - "leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 0 - " \n\t" - "movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00) - " \n\t" - "mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00); - "mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00); - " \n\t" - "movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8 - "movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12 - "movlpd %%xmm8, (%%rcx) \n\t" // store ( gamma00 ) = xmm8[0] - "movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1] - "movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0] - "movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1] - "addq %%rsi, %%rcx \n\t" // c11 += rs_c - "addq %%rsi, %%rdx \n\t" // c11_2 += rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 1 - " \n\t" - "movddup (1+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha10 - "movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11) - " \n\t" - "movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 - "mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha10 * ( beta00 beta01 ) - "mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha10 * ( beta02 beta03 ) - "subpd %%xmm0, %%xmm9 \n\t" // xmm9 -= xmm0 - "subpd %%xmm4, %%xmm13 \n\t" // xmm13 -= xmm4 - "mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11); - "mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11); - " \n\t" - "movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9 - "movaps %%xmm13, 3 * 16(%%rbx) \n\t" 
// store ( beta12 beta13 ) = xmm13 - "movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0] - "movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1] - "movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0] - "movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1] - "addq %%rsi, %%rcx \n\t" // c11 += rs_c - "addq %%rsi, %%rdx \n\t" // c11_2 += rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 2 - " \n\t" - "movddup (2+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha20 - "movddup (2+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha21 - "movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22) - " \n\t" - "movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 - "movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 - "mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha20 * ( beta00 beta01 ) - "mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha20 * ( beta02 beta03 ) - "mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha21 * ( beta10 beta11 ) - "mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha21 * ( beta12 beta13 ) - "addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1; - "addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5; - "subpd %%xmm0, %%xmm10 \n\t" // xmm10 -= xmm0 - "subpd %%xmm4, %%xmm14 \n\t" // xmm14 -= xmm4 - "mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22); - "mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22); - " \n\t" - "movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10 - "movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14 - "movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0] - "movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1] - "movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0] - "movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1] - "addq %%rsi, %%rcx \n\t" // c11 += rs_c - "addq %%rsi, %%rdx \n\t" // c11_2 += rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 3 - " \n\t" - "movddup (3+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha30 - "movddup (3+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha31 - "movddup (3+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha32 - "movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33) - " \n\t" - "movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 - "movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 - "movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 - "mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha30 * ( beta00 beta01 ) - "mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha30 * ( beta02 beta03 ) - "mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha31 * ( beta10 beta11 ) - "mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha31 * ( beta12 beta13 ) - "mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha32 * ( beta20 beta21 ) - "mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha32 * ( beta22 beta23 ) - "addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1; - "addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5; - "addpd %%xmm2, %%xmm0 \n\t" // xmm0 += xmm2; - "addpd %%xmm6, %%xmm4 \n\t" // xmm4 += xmm6; - "subpd %%xmm0, %%xmm11 \n\t" // xmm11 -= xmm0 - "subpd %%xmm4, %%xmm15 \n\t" // xmm15 -= xmm4 - "mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33); - "mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33); - " \n\t" - "movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11 - "movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15 - "movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0] - "movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1] - "movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0] - "movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = 
xmm15[1]
- " \n\t"
- " \n\t"
- " \n\t"
+
+ mov(%1, rbx) // load address of b11.
+
+ movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 )
+ movaps(mem(rbx, 1*16), xmm12) // xmm12 = ( beta02 beta03 )
+ movaps(mem(rbx, 2*16), xmm9) // xmm9 = ( beta10 beta11 )
+ movaps(mem(rbx, 3*16), xmm13) // xmm13 = ( beta12 beta13 )
+ movaps(mem(rbx, 4*16), xmm10) // xmm10 = ( beta20 beta21 )
+ movaps(mem(rbx, 5*16), xmm14) // xmm14 = ( beta22 beta23 )
+ movaps(mem(rbx, 6*16), xmm11) // xmm11 = ( beta30 beta31 )
+ movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 )
+
+
+
+ mov(%0, rax) // load address of a11
+ mov(%2, rcx) // load address of c11
+
+ mov(%3, rsi) // load rs_c
+ mov(%4, rdi) // load cs_c
+ sal(imm(3), rsi) // rs_c *= sizeof( double )
+ sal(imm(3), rdi) // cs_c *= sizeof( double )
+
+ lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c
+
+
+
+ // iteration 0
+
+ movddup(mem(rax, (0+0*4)*8), xmm0) // load xmm0 = (1/alpha00)
+
+ mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00);
+ mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
+
+ movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
+ movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
+ movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
+ movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
+ movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
+ movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
+ add(rsi, rcx) // c11 += rs_c
+ add(rsi, rdx) // c11_2 += rs_c
+
+
+
+ // iteration 1
+
+ movddup(mem(rax, (1+0*4)*8), xmm0) // load xmm0 = alpha10
+ movddup(mem(rax, (1+1*4)*8), xmm1) // load xmm1 = (1/alpha11)
+
+ movaps(xmm0, xmm4) // xmm4 = xmm0
+ mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 )
+ mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 )
+ subpd(xmm0, xmm9) // xmm9 -= xmm0
+ subpd(xmm4, xmm13) // xmm13 -= xmm4
+ mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11);
+ mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
+
+ movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
+ movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
+ movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
+ movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1]
+ movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0]
+ movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
+ add(rsi, rcx) // c11 += rs_c
+ add(rsi, rdx) // c11_2 += rs_c
+
+
+
+ // iteration 2
+
+ movddup(mem(rax, (2+0*4)*8), xmm0) // load xmm0 = alpha20
+ movddup(mem(rax, (2+1*4)*8), xmm1) // load xmm1 = alpha21
+ movddup(mem(rax, (2+2*4)*8), xmm2) // load xmm2 = (1/alpha22)
+
+ movaps(xmm0, xmm4) // xmm4 = xmm0
+ movaps(xmm1, xmm5) // xmm5 = xmm1
+ mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 )
+ mulpd(xmm12, xmm4) // xmm4 = alpha20 * ( beta02 beta03 )
+ mulpd(xmm9, xmm1) // xmm1 = alpha21 * ( beta10 beta11 )
+ mulpd(xmm13, xmm5) // xmm5 = alpha21 * ( beta12 beta13 )
+ addpd(xmm1, xmm0) // xmm0 += xmm1;
+ addpd(xmm5, xmm4) // xmm4 += xmm5;
+ subpd(xmm0, xmm10) // xmm10 -= xmm0
+ subpd(xmm4, xmm14) // xmm14 -= xmm4
+ mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
+ mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
+
+ movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
+ movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
+ movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
+ movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1]
+ movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0]
+ movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
+ add(rsi, rcx) // c11 += rs_c
+ add(rsi, rdx) // c11_2 += rs_c
+
+
+
+ // iteration 3
+
+ movddup(mem(rax, (3+0*4)*8), xmm0) // load xmm0 = alpha30
+ movddup(mem(rax, (3+1*4)*8), xmm1) // load xmm1 = alpha31
+ movddup(mem(rax, (3+2*4)*8), xmm2) // load xmm2 = alpha32
+ movddup(mem(rax, (3+3*4)*8), xmm3) // load xmm3 = (1/alpha33)
+
+ movaps(xmm0, xmm4) // xmm4 = xmm0
+ movaps(xmm1, xmm5) // xmm5 = xmm1
+ movaps(xmm2, xmm6) // xmm6 = xmm2
+ mulpd(xmm8, xmm0) // xmm0 = alpha30 * ( beta00 beta01 )
+ mulpd(xmm12, xmm4) // xmm4 = alpha30 * ( beta02 beta03 )
+ mulpd(xmm9, xmm1) // xmm1 = alpha31 * ( beta10 beta11 )
+ mulpd(xmm13, xmm5) // xmm5 = alpha31 * ( beta12 beta13 )
+ mulpd(xmm10, xmm2) // xmm2 = alpha32 * ( beta20 beta21 )
+ mulpd(xmm14, xmm6) // xmm6 = alpha32 * ( beta22 beta23 )
+ addpd(xmm1, xmm0) // xmm0 += xmm1;
+ addpd(xmm5, xmm4) // xmm4 += xmm5;
+ addpd(xmm2, xmm0) // xmm0 += xmm2;
+ addpd(xmm6, xmm4) // xmm4 += xmm6;
+ subpd(xmm0, xmm11) // xmm11 -= xmm0
+ subpd(xmm4, xmm15) // xmm15 -= xmm4
+ mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
+ mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
+
+ movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
+ movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
+ movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
+ movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
+ movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
+ movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
+
+
: // output operands (none)
: // input operands
@@ -214,3 +217,4 @@ void bli_dtrsm_l_penryn_asm_4x4
}
+
diff --git a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
index b3f591aa1..de78c59db 100644
--- a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
+++ b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c
@@ -34,6 +34,9 @@
#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
#if 0
void bli_strsm_u_penryn_asm_8x4
(
@@ -63,141 +66,141 @@ void bli_dtrsm_u_penryn_asm_4x4
__asm__ volatile
(
- " \n\t"
- "movq %1, %%rbx \n\t" // load address of b11.
- " \n\t" - "movaps 0 * 16(%%rbx), %%xmm8 \n\t" // xmm8 = ( beta00 beta01 ) - "movaps 1 * 16(%%rbx), %%xmm12 \n\t" // xmm9 = ( beta02 beta03 ) - "movaps 2 * 16(%%rbx), %%xmm9 \n\t" // xmm10 = ( beta10 beta11 ) - "movaps 3 * 16(%%rbx), %%xmm13 \n\t" // xmm11 = ( beta12 beta13 ) - "movaps 4 * 16(%%rbx), %%xmm10 \n\t" // xmm12 = ( beta20 beta21 ) - "movaps 5 * 16(%%rbx), %%xmm14 \n\t" // xmm13 = ( beta22 beta23 ) - "movaps 6 * 16(%%rbx), %%xmm11 \n\t" // xmm14 = ( beta30 beta31 ) - "movaps 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = ( beta32 beta33 ) - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rax \n\t" // load address of a11 - "movq %2, %%rcx \n\t" // load address of c11 - " \n\t" - "movq %3, %%rsi \n\t" // load rs_c - "movq %4, %%rdi \n\t" // load cs_c - "salq $3, %%rsi \n\t" // rs_c *= sizeof( double ) - "salq $3, %%rdi \n\t" // cs_c *= sizeof( double ) - " \n\t" - "addq %%rsi, %%rcx \n\t" // c11 += (4-1)*rs_c - "addq %%rsi, %%rcx \n\t" - "addq %%rsi, %%rcx \n\t" - "leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 0 - " \n\t" - "movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33) - " \n\t" - "mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33); - "mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33); - " \n\t" - "movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11 - "movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15 - "movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0] - "movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1] - "movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0] - "movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = xmm15[1] - "subq %%rsi, %%rcx \n\t" // c11 -= rs_c - "subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 1 - " \n\t" - "movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22) - "movddup (2+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha23 - " \n\t" - "movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 - "mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha23 * ( beta30 beta31 ) - "mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha23 * ( beta32 beta33 ) - "subpd %%xmm3, %%xmm10 \n\t" // xmm10 -= xmm3 - "subpd %%xmm7, %%xmm14 \n\t" // xmm14 -= xmm7 - "mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22); - "mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22); - " \n\t" - "movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10 - "movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14 - "movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0] - "movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1] - "movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0] - "movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1] - "subq %%rsi, %%rcx \n\t" // c11 -= rs_c - "subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 2 - " \n\t" - "movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11) - "movddup (1+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha12 - "movddup (1+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha13 - " \n\t" - "movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 - "movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 - "mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha12 * ( beta20 beta21 ) - "mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha12 * ( beta22 beta23 ) - "mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha13 * ( beta30 beta31 ) - "mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha13 * ( 
beta32 beta33 ) - "addpd %%xmm3, %%xmm2 \n\t" // xmm2 += xmm3; - "addpd %%xmm7, %%xmm6 \n\t" // xmm6 += xmm7; - "subpd %%xmm2, %%xmm9 \n\t" // xmm9 -= xmm2 - "subpd %%xmm6, %%xmm13 \n\t" // xmm13 -= xmm6 - "mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11); - "mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11); - " \n\t" - "movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9 - "movaps %%xmm13, 3 * 16(%%rbx) \n\t" // store ( beta12 beta13 ) = xmm13 - "movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0] - "movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1] - "movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0] - "movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1] - "subq %%rsi, %%rcx \n\t" // c11 -= rs_c - "subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 3 - " \n\t" - "movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00) - "movddup (0+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha01 - "movddup (0+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha02 - "movddup (0+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha03 - " \n\t" - "movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 - "movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 - "movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 - "mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha01 * ( beta10 beta11 ) - "mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha01 * ( beta12 beta13 ) - "mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha02 * ( beta20 beta21 ) - "mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha02 * ( beta22 beta23 ) - "mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha03 * ( beta30 beta31 ) - "mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha03 * ( beta32 beta33 ) - "addpd %%xmm2, %%xmm1 \n\t" // xmm1 += xmm2; - "addpd %%xmm6, %%xmm5 \n\t" // xmm5 += xmm6; - "addpd %%xmm3, %%xmm1 \n\t" // xmm1 += xmm3; - "addpd %%xmm7, %%xmm5 \n\t" // xmm5 += xmm7; - "subpd %%xmm1, %%xmm8 \n\t" // xmm8 -= xmm1 - "subpd %%xmm5, %%xmm12 \n\t" // xmm12 -= xmm5 - "mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00); - "mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00); - " \n\t" - "movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8 - "movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12 - "movlpd %%xmm8, (%%rcx) \n\t" // store ( gamma00 ) = xmm8[0] - "movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1] - "movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0] - "movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1] - " \n\t" - " \n\t" - " \n\t" + + mov(%1, rbx) // load address of b11. 
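+ // (note: the back-substitution below both updates b11 in
+ // place and writes each solved element out to c11)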
+
+ movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 )
+ movaps(mem(rbx, 1*16), xmm12) // xmm12 = ( beta02 beta03 )
+ movaps(mem(rbx, 2*16), xmm9) // xmm9 = ( beta10 beta11 )
+ movaps(mem(rbx, 3*16), xmm13) // xmm13 = ( beta12 beta13 )
+ movaps(mem(rbx, 4*16), xmm10) // xmm10 = ( beta20 beta21 )
+ movaps(mem(rbx, 5*16), xmm14) // xmm14 = ( beta22 beta23 )
+ movaps(mem(rbx, 6*16), xmm11) // xmm11 = ( beta30 beta31 )
+ movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 )
+
+
+
+ mov(%0, rax) // load address of a11
+ mov(%2, rcx) // load address of c11
+
+ mov(%3, rsi) // load rs_c
+ mov(%4, rdi) // load cs_c
+ sal(imm(3), rsi) // rs_c *= sizeof( double )
+ sal(imm(3), rdi) // cs_c *= sizeof( double )
+
+ add(rsi, rcx) // c11 += (4-1)*rs_c
+ add(rsi, rcx)
+ add(rsi, rcx)
+ lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c;
+
+
+
+ // iteration 0
+
+ movddup(mem(rax, (3+3*4)*8), xmm3) // load xmm3 = (1/alpha33)
+
+ mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
+ mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
+
+ movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
+ movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
+ movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
+ movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
+ movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
+ movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
+ sub(rsi, rcx) // c11 -= rs_c
+ sub(rsi, rdx) // c11_2 -= rs_c
+
+
+
+ // iteration 1
+
+ movddup(mem(rax, (2+2*4)*8), xmm2) // load xmm2 = (1/alpha22)
+ movddup(mem(rax, (2+3*4)*8), xmm3) // load xmm3 = alpha23
+
+ movaps(xmm3, xmm7) // xmm7 = xmm3
+ mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 )
+ mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 )
+ subpd(xmm3, xmm10) // xmm10 -= xmm3
+ subpd(xmm7, xmm14) // xmm14 -= xmm7
+ mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
+ mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
+
+ movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
+ movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
+ movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
+ movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1]
+ movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0]
+ movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
+ sub(rsi, rcx) // c11 -= rs_c
+ sub(rsi, rdx) // c11_2 -= rs_c
+
+
+
+ // iteration 2
+
+ movddup(mem(rax, (1+1*4)*8), xmm1) // load xmm1 = (1/alpha11)
+ movddup(mem(rax, (1+2*4)*8), xmm2) // load xmm2 = alpha12
+ movddup(mem(rax, (1+3*4)*8), xmm3) // load xmm3 = alpha13
+
+ movaps(xmm2, xmm6) // xmm6 = xmm2
+ movaps(xmm3, xmm7) // xmm7 = xmm3
+ mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 )
+ mulpd(xmm14, xmm6) // xmm6 = alpha12 * ( beta22 beta23 )
+ mulpd(xmm11, xmm3) // xmm3 = alpha13 * ( beta30 beta31 )
+ mulpd(xmm15, xmm7) // xmm7 = alpha13 * ( beta32 beta33 )
+ addpd(xmm3, xmm2) // xmm2 += xmm3;
+ addpd(xmm7, xmm6) // xmm6 += xmm7;
+ subpd(xmm2, xmm9) // xmm9 -= xmm2
+ subpd(xmm6, xmm13) // xmm13 -= xmm6
+ mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11);
+ mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
+
+ movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
+ movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
+ movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
+ movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1]
+ movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0]
+ movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
+ sub(rsi, rcx) // c11 -= rs_c
+ sub(rsi, rdx) // c11_2 -= rs_c
+
+
+
+ // iteration 3
+
+ movddup(mem(rax, (0+0*4)*8), xmm0) // load xmm0 = (1/alpha00)
+ movddup(mem(rax, (0+1*4)*8), xmm1) // load xmm1 = alpha01
+ movddup(mem(rax, (0+2*4)*8), xmm2) // load xmm2 = alpha02
+ movddup(mem(rax, (0+3*4)*8), xmm3) // load xmm3 = alpha03
+
+ movaps(xmm1, xmm5) // xmm5 = xmm1
+ movaps(xmm2, xmm6) // xmm6 = xmm2
+ movaps(xmm3, xmm7) // xmm7 = xmm3
+ mulpd(xmm9, xmm1) // xmm1 = alpha01 * ( beta10 beta11 )
+ mulpd(xmm13, xmm5) // xmm5 = alpha01 * ( beta12 beta13 )
+ mulpd(xmm10, xmm2) // xmm2 = alpha02 * ( beta20 beta21 )
+ mulpd(xmm14, xmm6) // xmm6 = alpha02 * ( beta22 beta23 )
+ mulpd(xmm11, xmm3) // xmm3 = alpha03 * ( beta30 beta31 )
+ mulpd(xmm15, xmm7) // xmm7 = alpha03 * ( beta32 beta33 )
+ addpd(xmm2, xmm1) // xmm1 += xmm2;
+ addpd(xmm6, xmm5) // xmm5 += xmm6;
+ addpd(xmm3, xmm1) // xmm1 += xmm3;
+ addpd(xmm7, xmm5) // xmm5 += xmm7;
+ subpd(xmm1, xmm8) // xmm8 -= xmm1
+ subpd(xmm5, xmm12) // xmm12 -= xmm5
+ mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00);
+ mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
+
+ movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
+ movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
+ movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
+ movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
+ movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
+ movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
+
+
: // output operands (none)
: // input operands
@@ -217,3 +220,4 @@ void bli_dtrsm_u_penryn_asm_4x4
}
+
diff --git a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
index 677de07fa..82866f1fd 100644
--- a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
+++ b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c
@@ -37,6 +37,9 @@
#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
void bli_sgemm_piledriver_asm_16x3
(
dim_t k0,
@@ -61,828 +64,828 @@ void bli_sgemm_piledriver_asm_16x3
__asm__ volatile
(
- " \n\t"
- " \n\t"
- "movq %2, %%rax \n\t" // load address of a.
- "movq %3, %%rbx \n\t" // load address of b.
- "movq %9, %%r15 \n\t" // load address of b_next.
- "movq %10, %%r14 \n\t" // load address of a_next.
- " \n\t"
- "prefetcht0 128(%%rbx) \n\t" // prefetch b
- "prefetcht0 64+128(%%rbx) \n\t" // prefetch b
- "prefetcht0 128+128(%%rbx) \n\t" // prefetch b
- " \n\t"
- "addq $32 * 4, %%rax \n\t"
- "addq $12 * 4, %%rbx \n\t"
- " \n\t"
- "movq %6, %%rcx \n\t" // load address of c
- "movq %8, %%rdi \n\t" // load cs_c
- "leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float)
- "leaq (%%rcx,%%rdi,1), %%r10 \n\t" // load address of c + 1*cs_c;
- "leaq (%%rcx,%%rdi,2), %%r11 \n\t" // load address of c + 2*cs_c;
- " \n\t"
- "vbroadcastss -12 * 4(%%rbx), %%xmm1 \n\t"
- "vbroadcastss -11 * 4(%%rbx), %%xmm2 \n\t"
- "vbroadcastss -10 * 4(%%rbx), %%xmm3 \n\t"
- " \n\t"
- "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t"
- "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t"
- "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t"
- "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
- "vxorps %%xmm8, %%xmm8, %%xmm8 \n\t"
- "vxorps %%xmm9, %%xmm9, %%xmm9 \n\t"
- "vxorps %%xmm10, %%xmm10, %%xmm10 \n\t"
- "vxorps %%xmm11, %%xmm11, %%xmm11 \n\t"
- "vxorps %%xmm12, %%xmm12, %%xmm12 \n\t"
- "vxorps %%xmm13, %%xmm13, %%xmm13 \n\t"
- "vxorps %%xmm14, %%xmm14, %%xmm14 \n\t"
- "vxorps %%xmm15, %%xmm15, %%xmm15 \n\t"
- " \n\t"
- " \n\t"
- " \n\t"
- "movq %0, %%rsi \n\t" // i = k_iter;
- "testq %%rsi, %%rsi \n\t" // check i via logical AND.
- "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".SLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - "je .SCONSIDKLEFT \n\t" // if i == 0, jump to k_left code. - " \n\t" - " \n\t" - "prefetcht0 16+192(%%rbx) \n\t" // prefetch b - " \n\t" - " \n\t" // iteration 0 - "vmovaps -32 * 4(%%rax), %%xmm0 \n\t" - "prefetcht0 384(%%rax) \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps -28 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps -24 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps -20 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm13 \n\t" - "vbroadcastss -9 * 4(%%rbx), %%xmm1 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm14 \n\t" - "vbroadcastss -8 * 4(%%rbx), %%xmm2 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vmovaps -16 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss -7 * 4(%%rbx), %%xmm3 \n\t" - "prefetcht0 64+384(%%rax) \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps -12 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps -8 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps -4 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm13 \n\t" - "vbroadcastss -6 * 4(%%rbx), %%xmm1 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm14 \n\t" - "vbroadcastss -5 * 4(%%rbx), %%xmm2 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 2 - "vmovaps 0 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss -4 * 4(%%rbx), %%xmm3 \n\t" - "prefetcht0 128+384(%%rax) \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps 4 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps 8 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps 12 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm13 \n\t" - "vbroadcastss -3 * 4(%%rbx), %%xmm1 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm14 \n\t" - "vbroadcastss -2 * 4(%%rbx), %%xmm2 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vmovaps 16 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss -1 * 4(%%rbx), %%xmm3 \n\t" - "prefetcht0 192+384(%%rax) \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps 20 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps 24 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm12 
\n\t" - "vmovaps 28 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm13 \n\t" - "vbroadcastss 0 * 4(%%rbx), %%xmm1 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm14 \n\t" - "vbroadcastss 1 * 4(%%rbx), %%xmm2 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" - "addq $4 * 16 * 4, %%rax \n\t" // a += 4*16 (unroll x mr) - " \n\t" - " \n\t" // iteration 4 - "vmovaps -32 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss 2 * 4(%%rbx), %%xmm3 \n\t" - "prefetcht0 384(%%rax) \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps -28 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps -24 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps -20 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm13 \n\t" - "vbroadcastss 3 * 4(%%rbx), %%xmm1 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm14 \n\t" - "vbroadcastss 4 * 4(%%rbx), %%xmm2 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - "prefetcht0 80+192(%%rbx) \n\t" // prefetch b - " \n\t" - " \n\t" // iteration 5 - "vmovaps -16 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss 5 * 4(%%rbx), %%xmm3 \n\t" - "prefetcht0 64+384(%%rax) \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps -12 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps -8 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps -4 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm13 \n\t" - "vbroadcastss 6 * 4(%%rbx), %%xmm1 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm14 \n\t" - "vbroadcastss 7 * 4(%%rbx), %%xmm2 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 6 - "vmovaps 0 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss 8 * 4(%%rbx), %%xmm3 \n\t" - "prefetcht0 128+384(%%rax) \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps 4 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps 8 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps 12 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm13 \n\t" - "vbroadcastss 9 * 4(%%rbx), %%xmm1 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm14 \n\t" - "vbroadcastss 10 * 4(%%rbx), %%xmm2 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 7 - "vmovaps 16 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss 11 * 4(%%rbx), %%xmm3 \n\t" - "addq $8 * 3 * 4, %%rbx \n\t" // a += 4*3 (unroll x nr) - "prefetcht0 192+384(%%rax) \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps 20 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231ps 
%%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps 24 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps 28 * 4(%%rax), %%xmm0 \n\t" - "addq $4 * 16 * 4, %%rax \n\t" // a += 4*16 (unroll x mr) - "vfmadd231ps %%xmm1, %%xmm0, %%xmm13 \n\t" - "vbroadcastss -12 * 4(%%rbx), %%xmm1 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm14 \n\t" - "vbroadcastss -11 * 4(%%rbx), %%xmm2 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm15 \n\t" - "vbroadcastss -10 * 4(%%rbx), %%xmm3 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jmp .SLOOPKITER \n\t" // jump to beginning of loop. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".SLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - " \n\t" - "je .SPOSTACCUM \n\t" // if i == 0, we're done. - " \n\t" - " \n\t" - "prefetcht0 16+192(%%rbx) \n\t" // prefetch b - " \n\t" - " \n\t" // iteration 0 - "vmovaps -32 * 4(%%rax), %%xmm0 \n\t" - "prefetcht0 384(%%rax) \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps -28 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps -24 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps -20 * 4(%%rax), %%xmm0 \n\t" - "vfmadd231ps %%xmm1, %%xmm0, %%xmm13 \n\t" - "vbroadcastss -9 * 4(%%rbx), %%xmm1 \n\t" - "vfmadd231ps %%xmm2, %%xmm0, %%xmm14 \n\t" - "vbroadcastss -8 * 4(%%rbx), %%xmm2 \n\t" - "vfmadd231ps %%xmm3, %%xmm0, %%xmm15 \n\t" - "vbroadcastss -7 * 4(%%rbx), %%xmm3 \n\t" - " \n\t" - " \n\t" - "addq $1 * 16 * 4, %%rax \n\t" // a += 4*16 (unroll x mr) - "addq $1 * 3 * 4, %%rbx \n\t" // a += 4*3 (unroll x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jmp .SLOOPKLEFT \n\t" // jump to beginning of loop. 
- " \n\t" - " \n\t" - " \n\t" - ".SPOSTACCUM: \n\t" - " \n\t" - " \n\t" - "prefetchw 0 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetchw 0 * 8(%%r10) \n\t" // prefetch c + 1*cs_c - "prefetchw 0 * 8(%%r11) \n\t" // prefetch c + 2*cs_c - " \n\t" - " \n\t" - " \n\t" // xmm4: xmm5: xmm6: - " \n\t" // ( ab00 ( ab01 ( ab02 - " \n\t" // ab10 ab11 ab12 - " \n\t" // ab20 ab21 ab22 - " \n\t" // ab30 ) ab31 ) ab32 ) - " \n\t" - " \n\t" // xmm7: xmm8: xmm9: - " \n\t" // ( ab40 ( ab41 ( ab42 - " \n\t" // ab50 ab51 ab52 - " \n\t" // ab60 ab61 ab62 - " \n\t" // ab70 ) ab71 ) ab72 ) - " \n\t" - " \n\t" // xmm10: xmm11: xmm12: - " \n\t" // ( ab80 ( ab01 ( ab02 - " \n\t" // ab90 ab11 ab12 - " \n\t" // abA0 abA1 abA2 - " \n\t" // abB0 ) abB1 ) abB2 ) - " \n\t" - " \n\t" // xmm13: xmm14: xmm15: - " \n\t" // ( abC0 ( abC1 ( abC2 - " \n\t" // abD0 abD1 abD2 - " \n\t" // abE0 abE1 abE2 - " \n\t" // abF0 ) abF1 ) abF2 ) - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastss (%%rax), %%xmm0 \n\t" // load alpha and duplicate - "vbroadcastss (%%rbx), %%xmm2 \n\t" // load beta and duplicate - " \n\t" - "vmulps %%xmm0, %%xmm4, %%xmm4 \n\t" // scale by alpha - "vmulps %%xmm0, %%xmm5, %%xmm5 \n\t" - "vmulps %%xmm0, %%xmm6, %%xmm6 \n\t" - "vmulps %%xmm0, %%xmm7, %%xmm7 \n\t" - "vmulps %%xmm0, %%xmm8, %%xmm8 \n\t" - "vmulps %%xmm0, %%xmm9, %%xmm9 \n\t" - "vmulps %%xmm0, %%xmm10, %%xmm10 \n\t" - "vmulps %%xmm0, %%xmm11, %%xmm11 \n\t" - "vmulps %%xmm0, %%xmm12, %%xmm12 \n\t" - "vmulps %%xmm0, %%xmm13, %%xmm13 \n\t" - "vmulps %%xmm0, %%xmm14, %%xmm14 \n\t" - "vmulps %%xmm0, %%xmm15, %%xmm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "prefetcht0 (%%r14) \n\t" // prefetch a_next - "prefetcht0 64(%%r14) \n\t" // prefetch a_next - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float) - " \n\t" - //"leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; - " \n\t" - "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c; - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 4*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (4*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - "prefetcht0 (%%r15) \n\t" // prefetch b_next - "prefetcht0 64(%%r15) \n\t" // prefetch b_next - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" // set xmm0 to zero. - "vucomiss %%xmm0, %%xmm2 \n\t" // set ZF if beta == 0. - "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
- "jne .SCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".SGENSTORED: \n\t" - " \n\t" - " \n\t" - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" // load c00:c30 - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm4, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" // store c00:c30 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r13) \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" // load c40:c70 - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm7, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" // store c40:c70 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r13) \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" // load c80:cB0 - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm10, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" // store c80:cB0 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r13) \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" // load cC0:cF0 - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm13, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" // store cC0:cF0 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r13) \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovlps (%%r10), %%xmm0, %%xmm0 \n\t" // load c01:c31 - "vmovhps (%%r10,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%r10,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%r10,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm5, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10) \n\t" // store c01:c31 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r13) \n\t" - "leaq (%%r10,%%rsi,4), %%r10 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovlps 
(%%r10), %%xmm0, %%xmm0 \n\t" // load c41:c71 - "vmovhps (%%r10,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%r10,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%r10,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm8, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10) \n\t" // store c41:c71 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r13) \n\t" - "leaq (%%r10,%%rsi,4), %%r10 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovlps (%%r10), %%xmm0, %%xmm0 \n\t" // load c81:cB1 - "vmovhps (%%r10,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%r10,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%r10,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm11, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10) \n\t" // store c81:cB1 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r13) \n\t" - "leaq (%%r10,%%rsi,4), %%r10 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovlps (%%r10), %%xmm0, %%xmm0 \n\t" // load cC1:cF1 - "vmovhps (%%r10,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%r10,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%r10,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm14, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10) \n\t" // store cC1:cF1 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r13) \n\t" - "leaq (%%r10,%%rsi,4), %%r10 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovlps (%%r11), %%xmm0, %%xmm0 \n\t" // load c02:c32 - "vmovhps (%%r11,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%r11,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%r11,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm6, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11) \n\t" // store c02:c32 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r13) \n\t" - "leaq (%%r11,%%rsi,4), %%r11 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovlps (%%r11), %%xmm0, %%xmm0 \n\t" // load c42:c72 - "vmovhps (%%r11,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%r11,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%r11,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm9, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11) \n\t" // store c42:c72 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r13) \n\t" - "leaq (%%r11,%%rsi,4), %%r11 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovlps (%%r11), %%xmm0, %%xmm0 \n\t" // load c82:cB2 - "vmovhps (%%r11,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%r11,%%r12), %%xmm1, %%xmm1 \n\t" - 
"vmovhps (%%r11,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm12, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11) \n\t" // store c82:cB2 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r13) \n\t" - "leaq (%%r11,%%rsi,4), %%r11 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovlps (%%r11), %%xmm0, %%xmm0 \n\t" // load cC2:cF2 - "vmovhps (%%r11,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%r11,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%r11,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm15, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11) \n\t" // store cC2:cF1 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r13) \n\t" - "leaq (%%r11,%%rsi,4), %%r11 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vfmadd231ps 0 * 16(%%rcx), %%xmm2, %%xmm4 \n\t" - "vfmadd231ps 1 * 16(%%rcx), %%xmm2, %%xmm7 \n\t" - "vfmadd231ps 2 * 16(%%rcx), %%xmm2, %%xmm10 \n\t" - "vfmadd231ps 3 * 16(%%rcx), %%xmm2, %%xmm13 \n\t" - " \n\t" - "vmovups %%xmm4, 0 * 16(%%rcx) \n\t" - "vmovups %%xmm7, 1 * 16(%%rcx) \n\t" - "vmovups %%xmm10, 2 * 16(%%rcx) \n\t" - "vmovups %%xmm13, 3 * 16(%%rcx) \n\t" - " \n\t" - "vfmadd231ps 0 * 16(%%r10), %%xmm2, %%xmm5 \n\t" - "vfmadd231ps 1 * 16(%%r10), %%xmm2, %%xmm8 \n\t" - "vfmadd231ps 2 * 16(%%r10), %%xmm2, %%xmm11 \n\t" - "vfmadd231ps 3 * 16(%%r10), %%xmm2, %%xmm14 \n\t" - " \n\t" - "vmovups %%xmm5, 0 * 16(%%r10) \n\t" - "vmovups %%xmm8, 1 * 16(%%r10) \n\t" - "vmovups %%xmm11, 2 * 16(%%r10) \n\t" - "vmovups %%xmm14, 3 * 16(%%r10) \n\t" - " \n\t" - "vfmadd231ps 0 * 16(%%r11), %%xmm2, %%xmm6 \n\t" - "vfmadd231ps 1 * 16(%%r11), %%xmm2, %%xmm9 \n\t" - "vfmadd231ps 2 * 16(%%r11), %%xmm2, %%xmm12 \n\t" - "vfmadd231ps 3 * 16(%%r11), %%xmm2, %%xmm15 \n\t" - " \n\t" - "vmovups %%xmm6, 0 * 16(%%r11) \n\t" - "vmovups %%xmm9, 1 * 16(%%r11) \n\t" - "vmovups %%xmm12, 2 * 16(%%r11) \n\t" - "vmovups %%xmm15, 3 * 16(%%r11) \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
- "jne .SCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".SGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovaps %%xmm4, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" // store c00:c30 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r13) \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%xmm7, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" // store c40:c70 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r13) \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%xmm10, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" // store c80:cB0 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r13) \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%xmm13, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" // store cC0:cF0 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r13) \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%xmm5, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10) \n\t" // store c01:c31 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r13) \n\t" - "leaq (%%r10,%%rsi,4), %%r10 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%xmm8, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10) \n\t" // store c41:c71 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r13) \n\t" - "leaq (%%r10,%%rsi,4), %%r10 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%xmm11, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10) \n\t" // store c81:cB1 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r13) \n\t" - "leaq (%%r10,%%rsi,4), %%r10 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%xmm14, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10) \n\t" // store cC1:cF1 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r10,%%r13) \n\t" - "leaq (%%r10,%%rsi,4), %%r10 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%xmm6, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11) \n\t" // store c02:c32 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss 
%%xmm0, (%%r11,%%r13) \n\t" - "leaq (%%r11,%%rsi,4), %%r11 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%xmm9, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11) \n\t" // store c42:c72 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r13) \n\t" - "leaq (%%r11,%%rsi,4), %%r11 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%xmm12, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11) \n\t" // store c82:cB2 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r13) \n\t" - "leaq (%%r11,%%rsi,4), %%r11 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%xmm15, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11) \n\t" // store cC2:cF1 - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%rsi) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm0 \n\t" - "vmovss %%xmm0, (%%r11,%%r13) \n\t" - "leaq (%%r11,%%rsi,4), %%r11 \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovups %%xmm4, 0 * 16(%%rcx) \n\t" - "vmovups %%xmm7, 1 * 16(%%rcx) \n\t" - "vmovups %%xmm10, 2 * 16(%%rcx) \n\t" - "vmovups %%xmm13, 3 * 16(%%rcx) \n\t" - " \n\t" - "vmovups %%xmm5, 0 * 16(%%r10) \n\t" - "vmovups %%xmm8, 1 * 16(%%r10) \n\t" - "vmovups %%xmm11, 2 * 16(%%r10) \n\t" - "vmovups %%xmm14, 3 * 16(%%r10) \n\t" - " \n\t" - "vmovups %%xmm6, 0 * 16(%%r11) \n\t" - "vmovups %%xmm9, 1 * 16(%%r11) \n\t" - "vmovups %%xmm12, 2 * 16(%%r11) \n\t" - "vmovups %%xmm15, 3 * 16(%%r11) \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SDONE: \n\t" - " \n\t" + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + mov(%9, r15) // load address of b_next. + mov(%10, r14) // load address of a_next. + + prefetch(0, mem(rbx, 128)) // prefetch b + prefetch(0, mem(rbx, 64+128)) // prefetch b + prefetch(0, mem(rbx, 128+128)) // prefetch b + + add(imm(32*4), rax) + add(imm(12*4), rbx) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) + lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; + lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c; + + vbroadcastss(mem(rbx, -12*4), xmm1) + vbroadcastss(mem(rbx, -11*4), xmm2) + vbroadcastss(mem(rbx, -10*4), xmm3) + + vxorps(xmm4, xmm4, xmm4) + vxorps(xmm5, xmm5, xmm5) + vxorps(xmm6, xmm6, xmm6) + vxorps(xmm7, xmm7, xmm7) + vxorps(xmm8, xmm8, xmm8) + vxorps(xmm9, xmm9, xmm9) + vxorps(xmm10, xmm10, xmm10) + vxorps(xmm11, xmm11, xmm11) + vxorps(xmm12, xmm12, xmm12) + vxorps(xmm13, xmm13, xmm13) + vxorps(xmm14, xmm14, xmm14) + vxorps(xmm15, xmm15, xmm15) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.SLOOPKITER) // MAIN LOOP + + + je(.SCONSIDKLEFT) // if i == 0, jump to k_left code. 
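+ // (the je above relies on the flags set by the test() before
+ // the loop and by the dec(rsi) at the bottom of each pass)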
+ + + prefetch(0, mem(rbx, 16+192)) // prefetch b + + // iteration 0 + vmovaps(mem(rax, -32*4), xmm0) + prefetch(0, mem(rax, 384)) + vfmadd231ps(xmm1, xmm0, xmm4) + vfmadd231ps(xmm2, xmm0, xmm5) + vfmadd231ps(xmm3, xmm0, xmm6) + vmovaps(mem(rax, -28*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm7) + vfmadd231ps(xmm2, xmm0, xmm8) + vfmadd231ps(xmm3, xmm0, xmm9) + vmovaps(mem(rax, -24*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm10) + vfmadd231ps(xmm2, xmm0, xmm11) + vfmadd231ps(xmm3, xmm0, xmm12) + vmovaps(mem(rax, -20*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm13) + vbroadcastss(mem(rbx, -9*4), xmm1) + vfmadd231ps(xmm2, xmm0, xmm14) + vbroadcastss(mem(rbx, -8*4), xmm2) + vfmadd231ps(xmm3, xmm0, xmm15) + + // iteration 1 + vmovaps(mem(rax, -16*4), xmm0) + vbroadcastss(mem(rbx, -7*4), xmm3) + prefetch(0, mem(rax, 64+384)) + vfmadd231ps(xmm1, xmm0, xmm4) + vfmadd231ps(xmm2, xmm0, xmm5) + vfmadd231ps(xmm3, xmm0, xmm6) + vmovaps(mem(rax, -12*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm7) + vfmadd231ps(xmm2, xmm0, xmm8) + vfmadd231ps(xmm3, xmm0, xmm9) + vmovaps(mem(rax, -8*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm10) + vfmadd231ps(xmm2, xmm0, xmm11) + vfmadd231ps(xmm3, xmm0, xmm12) + vmovaps(mem(rax, -4*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm13) + vbroadcastss(mem(rbx, -6*4), xmm1) + vfmadd231ps(xmm2, xmm0, xmm14) + vbroadcastss(mem(rbx, -5*4), xmm2) + vfmadd231ps(xmm3, xmm0, xmm15) + + // iteration 2 + vmovaps(mem(rax, 0*4), xmm0) + vbroadcastss(mem(rbx, -4*4), xmm3) + prefetch(0, mem(rax, 128+384)) + vfmadd231ps(xmm1, xmm0, xmm4) + vfmadd231ps(xmm2, xmm0, xmm5) + vfmadd231ps(xmm3, xmm0, xmm6) + vmovaps(mem(rax, 4*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm7) + vfmadd231ps(xmm2, xmm0, xmm8) + vfmadd231ps(xmm3, xmm0, xmm9) + vmovaps(mem(rax, 8*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm10) + vfmadd231ps(xmm2, xmm0, xmm11) + vfmadd231ps(xmm3, xmm0, xmm12) + vmovaps(mem(rax, 12*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm13) + vbroadcastss(mem(rbx, -3*4), xmm1) + vfmadd231ps(xmm2, xmm0, xmm14) + vbroadcastss(mem(rbx, -2*4), xmm2) + vfmadd231ps(xmm3, xmm0, xmm15) + + // iteration 3 + vmovaps(mem(rax, 16*4), xmm0) + vbroadcastss(mem(rbx, -1*4), xmm3) + prefetch(0, mem(rax, 192+384)) + vfmadd231ps(xmm1, xmm0, xmm4) + vfmadd231ps(xmm2, xmm0, xmm5) + vfmadd231ps(xmm3, xmm0, xmm6) + vmovaps(mem(rax, 20*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm7) + vfmadd231ps(xmm2, xmm0, xmm8) + vfmadd231ps(xmm3, xmm0, xmm9) + vmovaps(mem(rax, 24*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm10) + vfmadd231ps(xmm2, xmm0, xmm11) + vfmadd231ps(xmm3, xmm0, xmm12) + vmovaps(mem(rax, 28*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm13) + vbroadcastss(mem(rbx, 0*4), xmm1) + vfmadd231ps(xmm2, xmm0, xmm14) + vbroadcastss(mem(rbx, 1*4), xmm2) + vfmadd231ps(xmm3, xmm0, xmm15) + + + add(imm(4*16*4), rax) // a += 4*16 (unroll x mr) + + // iteration 4 + vmovaps(mem(rax, -32*4), xmm0) + vbroadcastss(mem(rbx, 2*4), xmm3) + prefetch(0, mem(rax, 384)) + vfmadd231ps(xmm1, xmm0, xmm4) + vfmadd231ps(xmm2, xmm0, xmm5) + vfmadd231ps(xmm3, xmm0, xmm6) + vmovaps(mem(rax, -28*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm7) + vfmadd231ps(xmm2, xmm0, xmm8) + vfmadd231ps(xmm3, xmm0, xmm9) + vmovaps(mem(rax, -24*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm10) + vfmadd231ps(xmm2, xmm0, xmm11) + vfmadd231ps(xmm3, xmm0, xmm12) + vmovaps(mem(rax, -20*4), xmm0) + vfmadd231ps(xmm1, xmm0, xmm13) + vbroadcastss(mem(rbx, 3*4), xmm1) + vfmadd231ps(xmm2, xmm0, xmm14) + vbroadcastss(mem(rbx, 4*4), xmm2) + vfmadd231ps(xmm3, xmm0, xmm15) + + prefetch(0, mem(rbx, 80+192)) // prefetch b + + // iteration 5 + vmovaps(mem(rax, 
-16*4), xmm0)
+ vbroadcastss(mem(rbx, 5*4), xmm3)
+ prefetch(0, mem(rax, 64+384))
+ vfmadd231ps(xmm1, xmm0, xmm4)
+ vfmadd231ps(xmm2, xmm0, xmm5)
+ vfmadd231ps(xmm3, xmm0, xmm6)
+ vmovaps(mem(rax, -12*4), xmm0)
+ vfmadd231ps(xmm1, xmm0, xmm7)
+ vfmadd231ps(xmm2, xmm0, xmm8)
+ vfmadd231ps(xmm3, xmm0, xmm9)
+ vmovaps(mem(rax, -8*4), xmm0)
+ vfmadd231ps(xmm1, xmm0, xmm10)
+ vfmadd231ps(xmm2, xmm0, xmm11)
+ vfmadd231ps(xmm3, xmm0, xmm12)
+ vmovaps(mem(rax, -4*4), xmm0)
+ vfmadd231ps(xmm1, xmm0, xmm13)
+ vbroadcastss(mem(rbx, 6*4), xmm1)
+ vfmadd231ps(xmm2, xmm0, xmm14)
+ vbroadcastss(mem(rbx, 7*4), xmm2)
+ vfmadd231ps(xmm3, xmm0, xmm15)
+
+ // iteration 6
+ vmovaps(mem(rax, 0*4), xmm0)
+ vbroadcastss(mem(rbx, 8*4), xmm3)
+ prefetch(0, mem(rax, 128+384))
+ vfmadd231ps(xmm1, xmm0, xmm4)
+ vfmadd231ps(xmm2, xmm0, xmm5)
+ vfmadd231ps(xmm3, xmm0, xmm6)
+ vmovaps(mem(rax, 4*4), xmm0)
+ vfmadd231ps(xmm1, xmm0, xmm7)
+ vfmadd231ps(xmm2, xmm0, xmm8)
+ vfmadd231ps(xmm3, xmm0, xmm9)
+ vmovaps(mem(rax, 8*4), xmm0)
+ vfmadd231ps(xmm1, xmm0, xmm10)
+ vfmadd231ps(xmm2, xmm0, xmm11)
+ vfmadd231ps(xmm3, xmm0, xmm12)
+ vmovaps(mem(rax, 12*4), xmm0)
+ vfmadd231ps(xmm1, xmm0, xmm13)
+ vbroadcastss(mem(rbx, 9*4), xmm1)
+ vfmadd231ps(xmm2, xmm0, xmm14)
+ vbroadcastss(mem(rbx, 10*4), xmm2)
+ vfmadd231ps(xmm3, xmm0, xmm15)
+
+ // iteration 7
+ vmovaps(mem(rax, 16*4), xmm0)
+ vbroadcastss(mem(rbx, 11*4), xmm3)
+ add(imm(8*3*4), rbx) // b += 8*3 (unroll x nr)
+ prefetch(0, mem(rax, 192+384))
+ vfmadd231ps(xmm1, xmm0, xmm4)
+ vfmadd231ps(xmm2, xmm0, xmm5)
+ vfmadd231ps(xmm3, xmm0, xmm6)
+ vmovaps(mem(rax, 20*4), xmm0)
+ vfmadd231ps(xmm1, xmm0, xmm7)
+ vfmadd231ps(xmm2, xmm0, xmm8)
+ vfmadd231ps(xmm3, xmm0, xmm9)
+ vmovaps(mem(rax, 24*4), xmm0)
+ vfmadd231ps(xmm1, xmm0, xmm10)
+ vfmadd231ps(xmm2, xmm0, xmm11)
+ vfmadd231ps(xmm3, xmm0, xmm12)
+ vmovaps(mem(rax, 28*4), xmm0)
+ add(imm(4*16*4), rax) // a += 4*16 (unroll x mr)
+ vfmadd231ps(xmm1, xmm0, xmm13)
+ vbroadcastss(mem(rbx, -12*4), xmm1)
+ vfmadd231ps(xmm2, xmm0, xmm14)
+ vbroadcastss(mem(rbx, -11*4), xmm2)
+ vfmadd231ps(xmm3, xmm0, xmm15)
+ vbroadcastss(mem(rbx, -10*4), xmm3)
+
+
+
+
+ dec(rsi) // i -= 1;
+ jmp(.SLOOPKITER) // jump to beginning of loop.
+
+
+
+
+
+
+ label(.SCONSIDKLEFT)
+
+ mov(%1, rsi) // i = k_left;
+ test(rsi, rsi) // check i via logical AND.
+ je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
+ // else, we prepare to enter k_left loop.
+
+
+ label(.SLOOPKLEFT) // EDGE LOOP
+
+
+ je(.SPOSTACCUM) // if i == 0, we're done.
+
+
+ prefetch(0, mem(rbx, 16+192)) // prefetch b
+
+ // iteration 0
+ vmovaps(mem(rax, -32*4), xmm0)
+ prefetch(0, mem(rax, 384))
+ vfmadd231ps(xmm1, xmm0, xmm4)
+ vfmadd231ps(xmm2, xmm0, xmm5)
+ vfmadd231ps(xmm3, xmm0, xmm6)
+ vmovaps(mem(rax, -28*4), xmm0)
+ vfmadd231ps(xmm1, xmm0, xmm7)
+ vfmadd231ps(xmm2, xmm0, xmm8)
+ vfmadd231ps(xmm3, xmm0, xmm9)
+ vmovaps(mem(rax, -24*4), xmm0)
+ vfmadd231ps(xmm1, xmm0, xmm10)
+ vfmadd231ps(xmm2, xmm0, xmm11)
+ vfmadd231ps(xmm3, xmm0, xmm12)
+ vmovaps(mem(rax, -20*4), xmm0)
+ vfmadd231ps(xmm1, xmm0, xmm13)
+ vbroadcastss(mem(rbx, -9*4), xmm1)
+ vfmadd231ps(xmm2, xmm0, xmm14)
+ vbroadcastss(mem(rbx, -8*4), xmm2)
+ vfmadd231ps(xmm3, xmm0, xmm15)
+ vbroadcastss(mem(rbx, -7*4), xmm3)
+
+
+ add(imm(1*16*4), rax) // a += 1*16 (1 x mr)
+ add(imm(1*3*4), rbx) // b += 1*3 (1 x nr)
+
+
+ dec(rsi) // i -= 1;
+ jmp(.SLOOPKLEFT) // jump to beginning of loop.
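Taken together, the two loops above implement the standard BLIS schedule for this 16x3 sgemm block: the main loop retires eight k steps per pass (rax advances by 4*16*4 bytes twice, rbx by 8*3*4 bytes once), and the edge loop retires the leftover k steps one at a time. A minimal C sketch of the arithmetic being accumulated, assuming packed operands; the function and array names are illustrative only, not part of the kernel:

    /* Hypothetical sketch: the accumulation performed by the sgemm
       loops above. a: packed A, 16 floats per k step; b: packed B,
       3 floats per k step; ab: the 16x3 block held in xmm4..xmm15. */
    static void sgemm_loops_sketch( long k_iter, long k_left,
                                    const float* a, const float* b,
                                    float ab[16][3] )
    {
        for ( long i = 0; i < 8 * k_iter + k_left; i++ )
        {
            // One k step: four 4-wide loads of A, three broadcast
            // elements of B, twelve fused multiply-adds.
            for ( long m = 0; m < 16; m++ )
                for ( long n = 0; n < 3; n++ )
                    ab[ m ][ n ] += a[ m ] * b[ n ];

            a += 16; b += 3;
        }
    }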
+
+
+
+ label(.SPOSTACCUM)
+
+
+ prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c
+ prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c
+ prefetchw0(mem(r11, 0*8)) // prefetch c + 2*cs_c
+
+
+ // xmm4: xmm5: xmm6:
+ // ( ab00 ( ab01 ( ab02
+ // ab10 ab11 ab12
+ // ab20 ab21 ab22
+ // ab30 ) ab31 ) ab32 )
+
+ // xmm7: xmm8: xmm9:
+ // ( ab40 ( ab41 ( ab42
+ // ab50 ab51 ab52
+ // ab60 ab61 ab62
+ // ab70 ) ab71 ) ab72 )
+
+ // xmm10: xmm11: xmm12:
+ // ( ab80 ( ab81 ( ab82
+ // ab90 ab91 ab92
+ // abA0 abA1 abA2
+ // abB0 ) abB1 ) abB2 )
+
+ // xmm13: xmm14: xmm15:
+ // ( abC0 ( abC1 ( abC2
+ // abD0 abD1 abD2
+ // abE0 abE1 abE2
+ // abF0 ) abF1 ) abF2 )
+
+
+
+ mov(%4, rax) // load address of alpha
+ mov(%5, rbx) // load address of beta
+ vbroadcastss(mem(rax), xmm0) // load alpha and duplicate
+ vbroadcastss(mem(rbx), xmm2) // load beta and duplicate
+
+ vmulps(xmm0, xmm4, xmm4) // scale by alpha
+ vmulps(xmm0, xmm5, xmm5)
+ vmulps(xmm0, xmm6, xmm6)
+ vmulps(xmm0, xmm7, xmm7)
+ vmulps(xmm0, xmm8, xmm8)
+ vmulps(xmm0, xmm9, xmm9)
+ vmulps(xmm0, xmm10, xmm10)
+ vmulps(xmm0, xmm11, xmm11)
+ vmulps(xmm0, xmm12, xmm12)
+ vmulps(xmm0, xmm13, xmm13)
+ vmulps(xmm0, xmm14, xmm14)
+ vmulps(xmm0, xmm15, xmm15)
+
+
+
+ prefetch(0, mem(r14)) // prefetch a_next
+ prefetch(0, mem(r14, 64)) // prefetch a_next
+
+
+
+
+ mov(%7, rsi) // load rs_c
+ lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
+
+ //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
+
+ lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
+ lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c;
+
+
+
+ // determine if
+ // c % 32 == 0, AND
+ // 4*cs_c % 32 == 0, AND
+ // rs_c == 1
+ // ie: aligned, ldim aligned, and
+ // column-stored
+
+ cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4.
+ sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
+ test(imm(31), rcx) // set ZF if c & 32 is zero.
+ setz(bh) // bh = ( ZF == 0 ? 1 : 0 );
+ test(imm(31), rdi) // set ZF if (4*cs_c) & 32 is zero.
+ setz(al) // al = ( ZF == 0 ? 1 : 0 );
+ // and(bl,bh) followed by
+ // and(bh,al) will reveal result
+
+ prefetch(0, mem(r15)) // prefetch b_next
+ prefetch(0, mem(r15, 64)) // prefetch b_next
+
+ // now avoid loading C if beta == 0
+
+ vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero.
+ vucomiss(xmm0, xmm2) // set ZF if beta == 0.
+ je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
+
+
+ // check if aligned/column-stored
+ and(bl, bh) // set ZF if bl & bh == 1.
+ and(bh, al) // set ZF if bh & al == 1.
+ jne(.SCOLSTORED) // jump to column storage case + + + + label(.SGENSTORED) + + + vmovlps(mem(rcx), xmm0, xmm0) // load c00:c30 + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmulps(xmm2, xmm0, xmm0) + vaddps(xmm4, xmm0, xmm0) + vmovss(xmm0, mem(rcx)) // store c00:c30 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r13, 1)) + lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; + + + vmovlps(mem(rcx), xmm0, xmm0) // load c40:c70 + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmulps(xmm2, xmm0, xmm0) + vaddps(xmm7, xmm0, xmm0) + vmovss(xmm0, mem(rcx)) // store c40:c70 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r13, 1)) + lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; + + + vmovlps(mem(rcx), xmm0, xmm0) // load c80:cB0 + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmulps(xmm2, xmm0, xmm0) + vaddps(xmm10, xmm0, xmm0) + vmovss(xmm0, mem(rcx)) // store c80:cB0 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r13, 1)) + lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; + + + vmovlps(mem(rcx), xmm0, xmm0) // load cC0:cF0 + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmulps(xmm2, xmm0, xmm0) + vaddps(xmm13, xmm0, xmm0) + vmovss(xmm0, mem(rcx)) // store cC0:cF0 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r13, 1)) + lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; + + + vmovlps(mem(r10), xmm0, xmm0) // load c01:c31 + vmovhps(mem(r10, rsi, 1), xmm0, xmm0) + vmovlps(mem(r10, r12, 1), xmm1, xmm1) + vmovhps(mem(r10, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmulps(xmm2, xmm0, xmm0) + vaddps(xmm5, xmm0, xmm0) + vmovss(xmm0, mem(r10)) // store c01:c31 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, r13, 1)) + lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; + + + vmovlps(mem(r10), xmm0, xmm0) // load c41:c71 + vmovhps(mem(r10, rsi, 1), xmm0, xmm0) + vmovlps(mem(r10, r12, 1), xmm1, xmm1) + vmovhps(mem(r10, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmulps(xmm2, xmm0, xmm0) + vaddps(xmm8, xmm0, xmm0) + vmovss(xmm0, mem(r10)) // store c41:c71 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, r13, 1)) + lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; + + + vmovlps(mem(r10), xmm0, xmm0) // load c81:cB1 + vmovhps(mem(r10, rsi, 1), xmm0, xmm0) + vmovlps(mem(r10, r12, 1), xmm1, xmm1) + 
vmovhps(mem(r10, r13, 1), xmm1, xmm1)
+ vshufps(imm(0x88), xmm1, xmm0, xmm0)
+ vmulps(xmm2, xmm0, xmm0)
+ vaddps(xmm11, xmm0, xmm0)
+ vmovss(xmm0, mem(r10)) // store c81:cB1
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r10, rsi, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r10, r12, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r10, r13, 1))
+ lea(mem(r10, rsi, 4), r10) // c += 4*rs_c;
+
+
+ vmovlps(mem(r10), xmm0, xmm0) // load cC1:cF1
+ vmovhps(mem(r10, rsi, 1), xmm0, xmm0)
+ vmovlps(mem(r10, r12, 1), xmm1, xmm1)
+ vmovhps(mem(r10, r13, 1), xmm1, xmm1)
+ vshufps(imm(0x88), xmm1, xmm0, xmm0)
+ vmulps(xmm2, xmm0, xmm0)
+ vaddps(xmm14, xmm0, xmm0)
+ vmovss(xmm0, mem(r10)) // store cC1:cF1
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r10, rsi, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r10, r12, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r10, r13, 1))
+ lea(mem(r10, rsi, 4), r10) // c += 4*rs_c;
+
+
+ vmovlps(mem(r11), xmm0, xmm0) // load c02:c32
+ vmovhps(mem(r11, rsi, 1), xmm0, xmm0)
+ vmovlps(mem(r11, r12, 1), xmm1, xmm1)
+ vmovhps(mem(r11, r13, 1), xmm1, xmm1)
+ vshufps(imm(0x88), xmm1, xmm0, xmm0)
+ vmulps(xmm2, xmm0, xmm0)
+ vaddps(xmm6, xmm0, xmm0)
+ vmovss(xmm0, mem(r11)) // store c02:c32
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, rsi, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r12, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r13, 1))
+ lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
+
+
+ vmovlps(mem(r11), xmm0, xmm0) // load c42:c72
+ vmovhps(mem(r11, rsi, 1), xmm0, xmm0)
+ vmovlps(mem(r11, r12, 1), xmm1, xmm1)
+ vmovhps(mem(r11, r13, 1), xmm1, xmm1)
+ vshufps(imm(0x88), xmm1, xmm0, xmm0)
+ vmulps(xmm2, xmm0, xmm0)
+ vaddps(xmm9, xmm0, xmm0)
+ vmovss(xmm0, mem(r11)) // store c42:c72
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, rsi, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r12, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r13, 1))
+ lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
+
+
+ vmovlps(mem(r11), xmm0, xmm0) // load c82:cB2
+ vmovhps(mem(r11, rsi, 1), xmm0, xmm0)
+ vmovlps(mem(r11, r12, 1), xmm1, xmm1)
+ vmovhps(mem(r11, r13, 1), xmm1, xmm1)
+ vshufps(imm(0x88), xmm1, xmm0, xmm0)
+ vmulps(xmm2, xmm0, xmm0)
+ vaddps(xmm12, xmm0, xmm0)
+ vmovss(xmm0, mem(r11)) // store c82:cB2
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, rsi, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r12, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r13, 1))
+ lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
+
+
+ vmovlps(mem(r11), xmm0, xmm0) // load cC2:cF2
+ vmovhps(mem(r11, rsi, 1), xmm0, xmm0)
+ vmovlps(mem(r11, r12, 1), xmm1, xmm1)
+ vmovhps(mem(r11, r13, 1), xmm1, xmm1)
+ vshufps(imm(0x88), xmm1, xmm0, xmm0)
+ vmulps(xmm2, xmm0, xmm0)
+ vaddps(xmm15, xmm0, xmm0)
+ vmovss(xmm0, mem(r11)) // store cC2:cF2
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, rsi, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r12, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r13, 1))
+ lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
+
+
+
+ jmp(.SDONE) // jump to end.
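The .SGENSTORED path above handles row strides other than one: each accumulator register holds four consecutive rows of one column of the block, so the kernel gathers four C elements spaced rs_c apart with vmovlps/vmovhps plus vshufps, applies beta and adds the alpha-scaled result, then scatters the vector back one lane at a time through the vmovss/vpermilps rotation. Per four-row strip this is equivalent to the following C sketch (the function name and arguments are illustrative, assuming alpha has already been applied to the accumulator, as it was at .SPOSTACCUM):

    /* Hypothetical sketch of one 4x1 strip of the general store.
       c: first of four elements spaced rs_c apart; ab4: the
       alpha-scaled accumulator strip (e.g. xmm4); beta from xmm2. */
    static void sgemm_genstore_strip( float* c, long rs_c,
                                      float beta, const float ab4[4] )
    {
        for ( long m = 0; m < 4; m++ )  // gather, scale, scatter
            c[ m * rs_c ] = beta * c[ m * rs_c ] + ab4[ m ];
    }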
+ + + + label(.SCOLSTORED) + + + vfmadd231ps(mem(rcx, 0*16), xmm2, xmm4) + vfmadd231ps(mem(rcx, 1*16), xmm2, xmm7) + vfmadd231ps(mem(rcx, 2*16), xmm2, xmm10) + vfmadd231ps(mem(rcx, 3*16), xmm2, xmm13) + + vmovups(xmm4, mem(rcx, 0*16)) + vmovups(xmm7, mem(rcx, 1*16)) + vmovups(xmm10, mem(rcx, 2*16)) + vmovups(xmm13, mem(rcx, 3*16)) + + vfmadd231ps(mem(r10, 0*16), xmm2, xmm5) + vfmadd231ps(mem(r10, 1*16), xmm2, xmm8) + vfmadd231ps(mem(r10, 2*16), xmm2, xmm11) + vfmadd231ps(mem(r10, 3*16), xmm2, xmm14) + + vmovups(xmm5, mem(r10, 0*16)) + vmovups(xmm8, mem(r10, 1*16)) + vmovups(xmm11, mem(r10, 2*16)) + vmovups(xmm14, mem(r10, 3*16)) + + vfmadd231ps(mem(r11, 0*16), xmm2, xmm6) + vfmadd231ps(mem(r11, 1*16), xmm2, xmm9) + vfmadd231ps(mem(r11, 2*16), xmm2, xmm12) + vfmadd231ps(mem(r11, 3*16), xmm2, xmm15) + + vmovups(xmm6, mem(r11, 0*16)) + vmovups(xmm9, mem(r11, 1*16)) + vmovups(xmm12, mem(r11, 2*16)) + vmovups(xmm15, mem(r11, 3*16)) + + + + jmp(.SDONE) // jump to end. + + + + label(.SBETAZERO) + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. + jne(.SCOLSTORBZ) // jump to column storage case + + + + label(.SGENSTORBZ) + + + vmovaps(xmm4, xmm0) + vmovss(xmm0, mem(rcx)) // store c00:c30 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r13, 1)) + lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; + + + vmovaps(xmm7, xmm0) + vmovss(xmm0, mem(rcx)) // store c40:c70 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r13, 1)) + lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; + + + vmovaps(xmm10, xmm0) + vmovss(xmm0, mem(rcx)) // store c80:cB0 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r13, 1)) + lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; + + + vmovaps(xmm13, xmm0) + vmovss(xmm0, mem(rcx)) // store cC0:cF0 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(rcx, r13, 1)) + lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; + + + vmovaps(xmm5, xmm0) + vmovss(xmm0, mem(r10)) // store c01:c31 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, r13, 1)) + lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; + + + vmovaps(xmm8, xmm0) + vmovss(xmm0, mem(r10)) // store c41:c71 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, r13, 1)) + lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; + + + vmovaps(xmm11, xmm0) + vmovss(xmm0, mem(r10)) // store c81:cB1 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, rsi, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, r13, 1)) + lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; + + + vmovaps(xmm14, xmm0) + vmovss(xmm0, mem(r10)) // store cC1:cF1 + vpermilps(imm(0x39), xmm0, xmm0) + vmovss(xmm0, mem(r10, rsi, 
1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r10, r12, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r10, r13, 1))
+ lea(mem(r10, rsi, 4), r10) // c += 4*rs_c;
+
+
+ vmovaps(xmm6, xmm0)
+ vmovss(xmm0, mem(r11)) // store c02:c32
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, rsi, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r12, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r13, 1))
+ lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
+
+
+ vmovaps(xmm9, xmm0)
+ vmovss(xmm0, mem(r11)) // store c42:c72
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, rsi, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r12, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r13, 1))
+ lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
+
+
+ vmovaps(xmm12, xmm0)
+ vmovss(xmm0, mem(r11)) // store c82:cB2
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, rsi, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r12, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r13, 1))
+ lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
+
+
+ vmovaps(xmm15, xmm0)
+ vmovss(xmm0, mem(r11)) // store cC2:cF2
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, rsi, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r12, 1))
+ vpermilps(imm(0x39), xmm0, xmm0)
+ vmovss(xmm0, mem(r11, r13, 1))
+ lea(mem(r11, rsi, 4), r11) // c += 4*rs_c;
+
+
+
+ jmp(.SDONE) // jump to end.
+
+
+
+ label(.SCOLSTORBZ)
+
+
+ vmovups(xmm4, mem(rcx, 0*16))
+ vmovups(xmm7, mem(rcx, 1*16))
+ vmovups(xmm10, mem(rcx, 2*16))
+ vmovups(xmm13, mem(rcx, 3*16))
+
+ vmovups(xmm5, mem(r10, 0*16))
+ vmovups(xmm8, mem(r10, 1*16))
+ vmovups(xmm11, mem(r10, 2*16))
+ vmovups(xmm14, mem(r10, 3*16))
+
+ vmovups(xmm6, mem(r11, 0*16))
+ vmovups(xmm9, mem(r11, 1*16))
+ vmovups(xmm12, mem(r11, 2*16))
+ vmovups(xmm15, mem(r11, 3*16))
+
+
+
+
+
+ label(.SDONE)
+
 : // output operands (none)
 : // input operands
@@ -932,676 +935,676 @@ void bli_dgemm_piledriver_asm_8x3
 __asm__ volatile
 (
- " \n\t"
- " \n\t"
- "movq %2, %%rax \n\t" // load address of a.
- "movq %3, %%rbx \n\t" // load address of b.
- "movq %9, %%r15 \n\t" // load address of b_next.
- "movq %10, %%r14 \n\t" // load address of a_next.
- " \n\t"
- "prefetcht0 128(%%rbx) \n\t" // prefetch b
- "prefetcht0 64+128(%%rbx) \n\t" // prefetch b
- "prefetcht0 128+128(%%rbx) \n\t" // prefetch b
- " \n\t"
- "addq $16 * 8, %%rax \n\t"
- "addq $12 * 8, %%rbx \n\t"
- " \n\t"
- "movq %6, %%rcx \n\t" // load address of c
- "movq %8, %%rdi \n\t" // load cs_c
- "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double)
- "leaq (%%rcx,%%rdi,1), %%r10 \n\t" // load address of c + 1*cs_c;
- "leaq (%%rcx,%%rdi,2), %%r11 \n\t" // load address of c + 2*cs_c;
- " \n\t"
- "vmovddup -12 * 8(%%rbx), %%xmm1 \n\t"
- "vmovddup -11 * 8(%%rbx), %%xmm2 \n\t"
- "vmovddup -10 * 8(%%rbx), %%xmm3 \n\t"
- " \n\t"
- "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t"
- "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t"
- "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t"
- "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
- "vxorpd %%xmm8, %%xmm8, %%xmm8 \n\t"
- "vxorpd %%xmm9, %%xmm9, %%xmm9 \n\t"
- "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t"
- "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t"
- "vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t"
- "vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t"
- "vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t"
- "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t"
- " \n\t"
- " \n\t"
- " \n\t"
- "movq %0, %%rsi \n\t" // i = k_iter;
- "testq %%rsi, %%rsi \n\t" // check i via logical AND.
- "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to k_left code. - " \n\t" - " \n\t" - "prefetcht0 -32+256(%%rbx) \n\t" // prefetch b - "prefetcht0 32+256(%%rbx) \n\t" // prefetch b - " \n\t" - " \n\t" // iteration 0 - "vmovaps -8 * 16(%%rax), %%xmm0 \n\t" - "prefetcht0 384(%%rax) \n\t" // prefetch a - "vfmadd231pd %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps -7 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps -6 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps -5 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm13 \n\t" - "vmovddup -9 * 8(%%rbx), %%xmm1 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm14 \n\t" - "vmovddup -8 * 8(%%rbx), %%xmm2 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vmovaps -4 * 16(%%rax), %%xmm0 \n\t" - "prefetcht0 64+384(%%rax) \n\t" // prefetch a - "vmovddup -7 * 8(%%rbx), %%xmm3 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps -3 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps -2 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps -1 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm13 \n\t" - "vmovddup -6 * 8(%%rbx), %%xmm1 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm14 \n\t" - "vmovddup -5 * 8(%%rbx), %%xmm2 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 2 - "vmovaps 0 * 16(%%rax), %%xmm0 \n\t" - "prefetcht0 128+384(%%rax) \n\t" // prefetch a - "vmovddup -4 * 8(%%rbx), %%xmm3 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps 1 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps 2 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps 3 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm13 \n\t" - "vmovddup -3 * 8(%%rbx), %%xmm1 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm14 \n\t" - "vmovddup -2 * 8(%%rbx), %%xmm2 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vmovaps 4 * 16(%%rax), %%xmm0 \n\t" - "prefetcht0 192+384(%%rax) \n\t" // prefetch a - "vmovddup -1 * 8(%%rbx), %%xmm3 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps 5 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps 6 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm10 \n\t" - 
"vfmadd231pd %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps 7 * 16(%%rax), %%xmm0 \n\t" - "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr) - "vfmadd231pd %%xmm1, %%xmm0, %%xmm13 \n\t" - "vmovddup 0 * 8(%%rbx), %%xmm1 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm14 \n\t" - "vmovddup 1 * 8(%%rbx), %%xmm2 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 4 - "vmovaps -8 * 16(%%rax), %%xmm0 \n\t" - "prefetcht0 384(%%rax) \n\t" // prefetch a - "vmovddup 2 * 8(%%rbx), %%xmm3 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps -7 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps -6 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps -5 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm13 \n\t" - "vmovddup 3 * 8(%%rbx), %%xmm1 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm14 \n\t" - "vmovddup 4 * 8(%%rbx), %%xmm2 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - "prefetcht0 96+256(%%rbx) \n\t" // prefetch b - " \n\t" - " \n\t" // iteration 5 - "vmovaps -4 * 16(%%rax), %%xmm0 \n\t" - "prefetcht0 64+384(%%rax) \n\t" // prefetch a - "vmovddup 5 * 8(%%rbx), %%xmm3 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps -3 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps -2 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps -1 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm13 \n\t" - "vmovddup 6 * 8(%%rbx), %%xmm1 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm14 \n\t" - "vmovddup 7 * 8(%%rbx), %%xmm2 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 6 - "vmovaps 0 * 16(%%rax), %%xmm0 \n\t" - "prefetcht0 128+384(%%rax) \n\t" // prefetch a - "vmovddup 8 * 8(%%rbx), %%xmm3 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps 1 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps 2 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps 3 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm13 \n\t" - "vmovddup 9 * 8(%%rbx), %%xmm1 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm14 \n\t" - "vmovddup 10 * 8(%%rbx), %%xmm2 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 7 - "vmovaps 4 * 16(%%rax), %%xmm0 \n\t" - "prefetcht0 192+384(%%rax) \n\t" // prefetch a - "vmovddup 11 * 8(%%rbx), %%xmm3 \n\t" - "addq $8 * 3 * 8, %%rbx \n\t" // b += 8*3 (unroll x nr) - "vfmadd231pd %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps 5 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd 
%%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps 6 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps 7 * 16(%%rax), %%xmm0 \n\t" - "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr) - "vfmadd231pd %%xmm1, %%xmm0, %%xmm13 \n\t" - "vmovddup -12 * 8(%%rbx), %%xmm1 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm14 \n\t" - "vmovddup -11 * 8(%%rbx), %%xmm2 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm15 \n\t" - "vmovddup -10 * 8(%%rbx), %%xmm3 \n\t" - " \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jmp .DLOOPKITER \n\t" // jump to beginning of loop. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done. - " \n\t" // else, we prepare to - " \n\t" // enter k_left loop. - " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - " \n\t" - "je .DPOSTACCUM \n\t" // if i == 0, we're done. - " \n\t" - " \n\t" // iteration 0 - "vmovaps -8 * 16(%%rax), %%xmm0 \n\t" - "prefetcht0 512(%%rax) \n\t" // prefetch a - "vfmadd231pd %%xmm1, %%xmm0, %%xmm4 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm5 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm6 \n\t" - "vmovaps -7 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm7 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm8 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm9 \n\t" - "vmovaps -6 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm10 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm11 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm12 \n\t" - "vmovaps -5 * 16(%%rax), %%xmm0 \n\t" - "vfmadd231pd %%xmm1, %%xmm0, %%xmm13 \n\t" - "vmovddup -9 * 8(%%rbx), %%xmm1 \n\t" - "vfmadd231pd %%xmm2, %%xmm0, %%xmm14 \n\t" - "vmovddup -8 * 8(%%rbx), %%xmm2 \n\t" - "vfmadd231pd %%xmm3, %%xmm0, %%xmm15 \n\t" - "vmovddup -7 * 8(%%rbx), %%xmm3 \n\t" - " \n\t" - " \n\t" - "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (1 x mr) - "addq $1 * 3 * 8, %%rbx \n\t" // b += 1*3 (1 x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jmp .DLOOPKLEFT \n\t" // jump to beginning of loop. 
- " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - "prefetchw 0 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetchw 0 * 8(%%r10) \n\t" // prefetch c + 1*cs_c - "prefetchw 0 * 8(%%r11) \n\t" // prefetch c + 2*cs_c - " \n\t" - " \n\t" - " \n\t" // xmm4: xmm5: xmm6: - " \n\t" // ( ab00 ( ab01 ( ab02 - " \n\t" // ab10 ) ab11 ) ab12 ) - " \n\t" // - " \n\t" // xmm7: xmm8: xmm9: - " \n\t" // ( ab20 ( ab21 ( ab22 - " \n\t" // ab30 ) ab31 ) ab32 ) - " \n\t" // - " \n\t" // xmm10: xmm11: xmm12: - " \n\t" // ( ab40 ( ab41 ( ab42 - " \n\t" // ab50 ) ab51 ) ab52 ) - " \n\t" // - " \n\t" // xmm13: xmm14: xmm15: - " \n\t" // ( ab60 ( ab61 ( ab62 - " \n\t" // ab70 ) ab71 ) ab72 ) - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vmovddup (%%rax), %%xmm0 \n\t" // load alpha and duplicate - "vmovddup (%%rbx), %%xmm2 \n\t" // load beta and duplicate - " \n\t" - "vmulpd %%xmm0, %%xmm4, %%xmm4 \n\t" // scale by alpha - "vmulpd %%xmm0, %%xmm5, %%xmm5 \n\t" - "vmulpd %%xmm0, %%xmm6, %%xmm6 \n\t" - "vmulpd %%xmm0, %%xmm7, %%xmm7 \n\t" - "vmulpd %%xmm0, %%xmm8, %%xmm8 \n\t" - "vmulpd %%xmm0, %%xmm9, %%xmm9 \n\t" - "vmulpd %%xmm0, %%xmm10, %%xmm10 \n\t" - "vmulpd %%xmm0, %%xmm11, %%xmm11 \n\t" - "vmulpd %%xmm0, %%xmm12, %%xmm12 \n\t" - "vmulpd %%xmm0, %%xmm13, %%xmm13 \n\t" - "vmulpd %%xmm0, %%xmm14, %%xmm14 \n\t" - "vmulpd %%xmm0, %%xmm15, %%xmm15 \n\t" - " \n\t" - " \n\t" - "prefetcht0 (%%r14) \n\t" // prefetch a_next - "prefetcht0 64(%%r14) \n\t" // prefetch a_next - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; - " \n\t" - "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c; - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - "prefetcht0 (%%r15) \n\t" // prefetch b_next - "prefetcht0 64(%%r15) \n\t" // prefetch b_next - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" // set xmm0 to zero. - "vucomisd %%xmm0, %%xmm2 \n\t" // set ZF if beta == 0. - "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
- "je .DGENSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORED: \n\t" - " \n\t" - " \n\t" // xmm4: xmm5: xmm6: - " \n\t" // ( ab00 ( ab01 ( ab02 - " \n\t" // ab10 ) ab11 ) ab12 ) - " \n\t" // - " \n\t" // xmm7: xmm8: xmm9: - " \n\t" // ( ab20 ( ab21 ( ab22 - " \n\t" // ab30 ) ab31 ) ab32 ) - " \n\t" // - " \n\t" // xmm10: xmm11: xmm12: - " \n\t" // ( ab40 ( ab41 ( ab42 - " \n\t" // ab50 ) ab51 ) ab52 ) - " \n\t" // - " \n\t" // xmm13: xmm14: xmm15: - " \n\t" // ( ab60 ( ab61 ( ab62 - " \n\t" // ab70 ) ab71 ) ab72 ) - " \n\t" - " \n\t" - "vfmadd231pd 0 * 16(%%rcx), %%xmm2, %%xmm4 \n\t" - "vfmadd231pd 1 * 16(%%rcx), %%xmm2, %%xmm7 \n\t" - "vfmadd231pd 2 * 16(%%rcx), %%xmm2, %%xmm10 \n\t" - "vfmadd231pd 3 * 16(%%rcx), %%xmm2, %%xmm13 \n\t" - " \n\t" - "vfmadd231pd 0 * 16(%%r10), %%xmm2, %%xmm5 \n\t" - "vfmadd231pd 1 * 16(%%r10), %%xmm2, %%xmm8 \n\t" - "vfmadd231pd 2 * 16(%%r10), %%xmm2, %%xmm11 \n\t" - "vfmadd231pd 3 * 16(%%r10), %%xmm2, %%xmm14 \n\t" - " \n\t" - "vfmadd231pd 0 * 16(%%r11), %%xmm2, %%xmm6 \n\t" - "vfmadd231pd 1 * 16(%%r11), %%xmm2, %%xmm9 \n\t" - "vfmadd231pd 2 * 16(%%r11), %%xmm2, %%xmm12 \n\t" - "vfmadd231pd 3 * 16(%%r11), %%xmm2, %%xmm15 \n\t" - " \n\t" - " \n\t" - "vmovups %%xmm4, 0 * 16(%%rcx) \n\t" - "vmovups %%xmm7, 1 * 16(%%rcx) \n\t" - "vmovups %%xmm10, 2 * 16(%%rcx) \n\t" - "vmovups %%xmm13, 3 * 16(%%rcx) \n\t" - " \n\t" - "vmovups %%xmm5, 0 * 16(%%r10) \n\t" - "vmovups %%xmm8, 1 * 16(%%r10) \n\t" - "vmovups %%xmm11, 2 * 16(%%r10) \n\t" - "vmovups %%xmm14, 3 * 16(%%r10) \n\t" - " \n\t" - "vmovups %%xmm6, 0 * 16(%%r11) \n\t" - "vmovups %%xmm9, 1 * 16(%%r11) \n\t" - "vmovups %%xmm12, 2 * 16(%%r11) \n\t" - "vmovups %%xmm15, 3 * 16(%%r11) \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + mov(%9, r15) // load address of b_next. + mov(%10, r14) // load address of a_next. + + prefetch(0, mem(rbx, 128)) // prefetch b + prefetch(0, mem(rbx, 64+128)) // prefetch b + prefetch(0, mem(rbx, 128+128)) // prefetch b + + add(imm(16*8), rax) + add(imm(12*8), rbx) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; + lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c; + + vmovddup(mem(rbx, -12*8), xmm1) + vmovddup(mem(rbx, -11*8), xmm2) + vmovddup(mem(rbx, -10*8), xmm3) + + vxorpd(xmm4, xmm4, xmm4) + vxorpd(xmm5, xmm5, xmm5) + vxorpd(xmm6, xmm6, xmm6) + vxorpd(xmm7, xmm7, xmm7) + vxorpd(xmm8, xmm8, xmm8) + vxorpd(xmm9, xmm9, xmm9) + vxorpd(xmm10, xmm10, xmm10) + vxorpd(xmm11, xmm11, xmm11) + vxorpd(xmm12, xmm12, xmm12) + vxorpd(xmm13, xmm13, xmm13) + vxorpd(xmm14, xmm14, xmm14) + vxorpd(xmm15, xmm15, xmm15) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.DLOOPKITER) // MAIN LOOP + + + je(.DCONSIDKLEFT) // if i == 0, jump to k_left code. 
+ + + prefetch(0, mem(rbx, -32+256)) // prefetch b + prefetch(0, mem(rbx, 32+256)) // prefetch b + + // iteration 0 + vmovaps(mem(rax, -8*16), xmm0) + prefetch(0, mem(rax, 384)) // prefetch a + vfmadd231pd(xmm1, xmm0, xmm4) + vfmadd231pd(xmm2, xmm0, xmm5) + vfmadd231pd(xmm3, xmm0, xmm6) + vmovaps(mem(rax, -7*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm7) + vfmadd231pd(xmm2, xmm0, xmm8) + vfmadd231pd(xmm3, xmm0, xmm9) + vmovaps(mem(rax, -6*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm10) + vfmadd231pd(xmm2, xmm0, xmm11) + vfmadd231pd(xmm3, xmm0, xmm12) + vmovaps(mem(rax, -5*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm13) + vmovddup(mem(rbx, -9*8), xmm1) + vfmadd231pd(xmm2, xmm0, xmm14) + vmovddup(mem(rbx, -8*8), xmm2) + vfmadd231pd(xmm3, xmm0, xmm15) + + // iteration 1 + vmovaps(mem(rax, -4*16), xmm0) + prefetch(0, mem(rax, 64+384)) // prefetch a + vmovddup(mem(rbx, -7*8), xmm3) + vfmadd231pd(xmm1, xmm0, xmm4) + vfmadd231pd(xmm2, xmm0, xmm5) + vfmadd231pd(xmm3, xmm0, xmm6) + vmovaps(mem(rax, -3*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm7) + vfmadd231pd(xmm2, xmm0, xmm8) + vfmadd231pd(xmm3, xmm0, xmm9) + vmovaps(mem(rax, -2*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm10) + vfmadd231pd(xmm2, xmm0, xmm11) + vfmadd231pd(xmm3, xmm0, xmm12) + vmovaps(mem(rax, -1*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm13) + vmovddup(mem(rbx, -6*8), xmm1) + vfmadd231pd(xmm2, xmm0, xmm14) + vmovddup(mem(rbx, -5*8), xmm2) + vfmadd231pd(xmm3, xmm0, xmm15) + + // iteration 2 + vmovaps(mem(rax, 0*16), xmm0) + prefetch(0, mem(rax, 128+384)) // prefetch a + vmovddup(mem(rbx, -4*8), xmm3) + vfmadd231pd(xmm1, xmm0, xmm4) + vfmadd231pd(xmm2, xmm0, xmm5) + vfmadd231pd(xmm3, xmm0, xmm6) + vmovaps(mem(rax, 1*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm7) + vfmadd231pd(xmm2, xmm0, xmm8) + vfmadd231pd(xmm3, xmm0, xmm9) + vmovaps(mem(rax, 2*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm10) + vfmadd231pd(xmm2, xmm0, xmm11) + vfmadd231pd(xmm3, xmm0, xmm12) + vmovaps(mem(rax, 3*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm13) + vmovddup(mem(rbx, -3*8), xmm1) + vfmadd231pd(xmm2, xmm0, xmm14) + vmovddup(mem(rbx, -2*8), xmm2) + vfmadd231pd(xmm3, xmm0, xmm15) + + // iteration 3 + vmovaps(mem(rax, 4*16), xmm0) + prefetch(0, mem(rax, 192+384)) // prefetch a + vmovddup(mem(rbx, -1*8), xmm3) + vfmadd231pd(xmm1, xmm0, xmm4) + vfmadd231pd(xmm2, xmm0, xmm5) + vfmadd231pd(xmm3, xmm0, xmm6) + vmovaps(mem(rax, 5*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm7) + vfmadd231pd(xmm2, xmm0, xmm8) + vfmadd231pd(xmm3, xmm0, xmm9) + vmovaps(mem(rax, 6*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm10) + vfmadd231pd(xmm2, xmm0, xmm11) + vfmadd231pd(xmm3, xmm0, xmm12) + vmovaps(mem(rax, 7*16), xmm0) + add(imm(4*8*8), rax) // a += 4*8 (unroll x mr) + vfmadd231pd(xmm1, xmm0, xmm13) + vmovddup(mem(rbx, 0*8), xmm1) + vfmadd231pd(xmm2, xmm0, xmm14) + vmovddup(mem(rbx, 1*8), xmm2) + vfmadd231pd(xmm3, xmm0, xmm15) + + // iteration 4 + vmovaps(mem(rax, -8*16), xmm0) + prefetch(0, mem(rax, 384)) // prefetch a + vmovddup(mem(rbx, 2*8), xmm3) + vfmadd231pd(xmm1, xmm0, xmm4) + vfmadd231pd(xmm2, xmm0, xmm5) + vfmadd231pd(xmm3, xmm0, xmm6) + vmovaps(mem(rax, -7*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm7) + vfmadd231pd(xmm2, xmm0, xmm8) + vfmadd231pd(xmm3, xmm0, xmm9) + vmovaps(mem(rax, -6*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm10) + vfmadd231pd(xmm2, xmm0, xmm11) + vfmadd231pd(xmm3, xmm0, xmm12) + vmovaps(mem(rax, -5*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm13) + vmovddup(mem(rbx, 3*8), xmm1) + vfmadd231pd(xmm2, xmm0, xmm14) + vmovddup(mem(rbx, 4*8), xmm2) + vfmadd231pd(xmm3, xmm0, xmm15) + + prefetch(0, mem(rbx, 
96+256)) // prefetch b + + // iteration 5 + vmovaps(mem(rax, -4*16), xmm0) + prefetch(0, mem(rax, 64+384)) // prefetch a + vmovddup(mem(rbx, 5*8), xmm3) + vfmadd231pd(xmm1, xmm0, xmm4) + vfmadd231pd(xmm2, xmm0, xmm5) + vfmadd231pd(xmm3, xmm0, xmm6) + vmovaps(mem(rax, -3*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm7) + vfmadd231pd(xmm2, xmm0, xmm8) + vfmadd231pd(xmm3, xmm0, xmm9) + vmovaps(mem(rax, -2*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm10) + vfmadd231pd(xmm2, xmm0, xmm11) + vfmadd231pd(xmm3, xmm0, xmm12) + vmovaps(mem(rax, -1*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm13) + vmovddup(mem(rbx, 6*8), xmm1) + vfmadd231pd(xmm2, xmm0, xmm14) + vmovddup(mem(rbx, 7*8), xmm2) + vfmadd231pd(xmm3, xmm0, xmm15) + + + // iteration 6 + vmovaps(mem(rax, 0*16), xmm0) + prefetch(0, mem(rax, 128+384)) // prefetch a + vmovddup(mem(rbx, 8*8), xmm3) + vfmadd231pd(xmm1, xmm0, xmm4) + vfmadd231pd(xmm2, xmm0, xmm5) + vfmadd231pd(xmm3, xmm0, xmm6) + vmovaps(mem(rax, 1*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm7) + vfmadd231pd(xmm2, xmm0, xmm8) + vfmadd231pd(xmm3, xmm0, xmm9) + vmovaps(mem(rax, 2*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm10) + vfmadd231pd(xmm2, xmm0, xmm11) + vfmadd231pd(xmm3, xmm0, xmm12) + vmovaps(mem(rax, 3*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm13) + vmovddup(mem(rbx, 9*8), xmm1) + vfmadd231pd(xmm2, xmm0, xmm14) + vmovddup(mem(rbx, 10*8), xmm2) + vfmadd231pd(xmm3, xmm0, xmm15) + + // iteration 7 + vmovaps(mem(rax, 4*16), xmm0) + prefetch(0, mem(rax, 192+384)) // prefetch a + vmovddup(mem(rbx, 11*8), xmm3) + add(imm(8*3*8), rbx) // b += 8*3 (unroll x nr) + vfmadd231pd(xmm1, xmm0, xmm4) + vfmadd231pd(xmm2, xmm0, xmm5) + vfmadd231pd(xmm3, xmm0, xmm6) + vmovaps(mem(rax, 5*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm7) + vfmadd231pd(xmm2, xmm0, xmm8) + vfmadd231pd(xmm3, xmm0, xmm9) + vmovaps(mem(rax, 6*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm10) + vfmadd231pd(xmm2, xmm0, xmm11) + vfmadd231pd(xmm3, xmm0, xmm12) + vmovaps(mem(rax, 7*16), xmm0) + add(imm(4*8*8), rax) // a += 4*8 (unroll x mr) + vfmadd231pd(xmm1, xmm0, xmm13) + vmovddup(mem(rbx, -12*8), xmm1) + vfmadd231pd(xmm2, xmm0, xmm14) + vmovddup(mem(rbx, -11*8), xmm2) + vfmadd231pd(xmm3, xmm0, xmm15) + vmovddup(mem(rbx, -10*8), xmm3) + + + + dec(rsi) // i -= 1; + jmp(.DLOOPKITER) // jump to beginning of loop. + + + + + + + label(.DCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done. + // else, we prepare to + // enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + + + je(.DPOSTACCUM) // if i == 0, we're done. + + // iteration 0 + vmovaps(mem(rax, -8*16), xmm0) + prefetch(0, mem(rax, 512)) // prefetch a + vfmadd231pd(xmm1, xmm0, xmm4) + vfmadd231pd(xmm2, xmm0, xmm5) + vfmadd231pd(xmm3, xmm0, xmm6) + vmovaps(mem(rax, -7*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm7) + vfmadd231pd(xmm2, xmm0, xmm8) + vfmadd231pd(xmm3, xmm0, xmm9) + vmovaps(mem(rax, -6*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm10) + vfmadd231pd(xmm2, xmm0, xmm11) + vfmadd231pd(xmm3, xmm0, xmm12) + vmovaps(mem(rax, -5*16), xmm0) + vfmadd231pd(xmm1, xmm0, xmm13) + vmovddup(mem(rbx, -9*8), xmm1) + vfmadd231pd(xmm2, xmm0, xmm14) + vmovddup(mem(rbx, -8*8), xmm2) + vfmadd231pd(xmm3, xmm0, xmm15) + vmovddup(mem(rbx, -7*8), xmm3) + + + add(imm(1*8*8), rax) // a += 1*8 (1 x mr) + add(imm(1*3*8), rbx) // b += 1*3 (1 x nr) + + + dec(rsi) // i -= 1; + jmp(.DLOOPKLEFT) // jump to beginning of loop. 
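As in the single-precision kernel, each pass of the .DLOOPKITER loop above retires eight 8x3 rank-1 updates, and .DLOOPKLEFT retires the remainder one k step at a time; every k step issues four 16-byte loads of A into xmm0 and twelve FMAs against the duplicated B values in xmm1..xmm3. One k step amounts to this C sketch (hypothetical names, for illustration only):

    /* Hypothetical sketch of one k step of the dgemm loops above.
       a: 8 packed doubles; b: 3 doubles (loaded via vmovddup);
       ab: the 8x3 block held two rows per register in xmm4..xmm15. */
    static void dgemm_rank1_step( const double* a, const double* b,
                                  double ab[8][3] )
    {
        for ( long m = 0; m < 8; m++ )
            for ( long n = 0; n < 3; n++ )
                ab[ m ][ n ] += a[ m ] * b[ n ];
    }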
+ + + + label(.DPOSTACCUM) + + prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c + prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c + prefetchw0(mem(r11, 0*8)) // prefetch c + 2*cs_c + + + // xmm4: xmm5: xmm6: + // ( ab00 ( ab01 ( ab02 + // ab10 ) ab11 ) ab12 ) + // + // xmm7: xmm8: xmm9: + // ( ab20 ( ab21 ( ab22 + // ab30 ) ab31 ) ab32 ) + // + // xmm10: xmm11: xmm12: + // ( ab40 ( ab41 ( ab42 + // ab50 ) ab51 ) ab52 ) + // + // xmm13: xmm14: xmm15: + // ( ab60 ( ab61 ( ab62 + // ab70 ) ab71 ) ab72 ) + + + + + mov(%4, rax) // load address of alpha + mov(%5, rbx) // load address of beta + vmovddup(mem(rax), xmm0) // load alpha and duplicate + vmovddup(mem(rbx), xmm2) // load beta and duplicate + + vmulpd(xmm0, xmm4, xmm4) // scale by alpha + vmulpd(xmm0, xmm5, xmm5) + vmulpd(xmm0, xmm6, xmm6) + vmulpd(xmm0, xmm7, xmm7) + vmulpd(xmm0, xmm8, xmm8) + vmulpd(xmm0, xmm9, xmm9) + vmulpd(xmm0, xmm10, xmm10) + vmulpd(xmm0, xmm11, xmm11) + vmulpd(xmm0, xmm12, xmm12) + vmulpd(xmm0, xmm13, xmm13) + vmulpd(xmm0, xmm14, xmm14) + vmulpd(xmm0, xmm15, xmm15) + + + prefetch(0, mem(r14)) // prefetch a_next + prefetch(0, mem(r14, 64)) // prefetch a_next + + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; + + + + // determine if + // c % 32 == 0, AND + // 8*cs_c % 32 == 0, AND + // rs_c == 1 + // ie: aligned, ldim aligned, and + // column-stored + + cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8. + sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); + test(imm(31), rcx) // set ZF if c & 32 is zero. + setz(bh) // bh = ( ZF == 0 ? 1 : 0 ); + test(imm(31), rdi) // set ZF if (8*cs_c) & 32 is zero. + setz(al) // al = ( ZF == 0 ? 1 : 0 ); + // and(bl,bh) followed by + // and(bh,al) will reveal result + + prefetch(0, mem(r15)) // prefetch b_next + prefetch(0, mem(r15, 64)) // prefetch b_next + + // now avoid loading C if beta == 0 + + vxorpd(xmm0, xmm0, xmm0) // set xmm0 to zero. + vucomisd(xmm0, xmm2) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. 
+ je(.DGENSTORED) // jump to general storage case
+
+
+
+ label(.DCOLSTORED)
+
+ // xmm4: xmm5: xmm6:
+ // ( ab00 ( ab01 ( ab02
+ // ab10 ) ab11 ) ab12 )
+ //
+ // xmm7: xmm8: xmm9:
+ // ( ab20 ( ab21 ( ab22
+ // ab30 ) ab31 ) ab32 )
+ //
+ // xmm10: xmm11: xmm12:
+ // ( ab40 ( ab41 ( ab42
+ // ab50 ) ab51 ) ab52 )
+ //
+ // xmm13: xmm14: xmm15:
+ // ( ab60 ( ab61 ( ab62
+ // ab70 ) ab71 ) ab72 )
+
+
+ vfmadd231pd(mem(rcx, 0*16), xmm2, xmm4)
+ vfmadd231pd(mem(rcx, 1*16), xmm2, xmm7)
+ vfmadd231pd(mem(rcx, 2*16), xmm2, xmm10)
+ vfmadd231pd(mem(rcx, 3*16), xmm2, xmm13)
+
+ vfmadd231pd(mem(r10, 0*16), xmm2, xmm5)
+ vfmadd231pd(mem(r10, 1*16), xmm2, xmm8)
+ vfmadd231pd(mem(r10, 2*16), xmm2, xmm11)
+ vfmadd231pd(mem(r10, 3*16), xmm2, xmm14)
+
+ vfmadd231pd(mem(r11, 0*16), xmm2, xmm6)
+ vfmadd231pd(mem(r11, 1*16), xmm2, xmm9)
+ vfmadd231pd(mem(r11, 2*16), xmm2, xmm12)
+ vfmadd231pd(mem(r11, 3*16), xmm2, xmm15)
+
+
+ vmovups(xmm4, mem(rcx, 0*16))
+ vmovups(xmm7, mem(rcx, 1*16))
+ vmovups(xmm10, mem(rcx, 2*16))
+ vmovups(xmm13, mem(rcx, 3*16))
+
+ vmovups(xmm5, mem(r10, 0*16))
+ vmovups(xmm8, mem(r10, 1*16))
+ vmovups(xmm11, mem(r10, 2*16))
+ vmovups(xmm14, mem(r10, 3*16))
+
+ vmovups(xmm6, mem(r11, 0*16))
+ vmovups(xmm9, mem(r11, 1*16))
+ vmovups(xmm12, mem(r11, 2*16))
+ vmovups(xmm15, mem(r11, 3*16))
+
+
+
+ /*
- "vmovupd (%%rcx), %%xmm0 \n\t" // load c00:c10
- "vmovupd (%%rcx,%%r12), %%xmm1 \n\t" // load c20:c30
- "vfmadd231pd %%xmm2, %%xmm0, %%xmm4 \n\t"
- "vfmadd231pd %%xmm2, %%xmm1, %%xmm7 \n\t"
- "vmovupd %%xmm4, (%%rcx) \n\t" // store c00:c10
- "vmovupd %%xmm7, (%%rcx,%%r12) \n\t" // store c20:c30
- "addq %%rdi, %%rcx \n\t"
- " \n\t"
- "vmovupd (%%rdx), %%xmm0 \n\t" // load c40:c50
- "vmovupd (%%rdx,%%r12), %%xmm1 \n\t" // load c60:c70
- "vfmadd213pd %%xmm10, %%xmm2, %%xmm0 \n\t"
- "vfmadd213pd %%xmm13, %%xmm2, %%xmm1 \n\t"
- "vmovupd %%xmm0, (%%rdx) \n\t" // store c40:c50
- "vmovupd %%xmm1, (%%rdx,%%r12) \n\t" // store c60:c70
- "addq %%rdi, %%rdx \n\t"
- " \n\t"
- " \n\t"
- "vmovupd (%%rcx), %%xmm0 \n\t" // load c01:c11
- "vmovupd (%%rcx,%%r12), %%xmm1 \n\t" // load c21:c31
- "vfmadd213pd %%xmm5, %%xmm2, %%xmm0 \n\t"
- "vfmadd213pd %%xmm8, %%xmm2, %%xmm1 \n\t"
- "vmovupd %%xmm0, (%%rcx) \n\t" // store c01:c11
- "vmovupd %%xmm1, (%%rcx,%%r12) \n\t" // store c21:c31
- "addq %%rdi, %%rcx \n\t"
- " \n\t"
- "vmovupd (%%rdx), %%xmm0 \n\t" // load c41:c51
- "vmovupd (%%rdx,%%r12), %%xmm1 \n\t" // load c61:c71
- "vfmadd213pd %%xmm11, %%xmm2, %%xmm0 \n\t"
- "vfmadd213pd %%xmm14, %%xmm2, %%xmm1 \n\t"
- "vmovupd %%xmm0, (%%rdx) \n\t" // store c41:c51
- "vmovupd %%xmm1, (%%rdx,%%r12) \n\t" // store c61:c71
- "addq %%rdi, %%rdx \n\t"
- " \n\t"
- " \n\t"
- "vmovupd (%%rcx), %%xmm0 \n\t" // load c02:c12
- "vmovupd (%%rcx,%%r12), %%xmm1 \n\t" // load c22:c32
- "vfmadd213pd %%xmm6, %%xmm2, %%xmm0 \n\t"
- "vfmadd213pd %%xmm9, %%xmm2, %%xmm1 \n\t"
- "vmovupd %%xmm0, (%%rcx) \n\t" // store c02:c12
- "vmovupd %%xmm1, (%%rcx,%%r12) \n\t" // store c22:c32
- " \n\t"
- "vmovupd (%%rdx), %%xmm0 \n\t" // load c42:c52
- "vmovupd (%%rdx,%%r12), %%xmm1 \n\t" // load c62:c72
- "vfmadd213pd %%xmm12, %%xmm2, %%xmm0 \n\t"
- "vfmadd213pd %%xmm15, %%xmm2, %%xmm1 \n\t"
- "vmovupd %%xmm0, (%%rdx) \n\t" // store c42:c52
- "vmovupd %%xmm1, (%%rdx,%%r12) \n\t" // store c62:c72
+ vmovupd(mem(rcx), xmm0) // load c00:c10
+ vmovupd(mem(rcx, r12, 1), xmm1) // load c20:c30
+ vfmadd231pd(xmm2, xmm0, xmm4)
+ vfmadd231pd(xmm2, xmm1, xmm7)
+ vmovupd(xmm4, mem(rcx)) // store c00:c10
+ vmovupd(xmm7, mem(rcx, r12, 1)) // store c20:c30
+
add(rdi, rcx) + + vmovupd(mem(rdx), xmm0) // load c40:c50 + vmovupd(mem(rdx, r12, 1), xmm1) // load c60:c70 + vfmadd213pd(xmm10, xmm2, xmm0) + vfmadd213pd(xmm13, xmm2, xmm1) + vmovupd(xmm0, mem(rdx)) // store c40:c50 + vmovupd(xmm1, mem(rdx, r12, 1)) // store c60:c70 + add(rdi, rdx) + + + vmovupd(mem(rcx), xmm0) // load c01:c11 + vmovupd(mem(rcx, r12, 1), xmm1) // load c21:c31 + vfmadd213pd(xmm5, xmm2, xmm0) + vfmadd213pd(xmm8, xmm2, xmm1) + vmovupd(xmm0, mem(rcx)) // store c01:c11 + vmovupd(xmm1, mem(rcx, r12, 1)) // store c21:c31 + add(rdi, rcx) + + vmovupd(mem(rdx), xmm0) // load c41:c51 + vmovupd(mem(rdx, r12, 1), xmm1) // load c61:c71 + vfmadd213pd(xmm11, xmm2, xmm0) + vfmadd213pd(xmm14, xmm2, xmm1) + vmovupd(xmm0, mem(rdx)) // store c41:c51 + vmovupd(xmm1, mem(rdx, r12, 1)) // store c61:c71 + add(rdi, rdx) + + + vmovupd(mem(rcx), xmm0) // load c02:c12 + vmovupd(mem(rcx, r12, 1), xmm1) // load c22:c32 + vfmadd213pd(xmm6, xmm2, xmm0) + vfmadd213pd(xmm9, xmm2, xmm1) + vmovupd(xmm0, mem(rcx)) // store c02:c12 + vmovupd(xmm1, mem(rcx, r12, 1)) // store c22:c32 + + vmovupd(mem(rdx), xmm0) // load c42:c52 + vmovupd(mem(rdx, r12, 1), xmm1) // load c62:c72 + vfmadd213pd(xmm12, xmm2, xmm0) + vfmadd213pd(xmm15, xmm2, xmm1) + vmovupd(xmm0, mem(rdx)) // store c42:c52 + vmovupd(xmm1, mem(rdx, r12, 1)) // store c62:c72 */ - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORED: \n\t" - " \n\t" - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load c00:c10 - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm4, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rcx) \n\t" // store c00:c10 - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" - "vmovlpd (%%rcx,%%r12), %%xmm0, %%xmm0 \n\t" // load c20:c30 - "vmovhpd (%%rcx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm7, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rcx,%%r12) \n\t" // store c20:c30 - "vmovhpd %%xmm0, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load c40:c50 - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm10, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rdx) \n\t" // store c40:c50 - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" - "vmovlpd (%%rdx,%%r12), %%xmm0, %%xmm0 \n\t" // load c60:c70 - "vmovhpd (%%rdx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm13, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rdx,%%r12) \n\t" // store c60:c70 - "vmovhpd %%xmm0, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load c01:c11 - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm5, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rcx) \n\t" // store c01:c11 - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" - "vmovlpd (%%rcx,%%r12), %%xmm0, %%xmm0 \n\t" // load c21:c31 - "vmovhpd (%%rcx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm8, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rcx,%%r12) \n\t" // store c21:c31 - "vmovhpd %%xmm0, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load c41:c51 - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm11, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rdx) \n\t" // store c41:c51 - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" - "vmovlpd 
(%%rdx,%%r12), %%xmm0, %%xmm0 \n\t" // load c61:c71 - "vmovhpd (%%rdx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm14, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rdx,%%r12) \n\t" // store c61:c71 - "vmovhpd %%xmm0, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load c02:c12 - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm6, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rcx) \n\t" // store c02:c12 - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" - "vmovlpd (%%rcx,%%r12), %%xmm0, %%xmm0 \n\t" // load c22:c32 - "vmovhpd (%%rcx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm9, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rcx,%%r12) \n\t" // store c22:c32 - "vmovhpd %%xmm0, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load c42:c52 - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm12, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rdx) \n\t" // store c42:c52 - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" - "vmovlpd (%%rdx,%%r12), %%xmm0, %%xmm0 \n\t" // load c62:c72 - "vmovhpd (%%rdx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm15, %%xmm0, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%rdx,%%r12) \n\t" // store c62:c72 - "vmovhpd %%xmm0, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovlpd %%xmm4, (%%rcx) \n\t" - "vmovhpd %%xmm4, (%%rcx,%%rsi) \n\t" - "vmovlpd %%xmm7, (%%rcx,%%r12) \n\t" - "vmovhpd %%xmm7, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovlpd %%xmm10, (%%rdx) \n\t" - "vmovhpd %%xmm10, (%%rdx,%%rsi) \n\t" - "vmovlpd %%xmm13, (%%rdx,%%r12) \n\t" - "vmovhpd %%xmm13, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovlpd %%xmm5, (%%rcx) \n\t" - "vmovhpd %%xmm5, (%%rcx,%%rsi) \n\t" - "vmovlpd %%xmm8, (%%rcx,%%r12) \n\t" - "vmovhpd %%xmm8, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovlpd %%xmm11, (%%rdx) \n\t" - "vmovhpd %%xmm11, (%%rdx,%%rsi) \n\t" - "vmovlpd %%xmm14, (%%rdx,%%r12) \n\t" - "vmovhpd %%xmm14, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovlpd %%xmm6, (%%rcx) \n\t" - "vmovhpd %%xmm6, (%%rcx,%%rsi) \n\t" - "vmovlpd %%xmm9, (%%rcx,%%r12) \n\t" - "vmovhpd %%xmm9, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovlpd %%xmm12, (%%rdx) \n\t" - "vmovhpd %%xmm12, (%%rdx,%%rsi) \n\t" - "vmovlpd %%xmm15, (%%rdx,%%r12) \n\t" - "vmovhpd %%xmm15, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovupd %%xmm4, (%%rcx) \n\t" - "vmovupd %%xmm7, (%%rcx,%%r12) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%xmm10, (%%rdx) \n\t" - "vmovupd %%xmm13, (%%rdx,%%r12) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%xmm5, (%%rcx) \n\t" - "vmovupd %%xmm8, (%%rcx,%%r12) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%xmm11, (%%rdx) \n\t" - "vmovupd %%xmm14, (%%rdx,%%r12) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%xmm6, (%%rcx) \n\t" - "vmovupd %%xmm9, (%%rcx,%%r12) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%xmm12, (%%rdx) \n\t" - "vmovupd %%xmm15, (%%rdx,%%r12) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DDONE: \n\t" - " \n\t" + + + + jmp(.DDONE) // jump to end. + + + + label(.DGENSTORED) + + + vmovlpd(mem(rcx), xmm0, xmm0) // load c00:c10 + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + vaddpd(xmm4, xmm0, xmm0) + vmovlpd(xmm0, mem(rcx)) // store c00:c10 + vmovhpd(xmm0, mem(rcx, rsi, 1)) + vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c20:c30 + vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + vaddpd(xmm7, xmm0, xmm0) + vmovlpd(xmm0, mem(rcx, r12, 1)) // store c20:c30 + vmovhpd(xmm0, mem(rcx, r13, 1)) + add(rdi, rcx) + + vmovlpd(mem(rdx), xmm0, xmm0) // load c40:c50 + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + vaddpd(xmm10, xmm0, xmm0) + vmovlpd(xmm0, mem(rdx)) // store c40:c50 + vmovhpd(xmm0, mem(rdx, rsi, 1)) + vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c60:c70 + vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + vaddpd(xmm13, xmm0, xmm0) + vmovlpd(xmm0, mem(rdx, r12, 1)) // store c60:c70 + vmovhpd(xmm0, mem(rdx, r13, 1)) + add(rdi, rdx) + + + vmovlpd(mem(rcx), xmm0, xmm0) // load c01:c11 + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + vaddpd(xmm5, xmm0, xmm0) + vmovlpd(xmm0, mem(rcx)) // store c01:c11 + vmovhpd(xmm0, mem(rcx, rsi, 1)) + vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c21:c31 + vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + vaddpd(xmm8, xmm0, xmm0) + vmovlpd(xmm0, mem(rcx, r12, 1)) // store c21:c31 + vmovhpd(xmm0, mem(rcx, r13, 1)) + add(rdi, rcx) + + vmovlpd(mem(rdx), xmm0, xmm0) // load c41:c51 + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + vaddpd(xmm11, xmm0, xmm0) + vmovlpd(xmm0, mem(rdx)) // store c41:c51 + vmovhpd(xmm0, mem(rdx, rsi, 1)) + vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c61:c71 + vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + vaddpd(xmm14, xmm0, xmm0) + vmovlpd(xmm0, mem(rdx, r12, 1)) // store c61:c71 + vmovhpd(xmm0, mem(rdx, r13, 1)) + add(rdi, rdx) + + + vmovlpd(mem(rcx), xmm0, xmm0) // load c02:c12 + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + vaddpd(xmm6, xmm0, xmm0) + vmovlpd(xmm0, mem(rcx)) // store c02:c12 + vmovhpd(xmm0, mem(rcx, rsi, 1)) + vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c22:c32 + vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + vaddpd(xmm9, xmm0, xmm0) + vmovlpd(xmm0, mem(rcx, r12, 1)) // store c22:c32 + vmovhpd(xmm0, mem(rcx, r13, 1)) + add(rdi, rcx) + + vmovlpd(mem(rdx), xmm0, xmm0) // load c42:c52 + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + vaddpd(xmm12, xmm0, xmm0) + vmovlpd(xmm0, mem(rdx)) // store c42:c52 + vmovhpd(xmm0, mem(rdx, rsi, 1)) + vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c62:c72 + vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) + 
vaddpd(xmm15, xmm0, xmm0) + vmovlpd(xmm0, mem(rdx, r12, 1)) // store c62:c72 + vmovhpd(xmm0, mem(rdx, r13, 1)) + add(rdi, rdx) + + + + jmp(.DDONE) // jump to end. + + + + label(.DBETAZERO) + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. + jne(.DCOLSTORBZ) // jump to column storage case + + + + label(.DGENSTORBZ) + + + vmovlpd(xmm4, mem(rcx)) + vmovhpd(xmm4, mem(rcx, rsi, 1)) + vmovlpd(xmm7, mem(rcx, r12, 1)) + vmovhpd(xmm7, mem(rcx, r13, 1)) + add(rdi, rcx) + vmovlpd(xmm10, mem(rdx)) + vmovhpd(xmm10, mem(rdx, rsi, 1)) + vmovlpd(xmm13, mem(rdx, r12, 1)) + vmovhpd(xmm13, mem(rdx, r13, 1)) + add(rdi, rdx) + + vmovlpd(xmm5, mem(rcx)) + vmovhpd(xmm5, mem(rcx, rsi, 1)) + vmovlpd(xmm8, mem(rcx, r12, 1)) + vmovhpd(xmm8, mem(rcx, r13, 1)) + add(rdi, rcx) + vmovlpd(xmm11, mem(rdx)) + vmovhpd(xmm11, mem(rdx, rsi, 1)) + vmovlpd(xmm14, mem(rdx, r12, 1)) + vmovhpd(xmm14, mem(rdx, r13, 1)) + add(rdi, rdx) + + vmovlpd(xmm6, mem(rcx)) + vmovhpd(xmm6, mem(rcx, rsi, 1)) + vmovlpd(xmm9, mem(rcx, r12, 1)) + vmovhpd(xmm9, mem(rcx, r13, 1)) + add(rdi, rcx) + vmovlpd(xmm12, mem(rdx)) + vmovhpd(xmm12, mem(rdx, rsi, 1)) + vmovlpd(xmm15, mem(rdx, r12, 1)) + vmovhpd(xmm15, mem(rdx, r13, 1)) + add(rdi, rdx) + + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + + + vmovupd(xmm4, mem(rcx)) + vmovupd(xmm7, mem(rcx, r12, 1)) + add(rdi, rcx) + vmovupd(xmm10, mem(rdx)) + vmovupd(xmm13, mem(rdx, r12, 1)) + add(rdi, rdx) + + vmovupd(xmm5, mem(rcx)) + vmovupd(xmm8, mem(rcx, r12, 1)) + add(rdi, rcx) + vmovupd(xmm11, mem(rdx)) + vmovupd(xmm14, mem(rdx, r12, 1)) + add(rdi, rdx) + + vmovupd(xmm6, mem(rcx)) + vmovupd(xmm9, mem(rcx, r12, 1)) + add(rdi, rcx) + vmovupd(xmm12, mem(rdx)) + vmovupd(xmm15, mem(rdx, r12, 1)) + add(rdi, rdx) + + + + + + label(.DDONE) + : // output operands (none) : // input operands @@ -1651,505 +1654,505 @@ void bli_cgemm_piledriver_asm_4x2 __asm__ volatile ( - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - "movq %9, %%r15 \n\t" // load address of b_next. - "movq %10, %%r14 \n\t" // load address of a_next. - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(scomplex) - "leaq (%%rcx,%%rdi,1), %%r10 \n\t" // load address of c + 1*cs_c; - " \n\t" - "addq $32 * 4, %%rax \n\t" - "addq $16 * 4, %%rbx \n\t" - " \n\t" - " \n\t" - "vxorps %%xmm8, %%xmm8, %%xmm8 \n\t" - "vxorps %%xmm9, %%xmm9, %%xmm9 \n\t" - "vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" - "vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" - "vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" - "vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" - "vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" - "vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" - //"vzeroall \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".CLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - "je .CCONSIDKLEFT \n\t" // if i == 0, jump to k_left code. 
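// The prefetcht0 distances in the loop body below (b + 256, a + 512) keep
// both packed operand streams ahead of the arithmetic; A appears to be
// fetched twice as far ahead as B because this 4x2 kernel consumes A at
// twice B's rate. A hedged intrinsics sketch (a_ptr/b_ptr are illustrative
// float pointers; offsets are in elements, so 64 floats = 256 bytes):

#include <xmmintrin.h>

_mm_prefetch( (const char*)( b_ptr +  64 ), _MM_HINT_T0 ); // ~256 bytes ahead
_mm_prefetch( (const char*)( a_ptr + 128 ), _MM_HINT_T0 ); // ~512 bytes ahead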
- " \n\t" - " \n\t" - "prefetcht0 256(%%rbx) \n\t" - "prefetcht0 512(%%rax) \n\t" - " \n\t" - " \n\t" // iteration 0 - "vmovaps -32 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss -16 * 4(%%rbx), %%xmm4 \n\t" - "vfmadd231ps %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps -28 * 4(%%rax), %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm4, %%xmm12 \n\t" - "vbroadcastss -15 * 4(%%rbx), %%xmm5 \n\t" - "vfmadd231ps %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231ps %%xmm1, %%xmm5, %%xmm13 \n\t" - "vbroadcastss -14 * 4(%%rbx), %%xmm6 \n\t" - "vfmadd231ps %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231ps %%xmm1, %%xmm6, %%xmm14 \n\t" - "vbroadcastss -13 * 4(%%rbx), %%xmm7 \n\t" - "vfmadd231ps %%xmm0, %%xmm7, %%xmm11 \n\t" - "vfmadd231ps %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vmovaps -24 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss -12 * 4(%%rbx), %%xmm4 \n\t" - "vfmadd231ps %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps -20 * 4(%%rax), %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm4, %%xmm12 \n\t" - "vbroadcastss -11 * 4(%%rbx), %%xmm5 \n\t" - "vfmadd231ps %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231ps %%xmm1, %%xmm5, %%xmm13 \n\t" - "vbroadcastss -10 * 4(%%rbx), %%xmm6 \n\t" - "vfmadd231ps %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231ps %%xmm1, %%xmm6, %%xmm14 \n\t" - "vbroadcastss -9 * 4(%%rbx), %%xmm7 \n\t" - "vfmadd231ps %%xmm0, %%xmm7, %%xmm11 \n\t" - "vfmadd231ps %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - "prefetcht0 64+256(%%rbx) \n\t" - "prefetcht0 64+512(%%rax) \n\t" - " \n\t" - " \n\t" // iteration 2 - "vmovaps -16 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss -8 * 4(%%rbx), %%xmm4 \n\t" - "vfmadd231ps %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps -12 * 4(%%rax), %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm4, %%xmm12 \n\t" - "vbroadcastss -7 * 4(%%rbx), %%xmm5 \n\t" - "vfmadd231ps %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231ps %%xmm1, %%xmm5, %%xmm13 \n\t" - "vbroadcastss -6 * 4(%%rbx), %%xmm6 \n\t" - "vfmadd231ps %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231ps %%xmm1, %%xmm6, %%xmm14 \n\t" - "vbroadcastss -5 * 4(%%rbx), %%xmm7 \n\t" - "vfmadd231ps %%xmm0, %%xmm7, %%xmm11 \n\t" - "vfmadd231ps %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vmovaps -8 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss -4 * 4(%%rbx), %%xmm4 \n\t" - "vfmadd231ps %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps -4 * 4(%%rax), %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm4, %%xmm12 \n\t" - "vbroadcastss -3 * 4(%%rbx), %%xmm5 \n\t" - "vfmadd231ps %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231ps %%xmm1, %%xmm5, %%xmm13 \n\t" - "vbroadcastss -2 * 4(%%rbx), %%xmm6 \n\t" - "vfmadd231ps %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231ps %%xmm1, %%xmm6, %%xmm14 \n\t" - "vbroadcastss -1 * 4(%%rbx), %%xmm7 \n\t" - "vfmadd231ps %%xmm0, %%xmm7, %%xmm11 \n\t" - "vfmadd231ps %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - "prefetcht0 128+256(%%rbx) \n\t" - "prefetcht0 128+512(%%rax) \n\t" - " \n\t" - " \n\t" // iteration 4 - "vmovaps 0 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss 0 * 4(%%rbx), %%xmm4 \n\t" - "vfmadd231ps %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps 4 * 4(%%rax), %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm4, %%xmm12 \n\t" - "vbroadcastss 1 * 4(%%rbx), %%xmm5 \n\t" - "vfmadd231ps %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231ps %%xmm1, %%xmm5, %%xmm13 \n\t" - "vbroadcastss 2 * 4(%%rbx), %%xmm6 \n\t" - "vfmadd231ps %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231ps %%xmm1, %%xmm6, %%xmm14 \n\t" - "vbroadcastss 3 * 4(%%rbx), %%xmm7 \n\t" - "vfmadd231ps %%xmm0, %%xmm7, %%xmm11 \n\t" - "vfmadd231ps %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 5 - "vmovaps 8 * 4(%%rax), %%xmm0 \n\t" 
- "vbroadcastss 4 * 4(%%rbx), %%xmm4 \n\t" - "vfmadd231ps %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps 12 * 4(%%rax), %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm4, %%xmm12 \n\t" - "vbroadcastss 5 * 4(%%rbx), %%xmm5 \n\t" - "vfmadd231ps %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231ps %%xmm1, %%xmm5, %%xmm13 \n\t" - "vbroadcastss 6 * 4(%%rbx), %%xmm6 \n\t" - "vfmadd231ps %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231ps %%xmm1, %%xmm6, %%xmm14 \n\t" - "vbroadcastss 7 * 4(%%rbx), %%xmm7 \n\t" - "vfmadd231ps %%xmm0, %%xmm7, %%xmm11 \n\t" - "vfmadd231ps %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - "prefetcht0 128+256(%%rbx) \n\t" - "prefetcht0 128+512(%%rax) \n\t" - " \n\t" - " \n\t" // iteration 6 - "vmovaps 16 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss 8 * 4(%%rbx), %%xmm4 \n\t" - "vfmadd231ps %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps 20 * 4(%%rax), %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm4, %%xmm12 \n\t" - "vbroadcastss 9 * 4(%%rbx), %%xmm5 \n\t" - "vfmadd231ps %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231ps %%xmm1, %%xmm5, %%xmm13 \n\t" - "vbroadcastss 10 * 4(%%rbx), %%xmm6 \n\t" - "vfmadd231ps %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231ps %%xmm1, %%xmm6, %%xmm14 \n\t" - "vbroadcastss 11 * 4(%%rbx), %%xmm7 \n\t" - "vfmadd231ps %%xmm0, %%xmm7, %%xmm11 \n\t" - "vfmadd231ps %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 7 - "vmovaps 24 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss 12 * 4(%%rbx), %%xmm4 \n\t" - "vfmadd231ps %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps 28 * 4(%%rax), %%xmm1 \n\t" - "addq $8 * 4 * 8, %%rax \n\t" // a += 8*2 (unroll x mr) - "vfmadd231ps %%xmm1, %%xmm4, %%xmm12 \n\t" - "vbroadcastss 13 * 4(%%rbx), %%xmm5 \n\t" - "vfmadd231ps %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231ps %%xmm1, %%xmm5, %%xmm13 \n\t" - "vbroadcastss 14 * 4(%%rbx), %%xmm6 \n\t" - "vfmadd231ps %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231ps %%xmm1, %%xmm6, %%xmm14 \n\t" - "vbroadcastss 15 * 4(%%rbx), %%xmm7 \n\t" - "addq $8 * 2 * 8, %%rbx \n\t" // b += 8*2 (unroll x nr) - "vfmadd231ps %%xmm0, %%xmm7, %%xmm11 \n\t" - "vfmadd231ps %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jmp .CLOOPKITER \n\t" // jump to beginning of loop. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".CCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".CLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - " \n\t" - "je .CPOSTACCUM \n\t" // if i == 0, we're done. 
- " \n\t" - "prefetcht0 256(%%rbx) \n\t" - "prefetcht0 512(%%rax) \n\t" - " \n\t" - " \n\t" // iteration 0 - "vmovaps -32 * 4(%%rax), %%xmm0 \n\t" - "vbroadcastss -16 * 4(%%rbx), %%xmm4 \n\t" - "vfmadd231ps %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps -28 * 4(%%rax), %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm4, %%xmm12 \n\t" - "vbroadcastss -15 * 4(%%rbx), %%xmm5 \n\t" - "vfmadd231ps %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231ps %%xmm1, %%xmm5, %%xmm13 \n\t" - "vbroadcastss -14 * 4(%%rbx), %%xmm6 \n\t" - "vfmadd231ps %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231ps %%xmm1, %%xmm6, %%xmm14 \n\t" - "vbroadcastss -13 * 4(%%rbx), %%xmm7 \n\t" - "vfmadd231ps %%xmm0, %%xmm7, %%xmm11 \n\t" - "vfmadd231ps %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" - "addq $1 * 4 * 8, %%rax \n\t" // a += 1*2 (1 x mr) - "addq $1 * 2 * 8, %%rbx \n\t" // b += 1*2 (1 x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jmp .CLOOPKLEFT \n\t" // jump to beginning of loop. - " \n\t" - " \n\t" - " \n\t" - ".CPOSTACCUM: \n\t" - " \n\t" - " \n\t" - "prefetchw 0 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetchw 0 * 8(%%r10) \n\t" // prefetch c + 1*cs_c - " \n\t" - " \n\t" - "vpermilps $0xb1, %%xmm9, %%xmm9 \n\t" - "vpermilps $0xb1, %%xmm11, %%xmm11 \n\t" - "vpermilps $0xb1, %%xmm13, %%xmm13 \n\t" - "vpermilps $0xb1, %%xmm15, %%xmm15 \n\t" - " \n\t" - "vaddsubps %%xmm9, %%xmm8, %%xmm8 \n\t" - "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" - "vaddsubps %%xmm13, %%xmm12, %%xmm12 \n\t" - "vaddsubps %%xmm15, %%xmm14, %%xmm14 \n\t" - " \n\t" - " \n\t" - " \n\t" // xmm8: xmm10: - " \n\t" // ( ab00 ( ab01 - " \n\t" // ab10 ab11 - " \n\t" // ab20 ab21 - " \n\t" // ab30 ) ab31 ) - " \n\t" - " \n\t" // xmm12: xmm14: - " \n\t" // ( ab40 ( ab41 - " \n\t" // ab50 ab51 - " \n\t" // ab60 ab61 - " \n\t" // ab70 ) ab71 ) - " \n\t" - " \n\t" - "prefetcht0 (%%r14) \n\t" // prefetch a_next - "prefetcht0 64(%%r14) \n\t" // prefetch a_next - " \n\t" - " \n\t" - " \n\t" // scale by alpha - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "vbroadcastss (%%rax), %%xmm0 \n\t" // load alpha_r and duplicate - "vbroadcastss 4(%%rax), %%xmm1 \n\t" // load alpha_i and duplicate - " \n\t" - "vpermilps $0xb1, %%xmm8, %%xmm9 \n\t" - "vpermilps $0xb1, %%xmm10, %%xmm11 \n\t" - "vpermilps $0xb1, %%xmm12, %%xmm13 \n\t" - "vpermilps $0xb1, %%xmm14, %%xmm15 \n\t" - " \n\t" - "vmulps %%xmm8, %%xmm0, %%xmm8 \n\t" - "vmulps %%xmm10, %%xmm0, %%xmm10 \n\t" - "vmulps %%xmm12, %%xmm0, %%xmm12 \n\t" - "vmulps %%xmm14, %%xmm0, %%xmm14 \n\t" - " \n\t" - "vmulps %%xmm9, %%xmm1, %%xmm9 \n\t" - "vmulps %%xmm11, %%xmm1, %%xmm11 \n\t" - "vmulps %%xmm13, %%xmm1, %%xmm13 \n\t" - "vmulps %%xmm15, %%xmm1, %%xmm15 \n\t" - " \n\t" - "vaddsubps %%xmm9, %%xmm8, %%xmm8 \n\t" - "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" - "vaddsubps %%xmm13, %%xmm12, %%xmm12 \n\t" - "vaddsubps %%xmm15, %%xmm14, %%xmm14 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastss (%%rbx), %%xmm6 \n\t" // load beta_r and duplicate - "vbroadcastss 4(%%rbx), %%xmm7 \n\t" // load beta_i and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(scomplex) - " \n\t" - " \n\t" - "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c; - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; - " \n\t" - " \n\t" - " \n\t" - "prefetcht0 (%%r15) \n\t" // prefetch b_next - "prefetcht0 64(%%r15) \n\t" // prefetch b_next - " \n\t" - " \n\t" - " \n\t" - " \n\t" 
// determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" // set xmm0 to zero. - "vucomiss %%xmm0, %%xmm6 \n\t" // set ZF if beta_r == 0. - "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); - "vucomiss %%xmm0, %%xmm7 \n\t" // set ZF if beta_i == 0. - "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); - "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. - "jne .CBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .CCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".CGENSTORED: \n\t" - " \n\t" - " \n\t" - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" // load c00:c10 - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load c20:c30 - "vmovhps (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" - "vpermilps $0xb1, %%xmm0, %%xmm1 \n\t" - "vpermilps $0xb1, %%xmm2, %%xmm3 \n\t" - " \n\t" - "vmulps %%xmm6, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm7, %%xmm1, %%xmm1 \n\t" - "vaddsubps %%xmm1, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm8, %%xmm0, %%xmm0 \n\t" - "vmovlps %%xmm0, (%%rcx) \n\t" // store c00:c10 - "vmovhps %%xmm0, (%%rcx,%%rsi) \n\t" - " \n\t" - "vmulps %%xmm6, %%xmm2, %%xmm2 \n\t" - "vmulps %%xmm7, %%xmm3, %%xmm3 \n\t" - "vaddsubps %%xmm3, %%xmm2, %%xmm2 \n\t" - "vaddps %%xmm12, %%xmm2, %%xmm2 \n\t" - "vmovlps %%xmm2, (%%rcx,%%r12) \n\t" // store c20:c30 - "vmovhps %%xmm2, (%%rcx,%%r13) \n\t" - " \n\t" - " \n\t" - " \n\t" - "vmovlps (%%r10), %%xmm0, %%xmm0 \n\t" // load c01:c11 - "vmovhps (%%r10,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%r10,%%r12), %%xmm2, %%xmm2 \n\t" // load c21:c31 - "vmovhps (%%r10,%%r13), %%xmm2, %%xmm2 \n\t" - "vpermilps $0xb1, %%xmm0, %%xmm1 \n\t" - "vpermilps $0xb1, %%xmm2, %%xmm3 \n\t" - " \n\t" - "vmulps %%xmm6, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm7, %%xmm1, %%xmm1 \n\t" - "vaddsubps %%xmm1, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm10, %%xmm0, %%xmm0 \n\t" - "vmovlps %%xmm0, (%%r10) \n\t" // store c01:c11 - "vmovhps %%xmm0, (%%r10,%%rsi) \n\t" - " \n\t" - "vmulps %%xmm6, %%xmm2, %%xmm2 \n\t" - "vmulps %%xmm7, %%xmm3, %%xmm3 \n\t" - "vaddsubps %%xmm3, %%xmm2, %%xmm2 \n\t" - "vaddps %%xmm14, %%xmm2, %%xmm2 \n\t" - "vmovlps %%xmm2, (%%r10,%%r12) \n\t" // store c21:c31 - "vmovhps %%xmm2, (%%r10,%%r13) \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. 
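// The vpermilps($0xb1)/vaddsubps sequence above is the standard packed
// complex multiply-by-beta: with t = c having its real/imag lanes swapped,
// addsub(beta_r*c, beta_i*t) subtracts in the even (real) lanes and adds in
// the odd (imaginary) lanes. A hedged scalar sketch for one element of C
// (cscale_acc and its arguments are illustrative names):

static inline void cscale_acc( float* c, float beta_r, float beta_i,
                               float ab_r, float ab_i )
{
    float cr = c[0], ci = c[1];
    c[0] = beta_r * cr - beta_i * ci + ab_r; // real lane: subtract
    c[1] = beta_r * ci + beta_i * cr + ab_i; // imag lane: add
}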
- " \n\t" - " \n\t" - " \n\t" - ".CCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vmovups (%%rcx), %%xmm0 \n\t" // load c00:c10 - "vmovups 16(%%rcx), %%xmm2 \n\t" // load c20:c30 - "vpermilps $0xb1, %%xmm0, %%xmm1 \n\t" - "vpermilps $0xb1, %%xmm2, %%xmm3 \n\t" - " \n\t" - "vmulps %%xmm6, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm7, %%xmm1, %%xmm1 \n\t" - "vaddsubps %%xmm1, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm8, %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%%rcx) \n\t" // store c00:c10 - " \n\t" - "vmulps %%xmm6, %%xmm2, %%xmm2 \n\t" - "vmulps %%xmm7, %%xmm3, %%xmm3 \n\t" - "vaddsubps %%xmm3, %%xmm2, %%xmm2 \n\t" - "vaddps %%xmm12, %%xmm2, %%xmm2 \n\t" - "vmovups %%xmm2, 16(%%rcx) \n\t" // store c20:c30 - " \n\t" - " \n\t" - " \n\t" - "vmovups (%%r10), %%xmm0 \n\t" // load c01:c11 - "vmovups 16(%%r10), %%xmm2 \n\t" // load c21:c31 - "vpermilps $0xb1, %%xmm0, %%xmm1 \n\t" - "vpermilps $0xb1, %%xmm2, %%xmm3 \n\t" - " \n\t" - "vmulps %%xmm6, %%xmm0, %%xmm0 \n\t" - "vmulps %%xmm7, %%xmm1, %%xmm1 \n\t" - "vaddsubps %%xmm1, %%xmm0, %%xmm0 \n\t" - "vaddps %%xmm10, %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%%r10) \n\t" // store c01:c11 - " \n\t" - "vmulps %%xmm6, %%xmm2, %%xmm2 \n\t" - "vmulps %%xmm7, %%xmm3, %%xmm3 \n\t" - "vaddsubps %%xmm3, %%xmm2, %%xmm2 \n\t" - "vaddps %%xmm14, %%xmm2, %%xmm2 \n\t" - "vmovups %%xmm2, 16(%%r10) \n\t" // store c21:c31 - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .CCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".CGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovlps %%xmm8, (%%rcx) \n\t" // store c00:c10 - "vmovhps %%xmm8, (%%rcx,%%rsi) \n\t" - " \n\t" - "vmovlps %%xmm12, (%%rcx,%%r12) \n\t" // store c20:c30 - "vmovhps %%xmm12, (%%rcx,%%r13) \n\t" - " \n\t" - "vmovlps %%xmm10, (%%r10) \n\t" // store c01:c11 - "vmovhps %%xmm10, (%%r10,%%rsi) \n\t" - " \n\t" - "vmovlps %%xmm14, (%%r10,%%r12) \n\t" // store c21:c31 - "vmovhps %%xmm14, (%%r10,%%r13) \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovups %%xmm8, (%%rcx) \n\t" // store c00:c10 - "vmovups %%xmm12, 16(%%rcx) \n\t" // store c20:c30 - " \n\t" - "vmovups %%xmm10, (%%r10) \n\t" // store c01:c11 - "vmovups %%xmm14, 16(%%r10) \n\t" // store c21:c31 - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".CDONE: \n\t" - " \n\t" + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + mov(%9, r15) // load address of b_next. + mov(%10, r14) // load address of a_next. + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) + lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; + + add(imm(32*4), rax) + add(imm(16*4), rbx) + + + vxorps(xmm8, xmm8, xmm8) + vxorps(xmm9, xmm9, xmm9) + vxorps(xmm10, xmm10, xmm10) + vxorps(xmm11, xmm11, xmm11) + vxorps(xmm12, xmm12, xmm12) + vxorps(xmm13, xmm13, xmm13) + vxorps(xmm14, xmm14, xmm14) + vxorps(xmm15, xmm15, xmm15) + //vzeroall() + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.CCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.CLOOPKITER) // MAIN LOOP + + + je(.CCONSIDKLEFT) // if i == 0, jump to k_left code. 
+ + + prefetch(0, mem(rbx, 256)) + prefetch(0, mem(rax, 512)) + + // iteration 0 + vmovaps(mem(rax, -32*4), xmm0) + vbroadcastss(mem(rbx, -16*4), xmm4) + vfmadd231ps(xmm0, xmm4, xmm8) + vmovaps(mem(rax, -28*4), xmm1) + vfmadd231ps(xmm1, xmm4, xmm12) + vbroadcastss(mem(rbx, -15*4), xmm5) + vfmadd231ps(xmm0, xmm5, xmm9) + vfmadd231ps(xmm1, xmm5, xmm13) + vbroadcastss(mem(rbx, -14*4), xmm6) + vfmadd231ps(xmm0, xmm6, xmm10) + vfmadd231ps(xmm1, xmm6, xmm14) + vbroadcastss(mem(rbx, -13*4), xmm7) + vfmadd231ps(xmm0, xmm7, xmm11) + vfmadd231ps(xmm1, xmm7, xmm15) + + // iteration 1 + vmovaps(mem(rax, -24*4), xmm0) + vbroadcastss(mem(rbx, -12*4), xmm4) + vfmadd231ps(xmm0, xmm4, xmm8) + vmovaps(mem(rax, -20*4), xmm1) + vfmadd231ps(xmm1, xmm4, xmm12) + vbroadcastss(mem(rbx, -11*4), xmm5) + vfmadd231ps(xmm0, xmm5, xmm9) + vfmadd231ps(xmm1, xmm5, xmm13) + vbroadcastss(mem(rbx, -10*4), xmm6) + vfmadd231ps(xmm0, xmm6, xmm10) + vfmadd231ps(xmm1, xmm6, xmm14) + vbroadcastss(mem(rbx, -9*4), xmm7) + vfmadd231ps(xmm0, xmm7, xmm11) + vfmadd231ps(xmm1, xmm7, xmm15) + + prefetch(0, mem(rbx, 64+256)) + prefetch(0, mem(rax, 64+512)) + + // iteration 2 + vmovaps(mem(rax, -16*4), xmm0) + vbroadcastss(mem(rbx, -8*4), xmm4) + vfmadd231ps(xmm0, xmm4, xmm8) + vmovaps(mem(rax, -12*4), xmm1) + vfmadd231ps(xmm1, xmm4, xmm12) + vbroadcastss(mem(rbx, -7*4), xmm5) + vfmadd231ps(xmm0, xmm5, xmm9) + vfmadd231ps(xmm1, xmm5, xmm13) + vbroadcastss(mem(rbx, -6*4), xmm6) + vfmadd231ps(xmm0, xmm6, xmm10) + vfmadd231ps(xmm1, xmm6, xmm14) + vbroadcastss(mem(rbx, -5*4), xmm7) + vfmadd231ps(xmm0, xmm7, xmm11) + vfmadd231ps(xmm1, xmm7, xmm15) + + // iteration 3 + vmovaps(mem(rax, -8*4), xmm0) + vbroadcastss(mem(rbx, -4*4), xmm4) + vfmadd231ps(xmm0, xmm4, xmm8) + vmovaps(mem(rax, -4*4), xmm1) + vfmadd231ps(xmm1, xmm4, xmm12) + vbroadcastss(mem(rbx, -3*4), xmm5) + vfmadd231ps(xmm0, xmm5, xmm9) + vfmadd231ps(xmm1, xmm5, xmm13) + vbroadcastss(mem(rbx, -2*4), xmm6) + vfmadd231ps(xmm0, xmm6, xmm10) + vfmadd231ps(xmm1, xmm6, xmm14) + vbroadcastss(mem(rbx, -1*4), xmm7) + vfmadd231ps(xmm0, xmm7, xmm11) + vfmadd231ps(xmm1, xmm7, xmm15) + + prefetch(0, mem(rbx, 128+256)) + prefetch(0, mem(rax, 128+512)) + + // iteration 4 + vmovaps(mem(rax, 0*4), xmm0) + vbroadcastss(mem(rbx, 0*4), xmm4) + vfmadd231ps(xmm0, xmm4, xmm8) + vmovaps(mem(rax, 4*4), xmm1) + vfmadd231ps(xmm1, xmm4, xmm12) + vbroadcastss(mem(rbx, 1*4), xmm5) + vfmadd231ps(xmm0, xmm5, xmm9) + vfmadd231ps(xmm1, xmm5, xmm13) + vbroadcastss(mem(rbx, 2*4), xmm6) + vfmadd231ps(xmm0, xmm6, xmm10) + vfmadd231ps(xmm1, xmm6, xmm14) + vbroadcastss(mem(rbx, 3*4), xmm7) + vfmadd231ps(xmm0, xmm7, xmm11) + vfmadd231ps(xmm1, xmm7, xmm15) + + // iteration 5 + vmovaps(mem(rax, 8*4), xmm0) + vbroadcastss(mem(rbx, 4*4), xmm4) + vfmadd231ps(xmm0, xmm4, xmm8) + vmovaps(mem(rax, 12*4), xmm1) + vfmadd231ps(xmm1, xmm4, xmm12) + vbroadcastss(mem(rbx, 5*4), xmm5) + vfmadd231ps(xmm0, xmm5, xmm9) + vfmadd231ps(xmm1, xmm5, xmm13) + vbroadcastss(mem(rbx, 6*4), xmm6) + vfmadd231ps(xmm0, xmm6, xmm10) + vfmadd231ps(xmm1, xmm6, xmm14) + vbroadcastss(mem(rbx, 7*4), xmm7) + vfmadd231ps(xmm0, xmm7, xmm11) + vfmadd231ps(xmm1, xmm7, xmm15) + + prefetch(0, mem(rbx, 128+256)) + prefetch(0, mem(rax, 128+512)) + + // iteration 6 + vmovaps(mem(rax, 16*4), xmm0) + vbroadcastss(mem(rbx, 8*4), xmm4) + vfmadd231ps(xmm0, xmm4, xmm8) + vmovaps(mem(rax, 20*4), xmm1) + vfmadd231ps(xmm1, xmm4, xmm12) + vbroadcastss(mem(rbx, 9*4), xmm5) + vfmadd231ps(xmm0, xmm5, xmm9) + vfmadd231ps(xmm1, xmm5, xmm13) + vbroadcastss(mem(rbx, 10*4), xmm6) + 
vfmadd231ps(xmm0, xmm6, xmm10) + vfmadd231ps(xmm1, xmm6, xmm14) + vbroadcastss(mem(rbx, 11*4), xmm7) + vfmadd231ps(xmm0, xmm7, xmm11) + vfmadd231ps(xmm1, xmm7, xmm15) + + // iteration 7 + vmovaps(mem(rax, 24*4), xmm0) + vbroadcastss(mem(rbx, 12*4), xmm4) + vfmadd231ps(xmm0, xmm4, xmm8) + vmovaps(mem(rax, 28*4), xmm1) + add(imm(8*4*8), rax) // a += 8*2 (unroll x mr) + vfmadd231ps(xmm1, xmm4, xmm12) + vbroadcastss(mem(rbx, 13*4), xmm5) + vfmadd231ps(xmm0, xmm5, xmm9) + vfmadd231ps(xmm1, xmm5, xmm13) + vbroadcastss(mem(rbx, 14*4), xmm6) + vfmadd231ps(xmm0, xmm6, xmm10) + vfmadd231ps(xmm1, xmm6, xmm14) + vbroadcastss(mem(rbx, 15*4), xmm7) + add(imm(8*2*8), rbx) // b += 8*2 (unroll x nr) + vfmadd231ps(xmm0, xmm7, xmm11) + vfmadd231ps(xmm1, xmm7, xmm15) + + + + dec(rsi) // i -= 1; + jmp(.CLOOPKITER) // jump to beginning of loop. + + + + + + + label(.CCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.CPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.CLOOPKLEFT) // EDGE LOOP + + + je(.CPOSTACCUM) // if i == 0, we're done. + + prefetch(0, mem(rbx, 256)) + prefetch(0, mem(rax, 512)) + + // iteration 0 + vmovaps(mem(rax, -32*4), xmm0) + vbroadcastss(mem(rbx, -16*4), xmm4) + vfmadd231ps(xmm0, xmm4, xmm8) + vmovaps(mem(rax, -28*4), xmm1) + vfmadd231ps(xmm1, xmm4, xmm12) + vbroadcastss(mem(rbx, -15*4), xmm5) + vfmadd231ps(xmm0, xmm5, xmm9) + vfmadd231ps(xmm1, xmm5, xmm13) + vbroadcastss(mem(rbx, -14*4), xmm6) + vfmadd231ps(xmm0, xmm6, xmm10) + vfmadd231ps(xmm1, xmm6, xmm14) + vbroadcastss(mem(rbx, -13*4), xmm7) + vfmadd231ps(xmm0, xmm7, xmm11) + vfmadd231ps(xmm1, xmm7, xmm15) + + + add(imm(1*4*8), rax) // a += 1*2 (1 x mr) + add(imm(1*2*8), rbx) // b += 1*2 (1 x nr) + + + dec(rsi) // i -= 1; + jmp(.CLOOPKLEFT) // jump to beginning of loop. 
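// Both the main and edge loops share the same guard shape: the je at the
// top exits once the counter reaches zero (ZF comes from the dec at the
// bottom of the previous pass, or from the initial test), and the
// unconditional jmp closes the loop -- a while loop, not a do-while.
// A hedged C equivalent (rank1_update, MR, NR are hypothetical stand-ins):

while ( i != 0 )
{
    rank1_update( a, b ); // one k-step of the microkernel
    a += MR;              // advance packed A by one column
    b += NR;              // advance packed B by one row
    --i;
}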
+ + + + label(.CPOSTACCUM) + + + prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c + prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c + + + vpermilps(imm(0xb1), xmm9, xmm9) + vpermilps(imm(0xb1), xmm11, xmm11) + vpermilps(imm(0xb1), xmm13, xmm13) + vpermilps(imm(0xb1), xmm15, xmm15) + + vaddsubps(xmm9, xmm8, xmm8) + vaddsubps(xmm11, xmm10, xmm10) + vaddsubps(xmm13, xmm12, xmm12) + vaddsubps(xmm15, xmm14, xmm14) + + + // xmm8: xmm10: + // ( ab00 ( ab01 + // ab10 ab11 + // ab20 ab21 + // ab30 ) ab31 ) + + // xmm12: xmm14: + // ( ab40 ( ab41 + // ab50 ab51 + // ab60 ab61 + // ab70 ) ab71 ) + + + prefetch(0, mem(r14)) // prefetch a_next + prefetch(0, mem(r14, 64)) // prefetch a_next + + + // scale by alpha + + mov(%4, rax) // load address of alpha + vbroadcastss(mem(rax), xmm0) // load alpha_r and duplicate + vbroadcastss(mem(rax, 4), xmm1) // load alpha_i and duplicate + + vpermilps(imm(0xb1), xmm8, xmm9) + vpermilps(imm(0xb1), xmm10, xmm11) + vpermilps(imm(0xb1), xmm12, xmm13) + vpermilps(imm(0xb1), xmm14, xmm15) + + vmulps(xmm8, xmm0, xmm8) + vmulps(xmm10, xmm0, xmm10) + vmulps(xmm12, xmm0, xmm12) + vmulps(xmm14, xmm0, xmm14) + + vmulps(xmm9, xmm1, xmm9) + vmulps(xmm11, xmm1, xmm11) + vmulps(xmm13, xmm1, xmm13) + vmulps(xmm15, xmm1, xmm15) + + vaddsubps(xmm9, xmm8, xmm8) + vaddsubps(xmm11, xmm10, xmm10) + vaddsubps(xmm13, xmm12, xmm12) + vaddsubps(xmm15, xmm14, xmm14) + + + + + mov(%5, rbx) // load address of beta + vbroadcastss(mem(rbx), xmm6) // load beta_r and duplicate + vbroadcastss(mem(rbx, 4), xmm7) // load beta_i and duplicate + + + + + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex) + + + lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; + + + + prefetch(0, mem(r15)) // prefetch b_next + prefetch(0, mem(r15, 64)) // prefetch b_next + + + + // determine if + // c % 32 == 0, AND + // 8*cs_c % 32 == 0, AND + // rs_c == 1 + // ie: aligned, ldim aligned, and + // column-stored + + cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8. + sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); + test(imm(31), rcx) // set ZF if c & 32 is zero. + setz(bh) // bh = ( ZF == 0 ? 1 : 0 ); + test(imm(31), rdi) // set ZF if (8*cs_c) & 32 is zero. + setz(al) // al = ( ZF == 0 ? 1 : 0 ); + // and(bl,bh) followed by + // and(bh,al) will reveal result + + // now avoid loading C if beta == 0 + + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. + vucomiss(xmm0, xmm6) // set ZF if beta_r == 0. + sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); + vucomiss(xmm0, xmm7) // set ZF if beta_i == 0. + sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); + and(r8b, r9b) // set ZF if r8b & r9b == 1. + jne(.CBETAZERO) // if ZF = 0, jump to beta == 0 case + + + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. 
+ jne(.CCOLSTORED) // jump to column storage case + + + + label(.CGENSTORED) + + + vmovlps(mem(rcx), xmm0, xmm0) // load c00:c10 + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm2, xmm2) // load c20:c30 + vmovhps(mem(rcx, r13, 1), xmm2, xmm2) + vpermilps(imm(0xb1), xmm0, xmm1) + vpermilps(imm(0xb1), xmm2, xmm3) + + vmulps(xmm6, xmm0, xmm0) + vmulps(xmm7, xmm1, xmm1) + vaddsubps(xmm1, xmm0, xmm0) + vaddps(xmm8, xmm0, xmm0) + vmovlps(xmm0, mem(rcx)) // store c00:c10 + vmovhps(xmm0, mem(rcx, rsi, 1)) + + vmulps(xmm6, xmm2, xmm2) + vmulps(xmm7, xmm3, xmm3) + vaddsubps(xmm3, xmm2, xmm2) + vaddps(xmm12, xmm2, xmm2) + vmovlps(xmm2, mem(rcx, r12, 1)) // store c20:c30 + vmovhps(xmm2, mem(rcx, r13, 1)) + + + + vmovlps(mem(r10), xmm0, xmm0) // load c01:c11 + vmovhps(mem(r10, rsi, 1), xmm0, xmm0) + vmovlps(mem(r10, r12, 1), xmm2, xmm2) // load c21:c31 + vmovhps(mem(r10, r13, 1), xmm2, xmm2) + vpermilps(imm(0xb1), xmm0, xmm1) + vpermilps(imm(0xb1), xmm2, xmm3) + + vmulps(xmm6, xmm0, xmm0) + vmulps(xmm7, xmm1, xmm1) + vaddsubps(xmm1, xmm0, xmm0) + vaddps(xmm10, xmm0, xmm0) + vmovlps(xmm0, mem(r10)) // store c01:c11 + vmovhps(xmm0, mem(r10, rsi, 1)) + + vmulps(xmm6, xmm2, xmm2) + vmulps(xmm7, xmm3, xmm3) + vaddsubps(xmm3, xmm2, xmm2) + vaddps(xmm14, xmm2, xmm2) + vmovlps(xmm2, mem(r10, r12, 1)) // store c21:c31 + vmovhps(xmm2, mem(r10, r13, 1)) + + + + jmp(.CDONE) // jump to end. + + + + label(.CCOLSTORED) + + + vmovups(mem(rcx), xmm0) // load c00:c10 + vmovups(mem(rcx, 16), xmm2) // load c20:c30 + vpermilps(imm(0xb1), xmm0, xmm1) + vpermilps(imm(0xb1), xmm2, xmm3) + + vmulps(xmm6, xmm0, xmm0) + vmulps(xmm7, xmm1, xmm1) + vaddsubps(xmm1, xmm0, xmm0) + vaddps(xmm8, xmm0, xmm0) + vmovups(xmm0, mem(rcx)) // store c00:c10 + + vmulps(xmm6, xmm2, xmm2) + vmulps(xmm7, xmm3, xmm3) + vaddsubps(xmm3, xmm2, xmm2) + vaddps(xmm12, xmm2, xmm2) + vmovups(xmm2, mem(rcx, 16)) // store c20:c30 + + + + vmovups(mem(r10), xmm0) // load c01:c11 + vmovups(mem(r10, 16), xmm2) // load c21:c31 + vpermilps(imm(0xb1), xmm0, xmm1) + vpermilps(imm(0xb1), xmm2, xmm3) + + vmulps(xmm6, xmm0, xmm0) + vmulps(xmm7, xmm1, xmm1) + vaddsubps(xmm1, xmm0, xmm0) + vaddps(xmm10, xmm0, xmm0) + vmovups(xmm0, mem(r10)) // store c01:c11 + + vmulps(xmm6, xmm2, xmm2) + vmulps(xmm7, xmm3, xmm3) + vaddsubps(xmm3, xmm2, xmm2) + vaddps(xmm14, xmm2, xmm2) + vmovups(xmm2, mem(r10, 16)) // store c21:c31 + + + + jmp(.CDONE) // jump to end. + + + + label(.CBETAZERO) + // check if aligned/column-stored + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. + jne(.CCOLSTORBZ) // jump to column storage case + + + + label(.CGENSTORBZ) + + + vmovlps(xmm8, mem(rcx)) // store c00:c10 + vmovhps(xmm8, mem(rcx, rsi, 1)) + + vmovlps(xmm12, mem(rcx, r12, 1)) // store c20:c30 + vmovhps(xmm12, mem(rcx, r13, 1)) + + vmovlps(xmm10, mem(r10)) // store c01:c11 + vmovhps(xmm10, mem(r10, rsi, 1)) + + vmovlps(xmm14, mem(r10, r12, 1)) // store c21:c31 + vmovhps(xmm14, mem(r10, r13, 1)) + + + + jmp(.CDONE) // jump to end. + + + + label(.CCOLSTORBZ) + + + vmovups(xmm8, mem(rcx)) // store c00:c10 + vmovups(xmm12, mem(rcx, 16)) // store c20:c30 + + vmovups(xmm10, mem(r10)) // store c01:c11 + vmovups(xmm14, mem(r10, 16)) // store c21:c31 + + + + + + label(.CDONE) + : // output operands (none) : // input operands @@ -2199,491 +2202,491 @@ void bli_zgemm_piledriver_asm_2x2 __asm__ volatile ( - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. 
- "movq %9, %%r15 \n\t" // load address of b_next. - "movq %10, %%r14 \n\t" // load address of a_next. - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(dcomplex) - "leaq (,%%rdi,2), %%rdi \n\t" - "leaq (%%rcx,%%rdi,1), %%r10 \n\t" // load address of c + 1*cs_c; - " \n\t" - "addq $16 * 8, %%rax \n\t" - "addq $16 * 8, %%rbx \n\t" - " \n\t" - "vxorpd %%xmm8, %%xmm8, %%xmm8 \n\t" - "vxorpd %%xmm9, %%xmm9, %%xmm9 \n\t" - "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" - "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" - "vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" - "vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" - "vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" - "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .ZCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".ZLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - "je .ZCONSIDKLEFT \n\t" // if i == 0, jump to k_left code. - " \n\t" - " \n\t" - "prefetcht0 256(%%rbx) \n\t" - " \n\t" - "prefetcht0 512(%%rax) \n\t" - " \n\t" - " \n\t" // iteration 0 - "vmovaps -16 * 8(%%rax), %%xmm0 \n\t" - "vmovddup -16 * 8(%%rbx), %%xmm4 \n\t" - "vfmadd231pd %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps -14 * 8(%%rax), %%xmm1 \n\t" - "vfmadd231pd %%xmm1, %%xmm4, %%xmm12 \n\t" - "vmovddup -15 * 8(%%rbx), %%xmm5 \n\t" - "vfmadd231pd %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231pd %%xmm1, %%xmm5, %%xmm13 \n\t" - "vmovddup -14 * 8(%%rbx), %%xmm6 \n\t" - "vfmadd231pd %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231pd %%xmm1, %%xmm6, %%xmm14 \n\t" - "vmovddup -13 * 8(%%rbx), %%xmm7 \n\t" - "vfmadd231pd %%xmm0, %%xmm7, %%xmm11 \n\t" - "vmovaps -12 * 8(%%rax), %%xmm0 \n\t" - "vmovddup -12 * 8(%%rbx), %%xmm4 \n\t" - "vfmadd231pd %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vfmadd231pd %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps -10 * 8(%%rax), %%xmm1 \n\t" - "vfmadd231pd %%xmm1, %%xmm4, %%xmm12 \n\t" - "vmovddup -11 * 8(%%rbx), %%xmm5 \n\t" - "vfmadd231pd %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231pd %%xmm1, %%xmm5, %%xmm13 \n\t" - "vmovddup -10 * 8(%%rbx), %%xmm6 \n\t" - "vfmadd231pd %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231pd %%xmm1, %%xmm6, %%xmm14 \n\t" - "vmovddup -9 * 8(%%rbx), %%xmm7 \n\t" - "vfmadd231pd %%xmm0, %%xmm7, %%xmm11 \n\t" - "vmovaps -8 * 8(%%rax), %%xmm0 \n\t" - "vmovddup -8 * 8(%%rbx), %%xmm4 \n\t" - "vfmadd231pd %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - "prefetcht0 64+256(%%rbx) \n\t" - " \n\t" - "prefetcht0 64+512(%%rax) \n\t" - " \n\t" - " \n\t" // iteration 2 - "vfmadd231pd %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps -6 * 8(%%rax), %%xmm1 \n\t" - "vfmadd231pd %%xmm1, %%xmm4, %%xmm12 \n\t" - "vmovddup -7 * 8(%%rbx), %%xmm5 \n\t" - "vfmadd231pd %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231pd %%xmm1, %%xmm5, %%xmm13 \n\t" - "vmovddup -6 * 8(%%rbx), %%xmm6 \n\t" - "vfmadd231pd %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231pd %%xmm1, %%xmm6, %%xmm14 \n\t" - "vmovddup -5 * 8(%%rbx), %%xmm7 \n\t" - "vfmadd231pd %%xmm0, %%xmm7, %%xmm11 \n\t" - "vmovaps -4 * 8(%%rax), %%xmm0 \n\t" - "vmovddup -4 * 8(%%rbx), %%xmm4 \n\t" - "vfmadd231pd %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vfmadd231pd %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps -2 * 8(%%rax), %%xmm1 \n\t" - "vfmadd231pd %%xmm1, %%xmm4, %%xmm12 \n\t" - "vmovddup -3 * 8(%%rbx), %%xmm5 \n\t" - "vfmadd231pd %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231pd %%xmm1, 
%%xmm5, %%xmm13 \n\t" - "vmovddup -2 * 8(%%rbx), %%xmm6 \n\t" - "vfmadd231pd %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231pd %%xmm1, %%xmm6, %%xmm14 \n\t" - "vmovddup -1 * 8(%%rbx), %%xmm7 \n\t" - "vfmadd231pd %%xmm0, %%xmm7, %%xmm11 \n\t" - "vmovaps 0 * 8(%%rax), %%xmm0 \n\t" - "vmovddup 0 * 8(%%rbx), %%xmm4 \n\t" - "vfmadd231pd %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - "prefetcht0 128+256(%%rbx) \n\t" - " \n\t" - "prefetcht0 128+512(%%rax) \n\t" - " \n\t" - " \n\t" // iteration 4 - "vfmadd231pd %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps 2 * 8(%%rax), %%xmm1 \n\t" - "vfmadd231pd %%xmm1, %%xmm4, %%xmm12 \n\t" - "vmovddup 1 * 8(%%rbx), %%xmm5 \n\t" - "vfmadd231pd %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231pd %%xmm1, %%xmm5, %%xmm13 \n\t" - "vmovddup 2 * 8(%%rbx), %%xmm6 \n\t" - "vfmadd231pd %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231pd %%xmm1, %%xmm6, %%xmm14 \n\t" - "vmovddup 3 * 8(%%rbx), %%xmm7 \n\t" - "vfmadd231pd %%xmm0, %%xmm7, %%xmm11 \n\t" - "vmovaps 4 * 8(%%rax), %%xmm0 \n\t" - "vmovddup 4 * 8(%%rbx), %%xmm4 \n\t" - "vfmadd231pd %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 5 - "vfmadd231pd %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps 6 * 8(%%rax), %%xmm1 \n\t" - "vfmadd231pd %%xmm1, %%xmm4, %%xmm12 \n\t" - "vmovddup 5 * 8(%%rbx), %%xmm5 \n\t" - "vfmadd231pd %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231pd %%xmm1, %%xmm5, %%xmm13 \n\t" - "vmovddup 6 * 8(%%rbx), %%xmm6 \n\t" - "vfmadd231pd %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231pd %%xmm1, %%xmm6, %%xmm14 \n\t" - "vmovddup 7 * 8(%%rbx), %%xmm7 \n\t" - "vfmadd231pd %%xmm0, %%xmm7, %%xmm11 \n\t" - "vmovaps 8 * 8(%%rax), %%xmm0 \n\t" - "vmovddup 8 * 8(%%rbx), %%xmm4 \n\t" - "vfmadd231pd %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - "prefetcht0 128+256(%%rbx) \n\t" - " \n\t" - "prefetcht0 128+512(%%rax) \n\t" - " \n\t" - " \n\t" // iteration 6 - "vfmadd231pd %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps 10 * 8(%%rax), %%xmm1 \n\t" - "vfmadd231pd %%xmm1, %%xmm4, %%xmm12 \n\t" - "vmovddup 9 * 8(%%rbx), %%xmm5 \n\t" - "vfmadd231pd %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231pd %%xmm1, %%xmm5, %%xmm13 \n\t" - "vmovddup 10 * 8(%%rbx), %%xmm6 \n\t" - "vfmadd231pd %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231pd %%xmm1, %%xmm6, %%xmm14 \n\t" - "vmovddup 11 * 8(%%rbx), %%xmm7 \n\t" - "vfmadd231pd %%xmm0, %%xmm7, %%xmm11 \n\t" - "vmovaps 12 * 8(%%rax), %%xmm0 \n\t" - "vmovddup 12 * 8(%%rbx), %%xmm4 \n\t" - "vfmadd231pd %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" // iteration 7 - "vfmadd231pd %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps 14 * 8(%%rax), %%xmm1 \n\t" - "addq $8 * 2 * 16, %%rax \n\t" // a += 8*2 (unroll x mr) - "vfmadd231pd %%xmm1, %%xmm4, %%xmm12 \n\t" - "vmovddup 13 * 8(%%rbx), %%xmm5 \n\t" - "vfmadd231pd %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231pd %%xmm1, %%xmm5, %%xmm13 \n\t" - "vmovddup 14 * 8(%%rbx), %%xmm6 \n\t" - "vfmadd231pd %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231pd %%xmm1, %%xmm6, %%xmm14 \n\t" - "vmovddup 15 * 8(%%rbx), %%xmm7 \n\t" - "addq $8 * 2 * 16, %%rbx \n\t" // b += 8*2 (unroll x nr) - "vfmadd231pd %%xmm0, %%xmm7, %%xmm11 \n\t" - "vfmadd231pd %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jmp .ZLOOPKITER \n\t" // jump to beginning of loop. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".ZCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .ZPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. 
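// k_iter/k_left is the usual unroll-and-cleanup split: the main loop above
// retires eight rank-1 updates per pass, and the edge loop below mops up
// the remainder one k-step at a time. A hedged sketch of how the two
// counters are typically derived (k0 is the full k dimension):

uint64_t k_iter = k0 / 8; // passes through the 8x-unrolled main loop
uint64_t k_left = k0 % 8; // leftover k-steps for the edge loop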
- " \n\t" - " \n\t" - ".ZLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - " \n\t" - "je .ZPOSTACCUM \n\t" // if i == 0, we're done. - " \n\t" - "prefetcht0 256(%%rbx) \n\t" - " \n\t" - "prefetcht0 512(%%rax) \n\t" - " \n\t" - " \n\t" // iteration 0 - "vmovaps -16 * 8(%%rax), %%xmm0 \n\t" - "vmovddup -16 * 8(%%rbx), %%xmm4 \n\t" - "vfmadd231pd %%xmm0, %%xmm4, %%xmm8 \n\t" - "vmovaps -14 * 8(%%rax), %%xmm1 \n\t" - "vfmadd231pd %%xmm1, %%xmm4, %%xmm12 \n\t" - "vmovddup -15 * 8(%%rbx), %%xmm5 \n\t" - "vfmadd231pd %%xmm0, %%xmm5, %%xmm9 \n\t" - "vfmadd231pd %%xmm1, %%xmm5, %%xmm13 \n\t" - "vmovddup -14 * 8(%%rbx), %%xmm6 \n\t" - "vfmadd231pd %%xmm0, %%xmm6, %%xmm10 \n\t" - "vfmadd231pd %%xmm1, %%xmm6, %%xmm14 \n\t" - "vmovddup -13 * 8(%%rbx), %%xmm7 \n\t" - "vfmadd231pd %%xmm0, %%xmm7, %%xmm11 \n\t" - "vfmadd231pd %%xmm1, %%xmm7, %%xmm15 \n\t" - " \n\t" - " \n\t" - "addq $1 * 2 * 16, %%rax \n\t" // a += 1*2 (1 x mr) - "addq $1 * 2 * 16, %%rbx \n\t" // b += 1*2 (1 x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jmp .ZLOOPKLEFT \n\t" // jump to beginning of loop. - " \n\t" - " \n\t" - " \n\t" - ".ZPOSTACCUM: \n\t" - " \n\t" - " \n\t" - "prefetchw 0 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetchw 0 * 8(%%r10) \n\t" // prefetch c + 1*cs_c - " \n\t" - " \n\t" - "vpermilpd $0x1, %%xmm9, %%xmm9 \n\t" - "vpermilpd $0x1, %%xmm11, %%xmm11 \n\t" - "vpermilpd $0x1, %%xmm13, %%xmm13 \n\t" - "vpermilpd $0x1, %%xmm15, %%xmm15 \n\t" - " \n\t" - "vaddsubpd %%xmm9, %%xmm8, %%xmm8 \n\t" - "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" - "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" - "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" - " \n\t" - " \n\t" - " \n\t" // xmm8: xmm10: - " \n\t" // ( ab00 ( ab01 - " \n\t" // ab10 ) ab11 ) - " \n\t" - " \n\t" // xmm12: xmm14: - " \n\t" // ( ab20 ( ab21 - " \n\t" // ab30 ) ab31 ) - " \n\t" - " \n\t" - "prefetcht0 (%%r14) \n\t" // prefetch a_next - "prefetcht0 64(%%r14) \n\t" // prefetch a_next - " \n\t" - " \n\t" - " \n\t" // scale by alpha - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "vmovddup (%%rax), %%xmm0 \n\t" // load alpha_r and duplicate - "vmovddup 8(%%rax), %%xmm1 \n\t" // load alpha_i and duplicate - " \n\t" - "vpermilpd $0x1, %%xmm8, %%xmm9 \n\t" - "vpermilpd $0x1, %%xmm10, %%xmm11 \n\t" - "vpermilpd $0x1, %%xmm12, %%xmm13 \n\t" - "vpermilpd $0x1, %%xmm14, %%xmm15 \n\t" - " \n\t" - "vmulpd %%xmm8, %%xmm0, %%xmm8 \n\t" - "vmulpd %%xmm10, %%xmm0, %%xmm10 \n\t" - "vmulpd %%xmm12, %%xmm0, %%xmm12 \n\t" - "vmulpd %%xmm14, %%xmm0, %%xmm14 \n\t" - " \n\t" - "vmulpd %%xmm9, %%xmm1, %%xmm9 \n\t" - "vmulpd %%xmm11, %%xmm1, %%xmm11 \n\t" - "vmulpd %%xmm13, %%xmm1, %%xmm13 \n\t" - "vmulpd %%xmm15, %%xmm1, %%xmm15 \n\t" - " \n\t" - "vaddsubpd %%xmm9, %%xmm8, %%xmm8 \n\t" - "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" - "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" - "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of beta - "vmovddup (%%rbx), %%xmm6 \n\t" // load beta_r and duplicate - "vmovddup 8(%%rbx), %%xmm7 \n\t" // load beta_i and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(dcomplex) - "leaq (,%%rsi,2), %%rsi \n\t" - //"leaq (%%rcx,%%rsi,2), %%rdx \n\t" // load address of c + 2*rs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "prefetcht0 (%%r15) \n\t" // prefetch b_next - "prefetcht0 64(%%r15) \n\t" // prefetch b_next - " \n\t" - " \n\t" - " \n\t" - " 
\n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 16*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (16*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" // set xmm0 to zero. - "vucomisd %%xmm0, %%xmm6 \n\t" // set ZF if beta_r == 0. - "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); - "vucomisd %%xmm0, %%xmm7 \n\t" // set ZF if beta_i == 0. - "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); - "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. - "jne .ZBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .ZCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".ZGENSTORED: \n\t" - " \n\t" - " \n\t" - "vmovups (%%rcx), %%xmm0 \n\t" // load c00 - "vmovups (%%rcx,%%rsi), %%xmm2 \n\t" // load c10 - "vpermilpd $0x1, %%xmm0, %%xmm1 \n\t" - "vpermilpd $0x1, %%xmm2, %%xmm3 \n\t" - " \n\t" - "vmulpd %%xmm6, %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm7, %%xmm1, %%xmm1 \n\t" - "vaddsubpd %%xmm1, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm8, %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%%rcx) \n\t" // store c00 - " \n\t" - "vmulpd %%xmm6, %%xmm2, %%xmm2 \n\t" - "vmulpd %%xmm7, %%xmm3, %%xmm3 \n\t" - "vaddsubpd %%xmm3, %%xmm2, %%xmm2 \n\t" - "vaddpd %%xmm12, %%xmm2, %%xmm2 \n\t" - "vmovups %%xmm2, (%%rcx,%%rsi) \n\t" // store c10 - " \n\t" - " \n\t" - " \n\t" - "vmovups (%%r10), %%xmm0 \n\t" // load c01 - "vmovups (%%r10,%%rsi), %%xmm2 \n\t" // load c11 - "vpermilpd $0x1, %%xmm0, %%xmm1 \n\t" - "vpermilpd $0x1, %%xmm2, %%xmm3 \n\t" - " \n\t" - "vmulpd %%xmm6, %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm7, %%xmm1, %%xmm1 \n\t" - "vaddsubpd %%xmm1, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm10, %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%%r10) \n\t" // store c01 - " \n\t" - "vmulpd %%xmm6, %%xmm2, %%xmm2 \n\t" - "vmulpd %%xmm7, %%xmm3, %%xmm3 \n\t" - "vaddsubpd %%xmm3, %%xmm2, %%xmm2 \n\t" - "vaddpd %%xmm14, %%xmm2, %%xmm2 \n\t" - "vmovups %%xmm2, (%%r10,%%rsi) \n\t" // store c11 - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".ZCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vmovups (%%rcx), %%xmm0 \n\t" // load c00 - "vmovups 16(%%rcx), %%xmm2 \n\t" // load c10 - "vpermilpd $0x1, %%xmm0, %%xmm1 \n\t" - "vpermilpd $0x1, %%xmm2, %%xmm3 \n\t" - " \n\t" - "vmulpd %%xmm6, %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm7, %%xmm1, %%xmm1 \n\t" - "vaddsubpd %%xmm1, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm8, %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%%rcx) \n\t" // store c00 - " \n\t" - "vmulpd %%xmm6, %%xmm2, %%xmm2 \n\t" - "vmulpd %%xmm7, %%xmm3, %%xmm3 \n\t" - "vaddsubpd %%xmm3, %%xmm2, %%xmm2 \n\t" - "vaddpd %%xmm12, %%xmm2, %%xmm2 \n\t" - "vmovups %%xmm2, 16(%%rcx) \n\t" // store c10 - " \n\t" - " \n\t" - " \n\t" - "vmovups (%%r10), %%xmm0 \n\t" // load c01 - "vmovups 16(%%r10), %%xmm2 \n\t" // load c11 - "vpermilpd $0x1, %%xmm0, %%xmm1 \n\t" - "vpermilpd $0x1, %%xmm2, %%xmm3 \n\t" - " \n\t" - "vmulpd %%xmm6, %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm7, %%xmm1, %%xmm1 \n\t" - "vaddsubpd %%xmm1, %%xmm0, %%xmm0 \n\t" - "vaddpd %%xmm10, %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%%r10) \n\t" // store c01 - " \n\t" - "vmulpd %%xmm6, %%xmm2, %%xmm2 \n\t" - "vmulpd %%xmm7, %%xmm3, %%xmm3 \n\t" - "vaddsubpd %%xmm3, %%xmm2, %%xmm2 \n\t" - "vaddpd %%xmm14, %%xmm2, %%xmm2 \n\t" - "vmovups %%xmm2, 16(%%r10) \n\t" // store c11 - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .ZCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".ZGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovups %%xmm8, (%%rcx) \n\t" // store c00 - "vmovups %%xmm12, (%%rcx,%%rsi) \n\t" // store c10 - " \n\t" - "vmovups %%xmm10, (%%r10) \n\t" // store c01 - "vmovups %%xmm14, (%%r10,%%rsi) \n\t" // store c11 - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovups %%xmm8, (%%rcx) \n\t" // store c00 - "vmovups %%xmm12, 16(%%rcx) \n\t" // store c10 - " \n\t" - "vmovups %%xmm10, (%%r10) \n\t" // store c01 - "vmovups %%xmm14, 16(%%r10) \n\t" // store c11 - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".ZDONE: \n\t" - " \n\t" + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + mov(%9, r15) // load address of b_next. + mov(%10, r14) // load address of a_next. + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) + lea(mem(, rdi, 2), rdi) + lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; + + add(imm(16*8), rax) + add(imm(16*8), rbx) + + vxorpd(xmm8, xmm8, xmm8) + vxorpd(xmm9, xmm9, xmm9) + vxorpd(xmm10, xmm10, xmm10) + vxorpd(xmm11, xmm11, xmm11) + vxorpd(xmm12, xmm12, xmm12) + vxorpd(xmm13, xmm13, xmm13) + vxorpd(xmm14, xmm14, xmm14) + vxorpd(xmm15, xmm15, xmm15) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.ZCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.ZLOOPKITER) // MAIN LOOP + + + je(.ZCONSIDKLEFT) // if i == 0, jump to k_left code. 
+ + + prefetch(0, mem(rbx, 256)) + + prefetch(0, mem(rax, 512)) + + // iteration 0 + vmovaps(mem(rax, -16*8), xmm0) + vmovddup(mem(rbx, -16*8), xmm4) + vfmadd231pd(xmm0, xmm4, xmm8) + vmovaps(mem(rax, -14*8), xmm1) + vfmadd231pd(xmm1, xmm4, xmm12) + vmovddup(mem(rbx, -15*8), xmm5) + vfmadd231pd(xmm0, xmm5, xmm9) + vfmadd231pd(xmm1, xmm5, xmm13) + vmovddup(mem(rbx, -14*8), xmm6) + vfmadd231pd(xmm0, xmm6, xmm10) + vfmadd231pd(xmm1, xmm6, xmm14) + vmovddup(mem(rbx, -13*8), xmm7) + vfmadd231pd(xmm0, xmm7, xmm11) + vmovaps(mem(rax, -12*8), xmm0) + vmovddup(mem(rbx, -12*8), xmm4) + vfmadd231pd(xmm1, xmm7, xmm15) + + // iteration 1 + vfmadd231pd(xmm0, xmm4, xmm8) + vmovaps(mem(rax, -10*8), xmm1) + vfmadd231pd(xmm1, xmm4, xmm12) + vmovddup(mem(rbx, -11*8), xmm5) + vfmadd231pd(xmm0, xmm5, xmm9) + vfmadd231pd(xmm1, xmm5, xmm13) + vmovddup(mem(rbx, -10*8), xmm6) + vfmadd231pd(xmm0, xmm6, xmm10) + vfmadd231pd(xmm1, xmm6, xmm14) + vmovddup(mem(rbx, -9*8), xmm7) + vfmadd231pd(xmm0, xmm7, xmm11) + vmovaps(mem(rax, -8*8), xmm0) + vmovddup(mem(rbx, -8*8), xmm4) + vfmadd231pd(xmm1, xmm7, xmm15) + + prefetch(0, mem(rbx, 64+256)) + + prefetch(0, mem(rax, 64+512)) + + // iteration 2 + vfmadd231pd(xmm0, xmm4, xmm8) + vmovaps(mem(rax, -6*8), xmm1) + vfmadd231pd(xmm1, xmm4, xmm12) + vmovddup(mem(rbx, -7*8), xmm5) + vfmadd231pd(xmm0, xmm5, xmm9) + vfmadd231pd(xmm1, xmm5, xmm13) + vmovddup(mem(rbx, -6*8), xmm6) + vfmadd231pd(xmm0, xmm6, xmm10) + vfmadd231pd(xmm1, xmm6, xmm14) + vmovddup(mem(rbx, -5*8), xmm7) + vfmadd231pd(xmm0, xmm7, xmm11) + vmovaps(mem(rax, -4*8), xmm0) + vmovddup(mem(rbx, -4*8), xmm4) + vfmadd231pd(xmm1, xmm7, xmm15) + + // iteration 3 + vfmadd231pd(xmm0, xmm4, xmm8) + vmovaps(mem(rax, -2*8), xmm1) + vfmadd231pd(xmm1, xmm4, xmm12) + vmovddup(mem(rbx, -3*8), xmm5) + vfmadd231pd(xmm0, xmm5, xmm9) + vfmadd231pd(xmm1, xmm5, xmm13) + vmovddup(mem(rbx, -2*8), xmm6) + vfmadd231pd(xmm0, xmm6, xmm10) + vfmadd231pd(xmm1, xmm6, xmm14) + vmovddup(mem(rbx, -1*8), xmm7) + vfmadd231pd(xmm0, xmm7, xmm11) + vmovaps(mem(rax, 0*8), xmm0) + vmovddup(mem(rbx, 0*8), xmm4) + vfmadd231pd(xmm1, xmm7, xmm15) + + prefetch(0, mem(rbx, 128+256)) + + prefetch(0, mem(rax, 128+512)) + + // iteration 4 + vfmadd231pd(xmm0, xmm4, xmm8) + vmovaps(mem(rax, 2*8), xmm1) + vfmadd231pd(xmm1, xmm4, xmm12) + vmovddup(mem(rbx, 1*8), xmm5) + vfmadd231pd(xmm0, xmm5, xmm9) + vfmadd231pd(xmm1, xmm5, xmm13) + vmovddup(mem(rbx, 2*8), xmm6) + vfmadd231pd(xmm0, xmm6, xmm10) + vfmadd231pd(xmm1, xmm6, xmm14) + vmovddup(mem(rbx, 3*8), xmm7) + vfmadd231pd(xmm0, xmm7, xmm11) + vmovaps(mem(rax, 4*8), xmm0) + vmovddup(mem(rbx, 4*8), xmm4) + vfmadd231pd(xmm1, xmm7, xmm15) + + // iteration 5 + vfmadd231pd(xmm0, xmm4, xmm8) + vmovaps(mem(rax, 6*8), xmm1) + vfmadd231pd(xmm1, xmm4, xmm12) + vmovddup(mem(rbx, 5*8), xmm5) + vfmadd231pd(xmm0, xmm5, xmm9) + vfmadd231pd(xmm1, xmm5, xmm13) + vmovddup(mem(rbx, 6*8), xmm6) + vfmadd231pd(xmm0, xmm6, xmm10) + vfmadd231pd(xmm1, xmm6, xmm14) + vmovddup(mem(rbx, 7*8), xmm7) + vfmadd231pd(xmm0, xmm7, xmm11) + vmovaps(mem(rax, 8*8), xmm0) + vmovddup(mem(rbx, 8*8), xmm4) + vfmadd231pd(xmm1, xmm7, xmm15) + + prefetch(0, mem(rbx, 128+256)) + + prefetch(0, mem(rax, 128+512)) + + // iteration 6 + vfmadd231pd(xmm0, xmm4, xmm8) + vmovaps(mem(rax, 10*8), xmm1) + vfmadd231pd(xmm1, xmm4, xmm12) + vmovddup(mem(rbx, 9*8), xmm5) + vfmadd231pd(xmm0, xmm5, xmm9) + vfmadd231pd(xmm1, xmm5, xmm13) + vmovddup(mem(rbx, 10*8), xmm6) + vfmadd231pd(xmm0, xmm6, xmm10) + vfmadd231pd(xmm1, xmm6, xmm14) + vmovddup(mem(rbx, 11*8), xmm7) + 
vfmadd231pd(xmm0, xmm7, xmm11) + vmovaps(mem(rax, 12*8), xmm0) + vmovddup(mem(rbx, 12*8), xmm4) + vfmadd231pd(xmm1, xmm7, xmm15) + + // iteration 7 + vfmadd231pd(xmm0, xmm4, xmm8) + vmovaps(mem(rax, 14*8), xmm1) + add(imm(8*2*16), rax) // a += 8*2 (unroll x mr) + vfmadd231pd(xmm1, xmm4, xmm12) + vmovddup(mem(rbx, 13*8), xmm5) + vfmadd231pd(xmm0, xmm5, xmm9) + vfmadd231pd(xmm1, xmm5, xmm13) + vmovddup(mem(rbx, 14*8), xmm6) + vfmadd231pd(xmm0, xmm6, xmm10) + vfmadd231pd(xmm1, xmm6, xmm14) + vmovddup(mem(rbx, 15*8), xmm7) + add(imm(8*2*16), rbx) // b += 8*2 (unroll x nr) + vfmadd231pd(xmm0, xmm7, xmm11) + vfmadd231pd(xmm1, xmm7, xmm15) + + + + dec(rsi) // i -= 1; + jmp(.ZLOOPKITER) // jump to beginning of loop. + + + + + + + label(.ZCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.ZLOOPKLEFT) // EDGE LOOP + + + je(.ZPOSTACCUM) // if i == 0, we're done. + + prefetch(0, mem(rbx, 256)) + + prefetch(0, mem(rax, 512)) + + // iteration 0 + vmovaps(mem(rax, -16*8), xmm0) + vmovddup(mem(rbx, -16*8), xmm4) + vfmadd231pd(xmm0, xmm4, xmm8) + vmovaps(mem(rax, -14*8), xmm1) + vfmadd231pd(xmm1, xmm4, xmm12) + vmovddup(mem(rbx, -15*8), xmm5) + vfmadd231pd(xmm0, xmm5, xmm9) + vfmadd231pd(xmm1, xmm5, xmm13) + vmovddup(mem(rbx, -14*8), xmm6) + vfmadd231pd(xmm0, xmm6, xmm10) + vfmadd231pd(xmm1, xmm6, xmm14) + vmovddup(mem(rbx, -13*8), xmm7) + vfmadd231pd(xmm0, xmm7, xmm11) + vfmadd231pd(xmm1, xmm7, xmm15) + + + add(imm(1*2*16), rax) // a += 1*2 (1 x mr) + add(imm(1*2*16), rbx) // b += 1*2 (1 x nr) + + + dec(rsi) // i -= 1; + jmp(.ZLOOPKLEFT) // jump to beginning of loop. + + + + label(.ZPOSTACCUM) + + + prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c + prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c + + + vpermilpd(imm(0x1), xmm9, xmm9) + vpermilpd(imm(0x1), xmm11, xmm11) + vpermilpd(imm(0x1), xmm13, xmm13) + vpermilpd(imm(0x1), xmm15, xmm15) + + vaddsubpd(xmm9, xmm8, xmm8) + vaddsubpd(xmm11, xmm10, xmm10) + vaddsubpd(xmm13, xmm12, xmm12) + vaddsubpd(xmm15, xmm14, xmm14) + + + // xmm8: xmm10: + // ( ab00 ( ab01 + // ab10 ) ab11 ) + + // xmm12: xmm14: + // ( ab20 ( ab21 + // ab30 ) ab31 ) + + + prefetch(0, mem(r14)) // prefetch a_next + prefetch(0, mem(r14, 64)) // prefetch a_next + + + // scale by alpha + + mov(%4, rax) // load address of alpha + vmovddup(mem(rax), xmm0) // load alpha_r and duplicate + vmovddup(mem(rax, 8), xmm1) // load alpha_i and duplicate + + vpermilpd(imm(0x1), xmm8, xmm9) + vpermilpd(imm(0x1), xmm10, xmm11) + vpermilpd(imm(0x1), xmm12, xmm13) + vpermilpd(imm(0x1), xmm14, xmm15) + + vmulpd(xmm8, xmm0, xmm8) + vmulpd(xmm10, xmm0, xmm10) + vmulpd(xmm12, xmm0, xmm12) + vmulpd(xmm14, xmm0, xmm14) + + vmulpd(xmm9, xmm1, xmm9) + vmulpd(xmm11, xmm1, xmm11) + vmulpd(xmm13, xmm1, xmm13) + vmulpd(xmm15, xmm1, xmm15) + + vaddsubpd(xmm9, xmm8, xmm8) + vaddsubpd(xmm11, xmm10, xmm10) + vaddsubpd(xmm13, xmm12, xmm12) + vaddsubpd(xmm15, xmm14, xmm14) + + + + + mov(%5, rbx) // load address of beta + vmovddup(mem(rbx), xmm6) // load beta_r and duplicate + vmovddup(mem(rbx, 8), xmm7) // load beta_i and duplicate + + + + + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) + lea(mem(, rsi, 2), rsi) + //lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; + + + + + + prefetch(0, mem(r15)) // prefetch b_next + prefetch(0, mem(r15, 64)) // prefetch b_next + + + + // determine if + // c % 32 == 0, AND + // 16*cs_c 
% 32 == 0, AND + // rs_c == 1 + // ie: aligned, ldim aligned, and + // column-stored + + cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16. + sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); + test(imm(31), rcx) // set ZF if c & 32 is zero. + setz(bh) // bh = ( ZF == 0 ? 1 : 0 ); + test(imm(31), rdi) // set ZF if (16*cs_c) & 32 is zero. + setz(al) // al = ( ZF == 0 ? 1 : 0 ); + // and(bl,bh) followed by + // and(bh,al) will reveal result + + // now avoid loading C if beta == 0 + + vxorpd(xmm0, xmm0, xmm0) // set xmm0 to zero. + vucomisd(xmm0, xmm6) // set ZF if beta_r == 0. + sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); + vucomisd(xmm0, xmm7) // set ZF if beta_i == 0. + sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); + and(r8b, r9b) // set ZF if r8b & r9b == 1. + jne(.ZBETAZERO) // if ZF = 0, jump to beta == 0 case + + + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. + jne(.ZCOLSTORED) // jump to column storage case + + + + label(.ZGENSTORED) + + + vmovups(mem(rcx), xmm0) // load c00 + vmovups(mem(rcx, rsi, 1), xmm2) // load c10 + vpermilpd(imm(0x1), xmm0, xmm1) + vpermilpd(imm(0x1), xmm2, xmm3) + + vmulpd(xmm6, xmm0, xmm0) + vmulpd(xmm7, xmm1, xmm1) + vaddsubpd(xmm1, xmm0, xmm0) + vaddpd(xmm8, xmm0, xmm0) + vmovups(xmm0, mem(rcx)) // store c00 + + vmulpd(xmm6, xmm2, xmm2) + vmulpd(xmm7, xmm3, xmm3) + vaddsubpd(xmm3, xmm2, xmm2) + vaddpd(xmm12, xmm2, xmm2) + vmovups(xmm2, mem(rcx, rsi, 1)) // store c10 + + + + vmovups(mem(r10), xmm0) // load c01 + vmovups(mem(r10, rsi, 1), xmm2) // load c11 + vpermilpd(imm(0x1), xmm0, xmm1) + vpermilpd(imm(0x1), xmm2, xmm3) + + vmulpd(xmm6, xmm0, xmm0) + vmulpd(xmm7, xmm1, xmm1) + vaddsubpd(xmm1, xmm0, xmm0) + vaddpd(xmm10, xmm0, xmm0) + vmovups(xmm0, mem(r10)) // store c01 + + vmulpd(xmm6, xmm2, xmm2) + vmulpd(xmm7, xmm3, xmm3) + vaddsubpd(xmm3, xmm2, xmm2) + vaddpd(xmm14, xmm2, xmm2) + vmovups(xmm2, mem(r10, rsi, 1)) // store c11 + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZCOLSTORED) + + + vmovups(mem(rcx), xmm0) // load c00 + vmovups(mem(rcx, 16), xmm2) // load c10 + vpermilpd(imm(0x1), xmm0, xmm1) + vpermilpd(imm(0x1), xmm2, xmm3) + + vmulpd(xmm6, xmm0, xmm0) + vmulpd(xmm7, xmm1, xmm1) + vaddsubpd(xmm1, xmm0, xmm0) + vaddpd(xmm8, xmm0, xmm0) + vmovups(xmm0, mem(rcx)) // store c00 + + vmulpd(xmm6, xmm2, xmm2) + vmulpd(xmm7, xmm3, xmm3) + vaddsubpd(xmm3, xmm2, xmm2) + vaddpd(xmm12, xmm2, xmm2) + vmovups(xmm2, mem(rcx, 16)) // store c10 + + + + vmovups(mem(r10), xmm0) // load c01 + vmovups(mem(r10, 16), xmm2) // load c11 + vpermilpd(imm(0x1), xmm0, xmm1) + vpermilpd(imm(0x1), xmm2, xmm3) + + vmulpd(xmm6, xmm0, xmm0) + vmulpd(xmm7, xmm1, xmm1) + vaddsubpd(xmm1, xmm0, xmm0) + vaddpd(xmm10, xmm0, xmm0) + vmovups(xmm0, mem(r10)) // store c01 + + vmulpd(xmm6, xmm2, xmm2) + vmulpd(xmm7, xmm3, xmm3) + vaddsubpd(xmm3, xmm2, xmm2) + vaddpd(xmm14, xmm2, xmm2) + vmovups(xmm2, mem(r10, 16)) // store c11 + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZBETAZERO) + // check if aligned/column-stored + // check if aligned/column-stored + and(bl, bh) // set ZF if bl & bh == 1. + and(bh, al) // set ZF if bh & al == 1. + jne(.ZCOLSTORBZ) // jump to column storage case + + + + label(.ZGENSTORBZ) + + + vmovups(xmm8, mem(rcx)) // store c00 + vmovups(xmm12, mem(rcx, rsi, 1)) // store c10 + + vmovups(xmm10, mem(r10)) // store c01 + vmovups(xmm14, mem(r10, rsi, 1)) // store c11 + + + + jmp(.ZDONE) // jump to end. 
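// The .ZBETAZERO paths never read the old contents of C: once both beta_r
// and beta_i compare equal to zero, the kernel stores alpha*AB directly.
// Besides saving a load of C, this keeps 0 * NaN (or 0 * Inf) from leaking
// out of uninitialized C. A hedged elementwise sketch of the dispatch
// (c_ij, alpha_ab_ij, beta are illustrative names):

if ( beta_r == 0.0 && beta_i == 0.0 )
    c_ij = alpha_ab_ij;               // store only (.Z*STORBZ paths)
else
    c_ij = alpha_ab_ij + beta * c_ij; // load, scale, accumulate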
+ + + + label(.ZCOLSTORBZ) + + + vmovups(xmm8, mem(rcx)) // store c00 + vmovups(xmm12, mem(rcx, 16)) // store c10 + + vmovups(xmm10, mem(r10)) // store c01 + vmovups(xmm14, mem(r10, 16)) // store c11 + + + + + + label(.ZDONE) + : // output operands (none) : // input operands @@ -2709,3 +2712,4 @@ void bli_zgemm_piledriver_asm_2x2 ); } + diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c index f1d73063e..20946c3c5 100644 --- a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c +++ b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c @@ -37,6 +37,9 @@ #include "blis.h" +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + void bli_sgemm_sandybridge_asm_8x8 ( dim_t k0, @@ -61,943 +64,943 @@ void bli_sgemm_sandybridge_asm_8x8 __asm__ volatile ( - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. - " \n\t" - "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading - "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b. - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float) - "leaq (%%rcx,%%rdi,4), %%r10 \n\t" // load address of c + 4*cs_c; - " \n\t" - "leaq (%%rdi,%%rdi,2), %%r14 \n\t" // r14 = 3*cs_c; - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c - "prefetcht0 7 * 8(%%rcx,%%r14) \n\t" // prefetch c + 3*cs_c - "prefetcht0 7 * 8(%%r10) \n\t" // prefetch c + 4*cs_c - "prefetcht0 7 * 8(%%r10,%%rdi) \n\t" // prefetch c + 5*cs_c - "prefetcht0 7 * 8(%%r10,%%rdi,2) \n\t" // prefetch c + 6*cs_c - "prefetcht0 7 * 8(%%r10,%%r14) \n\t" // prefetch c + 7*cs_c - " \n\t" - "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" - "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. 
- " \n\t" - " \n\t" - ".SLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 16 * 32(%%rax) \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddps %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vaddps %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddps %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddps %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vaddps %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddps %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovshdup 1 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddps %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddps %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddps %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovsldup 2 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddps %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddps %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddps %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 18 * 32(%%rax) \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovshdup 2 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddps %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "vmovaps 3 * 32(%%rax), %%ymm1 \n\t" - "addq $4 * 8 * 4, %%rax \n\t" // a += 4*8 (unroll x mr) - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vaddps %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddps %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovsldup 3 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddps %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vaddps %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddps %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // 
iteration 3 - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovshdup 3 * 32(%%rbx), %%ymm2 \n\t" - "addq $4 * 8 * 4, %%rbx \n\t" // b += 4*8 (unroll x nr) - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddps %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddps %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddps %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddps %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddps %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddps %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".SLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddps %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" - "addq $8 * 1 * 4, %%rax \n\t" // a += 8 (1 x mr) - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vaddps %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddps %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t" - "addq $8 * 1 * 4, %%rbx \n\t" // b += 8 (1 x nr) - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddps %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovaps %%ymm1, %%ymm0 \n\t" - "vaddps %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddps %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".SPOSTACCUM: \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab02 ( ab04 ( ab06 - " \n\t" // ab10 ab12 ab14 ab16 - " \n\t" // ab22 ab20 ab26 ab24 - " \n\t" // ab32 ab30 ab36 ab34 - " \n\t" // ab44 ab46 ab40 ab42 - " \n\t" // ab54 ab56 ab50 ab52 - " \n\t" // ab66 ab64 ab62 ab60 - " \n\t" // ab76 ) ab74 ) ab72 ) ab70 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab01 ( ab03 ( ab05 ( ab07 - " \n\t" // ab11 ab13 ab15 ab17 - " \n\t" // ab23 ab21 ab27 ab25 - " \n\t" // ab33 ab31 ab37 ab35 - " \n\t" // ab45 ab47 ab41 ab43 - " \n\t" // ab55 ab57 ab51 ab53 - " \n\t" // ab67 ab65 ab63 ab61 - " \n\t" // ab77 ) ab75 ) ab73 ) ab71 ) - " \n\t" - "vmovaps %%ymm15, %%ymm7 \n\t" - "vshufps $0xe4, %%ymm13, %%ymm15, %%ymm15 \n\t" - "vshufps $0xe4, %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmovaps %%ymm11, %%ymm7 \n\t" - "vshufps $0xe4, %%ymm9, %%ymm11, %%ymm11 \n\t" - "vshufps $0xe4, %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmovaps %%ymm14, %%ymm7 \n\t" - "vshufps $0xe4, %%ymm12, %%ymm14, %%ymm14 \n\t" - "vshufps $0xe4, %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmovaps %%ymm10, %%ymm7 \n\t" - "vshufps $0xe4, %%ymm8, %%ymm10, %%ymm10 \n\t" - "vshufps $0xe4, %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab02 ( ab04 ( ab06 - " \n\t" // ab10 ab12 ab14 ab16 - " \n\t" // ab20 ab22 ab24 ab26 - " \n\t" // ab30 ab32 ab34 ab36 - " \n\t" // ab44 ab46 ab40 ab42 - " \n\t" // ab54 ab56 ab50 ab52 - " \n\t" // ab64 ab66 ab60 ab62 - " \n\t" // ab74 ) ab76 ) ab70 ) ab72 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab01 ( ab03 ( ab05 ( ab07 - " \n\t" // ab11 ab13 ab15 ab17 - " \n\t" // ab21 ab23 ab25 ab27 - " \n\t" // ab31 ab33 ab35 ab37 - " \n\t" // ab45 ab47 ab41 ab43 - " \n\t" // ab55 ab57 ab51 ab53 - " \n\t" // ab65 ab67 ab61 ab63 - " \n\t" // ab75 ) ab77 ) ab71 ) ab73 ) - " \n\t" - "vmovaps %%ymm15, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm11, %%ymm15, %%ymm15 \n\t" - "vperm2f128 $0x12, %%ymm11, %%ymm7, %%ymm11 \n\t" - " \n\t" - "vmovaps %%ymm13, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm9, %%ymm13, %%ymm13 \n\t" - "vperm2f128 $0x12, %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmovaps %%ymm14, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm10, %%ymm14, %%ymm14 \n\t" - "vperm2f128 $0x12, %%ymm10, %%ymm7, %%ymm10 \n\t" - " \n\t" - "vmovaps %%ymm12, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm8, %%ymm12, %%ymm12 \n\t" - "vperm2f128 $0x12, %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab02 ( ab04 ( ab06 - " \n\t" // ab10 ab12 ab14 ab16 - " \n\t" // ab20 ab22 ab24 ab26 - " \n\t" // ab30 ab32 ab34 ab36 - " \n\t" // ab40 ab42 ab44 ab46 - " \n\t" // ab50 ab52 ab54 ab56 - " \n\t" // ab60 ab62 ab64 ab66 - " \n\t" // ab70 ) ab72 ) ab74 ) ab76 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab01 ( ab03 ( ab05 ( ab07 - " \n\t" // ab11 ab13 ab15 ab17 - " \n\t" // ab21 ab23 ab25 ab27 - " \n\t" // ab31 ab33 ab35 ab37 - " \n\t" // ab41 ab43 ab45 ab47 - " \n\t" // ab51 ab53 ab55 ab57 - " \n\t" // ab61 ab63 ab65 ab67 - " \n\t" // ab71 ) ab73 ) ab75 ) ab77 ) - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate - "vbroadcastss (%%rbx), %%ymm4 \n\t" // load beta and duplicate - " \n\t" - "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulps 
%%ymm0, %%ymm10, %%ymm10 \n\t" - "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" - "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t" - "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; - " \n\t" - "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c; - "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomiss %%xmm0, %%xmm4 \n\t" // set ZF if beta == 0. - "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. - "jz .SCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".SGENSTORED: \n\t" - " \n\t" - " \n\t" // update c00:c70 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result, - " \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c01:c71 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result, - " \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, 
(%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c02:c72 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result, - " \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c03:c73 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result, - " \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c04:c74 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - 
"vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result, - " \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c05:c75 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result, - " \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c06:c76 - "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result, - " \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c07:c77 - "vmovlps (%%rcx), %%xmm0, 
%%xmm0 \n\t" - "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t" - "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t" - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" - "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t" - "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t" - "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t" - "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t" - "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t" - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" - " \n\t" - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result, - " \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vmovups (%%rcx), %%ymm0 \n\t" // load c00:c70, - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups (%%rcx), %%ymm1 \n\t" // load c01:c71, - "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, - "vaddps %%ymm14, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups (%%rcx), %%ymm0 \n\t" // load c02:c72, - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups (%%rcx), %%ymm1 \n\t" // load c03:c73, - "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, - "vaddps %%ymm12, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups (%%rcx), %%ymm0 \n\t" // load c04:c74, - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups (%%rcx), %%ymm1 \n\t" // load c05:c75, - "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, - "vaddps %%ymm10, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups (%%rcx), %%ymm0 \n\t" // load c06:c76, - "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory. 
- "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups (%%rcx), %%ymm1 \n\t" // load c07:c77, - "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, - "vaddps %%ymm8, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SBETAZERO: \n\t" - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. - "jz .SCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".SGENSTORBZ: \n\t" - " \n\t" - " \n\t" // update c00:c70 - "vmovups %%ymm15, %%ymm0 \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c01:c71 - "vmovups %%ymm14, %%ymm0 \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c02:c72 - "vmovups %%ymm13, %%ymm0 \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c03:c73 - "vmovups %%ymm12, %%ymm0 \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" 
// c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c04:c74 - "vmovups %%ymm11, %%ymm0 \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c05:c75 - "vmovups %%ymm10, %%ymm0 \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c06:c76 - "vmovups %%ymm9, %%ymm0 \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" // update c07:c77 - "vmovups %%ymm8, %%ymm0 \n\t" - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" - "vmovss %%xmm0, (%%rcx) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" - "vmovss %%xmm0, (%%rcx,%%r12) \n\t" - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" - "vmovss %%xmm1, (%%rcx,%%r13) \n\t" - "vmovss %%xmm2, (%%rdx) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%rsi) \n\t" - "vpermilps $0x39, %%xmm3, %%xmm2 \n\t" - "vmovss %%xmm2, (%%rdx,%%r12) \n\t" - "vpermilps $0x39, %%xmm2, %%xmm3 \n\t" - "vmovss %%xmm3, (%%rdx,%%r13) \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm15, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm14, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm13, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm12, (%%rcx) \n\t" // and store back to memory. 
- "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm11, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm10, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm9, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm8, (%%rcx) \n\t" // and store back to memory. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SDONE: \n\t" - " \n\t" - "vzeroupper \n\t" - " \n\t" + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + //mov(%9, r15) // load address of b_next. + + vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading + vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b. + vpermilps(imm(0x4e), ymm2, ymm3) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) + lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c; + + lea(mem(rdi, rdi, 2), r14) // r14 = 3*cs_c; + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rcx, r14, 1, 7*8)) // prefetch c + 3*cs_c + prefetch(0, mem(r10, 7*8)) // prefetch c + 4*cs_c + prefetch(0, mem(r10, rdi, 1, 7*8)) // prefetch c + 5*cs_c + prefetch(0, mem(r10, rdi, 2, 7*8)) // prefetch c + 6*cs_c + prefetch(0, mem(r10, r14, 1, 7*8)) // prefetch c + 7*cs_c + + vxorps(ymm8, ymm8, ymm8) + vxorps(ymm9, ymm9, ymm9) + vxorps(ymm10, ymm10, ymm10) + vxorps(ymm11, ymm11, ymm11) + vxorps(ymm12, ymm12, ymm12) + vxorps(ymm13, ymm13, ymm13) + vxorps(ymm14, ymm14, ymm14) + vxorps(ymm15, ymm15, ymm15) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.SLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 16*32)) + vmulps(ymm0, ymm2, ymm6) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovshdup(mem(rbx, 0*32), ymm2) + vmulps(ymm0, ymm3, ymm7) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + vaddps(ymm15, ymm6, ymm15) + vaddps(ymm13, ymm7, ymm13) + + vmovaps(mem(rax, 1*32), ymm1) + vpermilps(imm(0x4e), ymm2, ymm3) + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vaddps(ymm11, ymm6, ymm11) + vaddps(ymm9, ymm7, ymm9) + + vmulps(ymm0, ymm2, ymm6) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovsldup(mem(rbx, 1*32), ymm2) + vmulps(ymm0, ymm3, ymm7) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + vaddps(ymm14, ymm6, ymm14) + vaddps(ymm12, ymm7, ymm12) + + vpermilps(imm(0x4e), ymm2, ymm3) + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vaddps(ymm10, ymm6, ymm10) + vaddps(ymm8, ymm7, ymm8) + + // iteration 1 + vmulps(ymm1, ymm2, ymm6) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovshdup(mem(rbx, 1*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + vaddps(ymm15, ymm6, ymm15) + vaddps(ymm13, ymm7, ymm13) + + vmovaps(mem(rax, 2*32), ymm0) + vpermilps(imm(0x4e), ymm2, ymm3) + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddps(ymm11, ymm6, ymm11) + vaddps(ymm9, ymm7, ymm9) + + vmulps(ymm1, ymm2, ymm6) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovsldup(mem(rbx, 2*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + vaddps(ymm14, ymm6, ymm14) + vaddps(ymm12, ymm7, ymm12) + + vpermilps(imm(0x4e), ymm2, ymm3) + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddps(ymm10, ymm6, ymm10) + vaddps(ymm8, ymm7, ymm8) + + + // iteration 2 + prefetch(0, mem(rax, 18*32)) + vmulps(ymm0, ymm2, ymm6) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovshdup(mem(rbx, 2*32), ymm2) + vmulps(ymm0, ymm3, ymm7) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + vaddps(ymm15, ymm6, ymm15) + vaddps(ymm13, ymm7, ymm13) + + vmovaps(mem(rax, 3*32), ymm1) + add(imm(4*8*4), rax) // a += 4*8 (unroll x mr) + vpermilps(imm(0x4e), ymm2, ymm3) + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vaddps(ymm11, ymm6, ymm11) + vaddps(ymm9, ymm7, ymm9) + + vmulps(ymm0, ymm2, ymm6) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovsldup(mem(rbx, 3*32), ymm2) + vmulps(ymm0, ymm3, ymm7) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + vaddps(ymm14, ymm6, ymm14) + vaddps(ymm12, ymm7, ymm12) + + vpermilps(imm(0x4e), ymm2, ymm3) + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vaddps(ymm10, ymm6, ymm10) + vaddps(ymm8, ymm7, ymm8) + + + // iteration 3 + vmulps(ymm1, ymm2, ymm6) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovshdup(mem(rbx, 3*32), ymm2) + add(imm(4*8*4), rbx) // b += 4*8 (unroll x nr) + vmulps(ymm1, ymm3, ymm7) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + vaddps(ymm15, ymm6, ymm15) + vaddps(ymm13, ymm7, ymm13) + + vmovaps(mem(rax, 0*32), ymm0) + vpermilps(imm(0x4e), ymm2, ymm3) + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddps(ymm11, ymm6, ymm11) + vaddps(ymm9, ymm7, ymm9) + + vmulps(ymm1, ymm2, ymm6) + vperm2f128(imm(0x03), ymm2, ymm2, ymm4) + vmovsldup(mem(rbx, 0*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vperm2f128(imm(0x03), ymm3, ymm3, ymm5) + vaddps(ymm14, ymm6, ymm14) + vaddps(ymm12, ymm7, ymm12) + + vpermilps(imm(0x4e), ymm2, ymm3) + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddps(ymm10, ymm6, ymm10) + vaddps(ymm8, ymm7, ymm8) + + + + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. 
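+
+	// The C wrapper is assumed to split the iteration count as
+	// k = 4*k_iter + k_left to match the 4x unroll above (a sketch of the
+	// convention, not shown in this hunk); .SLOOPKLEFT below retires the
+	// k_left remainder one rank-1 update at a time, advancing a by
+	// mr = 8 floats and b by nr = 8 floats per iteration.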
+ + + + + + + label(.SCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.SLOOPKLEFT) // EDGE LOOP + + + prefetch(0, mem(rax, 16*32)) + vmulps(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmovshdup(mem(rbx, 0*32), ymm2) + vmulps(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm15, ymm6, ymm15) + vaddps(ymm13, ymm7, ymm13) + + vmovaps(mem(rax, 1*32), ymm1) + add(imm(8*1*4), rax) // a += 8 (1 x mr) + vpermilps(imm(0x4e), ymm2, ymm3) + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vaddps(ymm11, ymm6, ymm11) + vaddps(ymm9, ymm7, ymm9) + + vmulps(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmovsldup(mem(rbx, 1*32), ymm2) + add(imm(8*1*4), rbx) // b += 8 (1 x nr) + vmulps(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm14, ymm6, ymm14) + vaddps(ymm12, ymm7, ymm12) + + vpermilps(imm(0x4e), ymm2, ymm3) + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vmovaps(ymm1, ymm0) + vaddps(ymm10, ymm6, ymm10) + vaddps(ymm8, ymm7, ymm8) + + + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + + + label(.SPOSTACCUM) + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab02 ( ab04 ( ab06 + // ab10 ab12 ab14 ab16 + // ab22 ab20 ab26 ab24 + // ab32 ab30 ab36 ab34 + // ab44 ab46 ab40 ab42 + // ab54 ab56 ab50 ab52 + // ab66 ab64 ab62 ab60 + // ab76 ) ab74 ) ab72 ) ab70 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab01 ( ab03 ( ab05 ( ab07 + // ab11 ab13 ab15 ab17 + // ab23 ab21 ab27 ab25 + // ab33 ab31 ab37 ab35 + // ab45 ab47 ab41 ab43 + // ab55 ab57 ab51 ab53 + // ab67 ab65 ab63 ab61 + // ab77 ) ab75 ) ab73 ) ab71 ) + + vmovaps(ymm15, ymm7) + vshufps(imm(0xe4), ymm13, ymm15, ymm15) + vshufps(imm(0xe4), ymm7, ymm13, ymm13) + + vmovaps(ymm11, ymm7) + vshufps(imm(0xe4), ymm9, ymm11, ymm11) + vshufps(imm(0xe4), ymm7, ymm9, ymm9) + + vmovaps(ymm14, ymm7) + vshufps(imm(0xe4), ymm12, ymm14, ymm14) + vshufps(imm(0xe4), ymm7, ymm12, ymm12) + + vmovaps(ymm10, ymm7) + vshufps(imm(0xe4), ymm8, ymm10, ymm10) + vshufps(imm(0xe4), ymm7, ymm8, ymm8) + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab02 ( ab04 ( ab06 + // ab10 ab12 ab14 ab16 + // ab20 ab22 ab24 ab26 + // ab30 ab32 ab34 ab36 + // ab44 ab46 ab40 ab42 + // ab54 ab56 ab50 ab52 + // ab64 ab66 ab60 ab62 + // ab74 ) ab76 ) ab70 ) ab72 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab01 ( ab03 ( ab05 ( ab07 + // ab11 ab13 ab15 ab17 + // ab21 ab23 ab25 ab27 + // ab31 ab33 ab35 ab37 + // ab45 ab47 ab41 ab43 + // ab55 ab57 ab51 ab53 + // ab65 ab67 ab61 ab63 + // ab75 ) ab77 ) ab71 ) ab73 ) + + vmovaps(ymm15, ymm7) + vperm2f128(imm(0x30), ymm11, ymm15, ymm15) + vperm2f128(imm(0x12), ymm11, ymm7, ymm11) + + vmovaps(ymm13, ymm7) + vperm2f128(imm(0x30), ymm9, ymm13, ymm13) + vperm2f128(imm(0x12), ymm9, ymm7, ymm9) + + vmovaps(ymm14, ymm7) + vperm2f128(imm(0x30), ymm10, ymm14, ymm14) + vperm2f128(imm(0x12), ymm10, ymm7, ymm10) + + vmovaps(ymm12, ymm7) + vperm2f128(imm(0x30), ymm8, ymm12, ymm12) + vperm2f128(imm(0x12), ymm8, ymm7, ymm8) + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab02 ( ab04 ( ab06 + // ab10 ab12 ab14 ab16 + // ab20 ab22 ab24 ab26 + // ab30 ab32 ab34 ab36 + // ab40 ab42 ab44 ab46 + // ab50 ab52 ab54 ab56 + // ab60 ab62 ab64 ab66 + // ab70 ) ab72 ) ab74 ) ab76 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab01 ( ab03 ( ab05 ( ab07 + // ab11 ab13 ab15 ab17 + // ab21 ab23 ab25 ab27 + // ab31 ab33 ab35 ab37 + // 
ab41    ab43    ab45    ab47
+	//       ab51    ab53    ab55    ab57
+	//       ab61    ab63    ab65    ab67
+	//       ab71 )  ab73 )  ab75 )  ab77 )
+
+
+
+	mov(%4, rax)                 // load address of alpha
+	mov(%5, rbx)                 // load address of beta
+	vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
+	vbroadcastss(mem(rbx), ymm4) // load beta and duplicate
+
+	vmulps(ymm0, ymm8, ymm8)     // scale by alpha
+	vmulps(ymm0, ymm9, ymm9)
+	vmulps(ymm0, ymm10, ymm10)
+	vmulps(ymm0, ymm11, ymm11)
+	vmulps(ymm0, ymm12, ymm12)
+	vmulps(ymm0, ymm13, ymm13)
+	vmulps(ymm0, ymm14, ymm14)
+	vmulps(ymm0, ymm15, ymm15)
+
+
+
+
+
+
+	mov(%7, rsi)                 // load rs_c
+	lea(mem(, rsi, 4), rsi)      // rsi = rs_c * sizeof(float)
+
+	lea(mem(rcx, rsi, 4), rdx)   // load address of c + 4*rs_c;
+
+	lea(mem(, rsi, 2), r12)      // r12 = 2*rs_c;
+	lea(mem(r12, rsi, 1), r13)   // r13 = 3*rs_c;
+
+
+	// now avoid loading C if beta == 0
+
+	vxorps(ymm0, ymm0, ymm0)     // set ymm0 to zero.
+	vucomiss(xmm0, xmm4)         // set ZF if beta == 0.
+	je(.SBETAZERO)               // if ZF == 1, jump to beta == 0 case
+
+
+	cmp(imm(4), rsi)             // set ZF if (4*rs_c) == 4.
+	jz(.SCOLSTORED)              // jump to column storage case
+
+
+
+	label(.SGENSTORED)
+
+	// update c00:c70
+	vmovlps(mem(rcx), xmm0, xmm0)
+	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
+	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
+	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
+	vshufps(imm(0x88), xmm1, xmm0, xmm0)
+	vmovlps(mem(rdx), xmm2, xmm2)
+	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
+	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
+	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
+	vshufps(imm(0x88), xmm3, xmm2, xmm2)
+	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
+
+	vmulps(ymm4, ymm0, ymm0)     // scale by beta,
+	vaddps(ymm15, ymm0, ymm0)    // add the gemm result,
+
+	vextractf128(imm(1), ymm0, xmm2)
+	vmovss(xmm0, mem(rcx))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, rsi, 1))
+	vpermilps(imm(0x39), xmm1, xmm0)
+	vmovss(xmm0, mem(rcx, r12, 1))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, r13, 1))
+	vmovss(xmm2, mem(rdx))
+	vpermilps(imm(0x39), xmm2, xmm3)
+	vmovss(xmm3, mem(rdx, rsi, 1))
+	vpermilps(imm(0x39), xmm3, xmm2)
+	vmovss(xmm2, mem(rdx, r12, 1))
+	vpermilps(imm(0x39), xmm2, xmm3)
+	vmovss(xmm3, mem(rdx, r13, 1))
+
+	add(rdi, rcx)                // c += cs_c;
+	add(rdi, rdx)                // c += cs_c;
+
+
+	// update c01:c71
+	vmovlps(mem(rcx), xmm0, xmm0)
+	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
+	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
+	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
+	vshufps(imm(0x88), xmm1, xmm0, xmm0)
+	vmovlps(mem(rdx), xmm2, xmm2)
+	vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
+	vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
+	vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
+	vshufps(imm(0x88), xmm3, xmm2, xmm2)
+	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)
+
+	vmulps(ymm4, ymm0, ymm0)     // scale by beta,
+	vaddps(ymm14, ymm0, ymm0)    // add the gemm result,
+
+	vextractf128(imm(1), ymm0, xmm2)
+	vmovss(xmm0, mem(rcx))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, rsi, 1))
+	vpermilps(imm(0x39), xmm1, xmm0)
+	vmovss(xmm0, mem(rcx, r12, 1))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, r13, 1))
+	vmovss(xmm2, mem(rdx))
+	vpermilps(imm(0x39), xmm2, xmm3)
+	vmovss(xmm3, mem(rdx, rsi, 1))
+	vpermilps(imm(0x39), xmm3, xmm2)
+	vmovss(xmm2, mem(rdx, r12, 1))
+	vpermilps(imm(0x39), xmm2, xmm3)
+	vmovss(xmm3, mem(rdx, r13, 1))
+
+	add(rdi, rcx)                // c += cs_c;
+	add(rdi, rdx)                // c += cs_c;
+
+
+	// update c02:c72
+	vmovlps(mem(rcx), xmm0, xmm0)
+	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
+	vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
+	vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
+	vshufps(imm(0x88), xmm1, xmm0, xmm0)
+
vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + + vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm13, ymm0, ymm0) // add the gemm result, + + vextractf128(imm(1), ymm0, xmm2) + vmovss(xmm0, mem(rcx)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm1, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, r13, 1)) + vmovss(xmm2, mem(rdx)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, rsi, 1)) + vpermilps(imm(0x39), xmm3, xmm2) + vmovss(xmm2, mem(rdx, r12, 1)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, r13, 1)) + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c03:c73 + vmovlps(mem(rcx), xmm0, xmm0) + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + + vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm12, ymm0, ymm0) // add the gemm result, + + vextractf128(imm(1), ymm0, xmm2) + vmovss(xmm0, mem(rcx)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm1, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, r13, 1)) + vmovss(xmm2, mem(rdx)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, rsi, 1)) + vpermilps(imm(0x39), xmm3, xmm2) + vmovss(xmm2, mem(rdx, r12, 1)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, r13, 1)) + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c04:c74 + vmovlps(mem(rcx), xmm0, xmm0) + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + + vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm11, ymm0, ymm0) // add the gemm result, + + vextractf128(imm(1), ymm0, xmm2) + vmovss(xmm0, mem(rcx)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm1, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, r13, 1)) + vmovss(xmm2, mem(rdx)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, rsi, 1)) + vpermilps(imm(0x39), xmm3, xmm2) + vmovss(xmm2, mem(rdx, r12, 1)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, r13, 1)) + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c05:c75 + vmovlps(mem(rcx), xmm0, xmm0) + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + + 
vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm10, ymm0, ymm0) // add the gemm result, + + vextractf128(imm(1), ymm0, xmm2) + vmovss(xmm0, mem(rcx)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm1, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, r13, 1)) + vmovss(xmm2, mem(rdx)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, rsi, 1)) + vpermilps(imm(0x39), xmm3, xmm2) + vmovss(xmm2, mem(rdx, r12, 1)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, r13, 1)) + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c06:c76 + vmovlps(mem(rcx), xmm0, xmm0) + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + + vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm9, ymm0, ymm0) // add the gemm result, + + vextractf128(imm(1), ymm0, xmm2) + vmovss(xmm0, mem(rcx)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm1, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, r13, 1)) + vmovss(xmm2, mem(rdx)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, rsi, 1)) + vpermilps(imm(0x39), xmm3, xmm2) + vmovss(xmm2, mem(rdx, r12, 1)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, r13, 1)) + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c07:c77 + vmovlps(mem(rcx), xmm0, xmm0) + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) + vmovlps(mem(rcx, r12, 1), xmm1, xmm1) + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) + vshufps(imm(0x88), xmm1, xmm0, xmm0) + vmovlps(mem(rdx), xmm2, xmm2) + vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) + vmovlps(mem(rdx, r12, 1), xmm3, xmm3) + vmovhps(mem(rdx, r13, 1), xmm3, xmm3) + vshufps(imm(0x88), xmm3, xmm2, xmm2) + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) + + vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm8, ymm0, ymm0) // add the gemm result, + + vextractf128(imm(1), ymm0, xmm2) + vmovss(xmm0, mem(rcx)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm1, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, r13, 1)) + vmovss(xmm2, mem(rdx)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, rsi, 1)) + vpermilps(imm(0x39), xmm3, xmm2) + vmovss(xmm2, mem(rdx, r12, 1)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, r13, 1)) + + + + jmp(.SDONE) // jump to end. + + + + label(.SCOLSTORED) + + + vmovups(mem(rcx), ymm0) // load c00:c70, + vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm15, ymm0, ymm0) // add the gemm result, + vmovups(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(mem(rcx), ymm1) // load c01:c71, + vmulps(ymm4, ymm1, ymm1) // scale by beta, + vaddps(ymm14, ymm1, ymm1) // add the gemm result, + vmovups(ymm1, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(mem(rcx), ymm0) // load c02:c72, + vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm13, ymm0, ymm0) // add the gemm result, + vmovups(ymm0, mem(rcx)) // and store back to memory. 
+	add(rdi, rcx)                // c += cs_c;
+
+	vmovups(mem(rcx), ymm1)      // load c03:c73,
+	vmulps(ymm4, ymm1, ymm1)     // scale by beta,
+	vaddps(ymm12, ymm1, ymm1)    // add the gemm result,
+	vmovups(ymm1, mem(rcx))      // and store back to memory.
+	add(rdi, rcx)                // c += cs_c;
+
+	vmovups(mem(rcx), ymm0)      // load c04:c74,
+	vmulps(ymm4, ymm0, ymm0)     // scale by beta,
+	vaddps(ymm11, ymm0, ymm0)    // add the gemm result,
+	vmovups(ymm0, mem(rcx))      // and store back to memory.
+	add(rdi, rcx)                // c += cs_c;
+
+	vmovups(mem(rcx), ymm1)      // load c05:c75,
+	vmulps(ymm4, ymm1, ymm1)     // scale by beta,
+	vaddps(ymm10, ymm1, ymm1)    // add the gemm result,
+	vmovups(ymm1, mem(rcx))      // and store back to memory.
+	add(rdi, rcx)                // c += cs_c;
+
+	vmovups(mem(rcx), ymm0)      // load c06:c76,
+	vmulps(ymm4, ymm0, ymm0)     // scale by beta,
+	vaddps(ymm9, ymm0, ymm0)     // add the gemm result,
+	vmovups(ymm0, mem(rcx))      // and store back to memory.
+	add(rdi, rcx)                // c += cs_c;
+
+	vmovups(mem(rcx), ymm1)      // load c07:c77,
+	vmulps(ymm4, ymm1, ymm1)     // scale by beta,
+	vaddps(ymm8, ymm1, ymm1)     // add the gemm result,
+	vmovups(ymm1, mem(rcx))      // and store back to memory.
+
+
+	jmp(.SDONE)                  // jump to end.
+
+
+
+
+	label(.SBETAZERO)
+
+	cmp(imm(4), rsi)             // set ZF if (4*rs_c) == 4.
+	jz(.SCOLSTORBZ)              // jump to column storage case
+
+
+
+	label(.SGENSTORBZ)
+
+	// update c00:c70
+	vmovups(ymm15, ymm0)
+	vextractf128(imm(1), ymm0, xmm2)
+	vmovss(xmm0, mem(rcx))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, rsi, 1))
+	vpermilps(imm(0x39), xmm1, xmm0)
+	vmovss(xmm0, mem(rcx, r12, 1))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, r13, 1))
+	vmovss(xmm2, mem(rdx))
+	vpermilps(imm(0x39), xmm2, xmm3)
+	vmovss(xmm3, mem(rdx, rsi, 1))
+	vpermilps(imm(0x39), xmm3, xmm2)
+	vmovss(xmm2, mem(rdx, r12, 1))
+	vpermilps(imm(0x39), xmm2, xmm3)
+	vmovss(xmm3, mem(rdx, r13, 1))
+
+	add(rdi, rcx)                // c += cs_c;
+	add(rdi, rdx)                // c += cs_c;
+
+
+	// update c01:c71
+	vmovups(ymm14, ymm0)
+	vextractf128(imm(1), ymm0, xmm2)
+	vmovss(xmm0, mem(rcx))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, rsi, 1))
+	vpermilps(imm(0x39), xmm1, xmm0)
+	vmovss(xmm0, mem(rcx, r12, 1))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, r13, 1))
+	vmovss(xmm2, mem(rdx))
+	vpermilps(imm(0x39), xmm2, xmm3)
+	vmovss(xmm3, mem(rdx, rsi, 1))
+	vpermilps(imm(0x39), xmm3, xmm2)
+	vmovss(xmm2, mem(rdx, r12, 1))
+	vpermilps(imm(0x39), xmm2, xmm3)
+	vmovss(xmm3, mem(rdx, r13, 1))
+
+	add(rdi, rcx)                // c += cs_c;
+	add(rdi, rdx)                // c += cs_c;
+
+
+	// update c02:c72
+	vmovups(ymm13, ymm0)
+	vextractf128(imm(1), ymm0, xmm2)
+	vmovss(xmm0, mem(rcx))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, rsi, 1))
+	vpermilps(imm(0x39), xmm1, xmm0)
+	vmovss(xmm0, mem(rcx, r12, 1))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, r13, 1))
+	vmovss(xmm2, mem(rdx))
+	vpermilps(imm(0x39), xmm2, xmm3)
+	vmovss(xmm3, mem(rdx, rsi, 1))
+	vpermilps(imm(0x39), xmm3, xmm2)
+	vmovss(xmm2, mem(rdx, r12, 1))
+	vpermilps(imm(0x39), xmm2, xmm3)
+	vmovss(xmm3, mem(rdx, r13, 1))
+
+	add(rdi, rcx)                // c += cs_c;
+	add(rdi, rdx)                // c += cs_c;
+
+
+	// update c03:c73
+	vmovups(ymm12, ymm0)
+	vextractf128(imm(1), ymm0, xmm2)
+	vmovss(xmm0, mem(rcx))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, rsi, 1))
+	vpermilps(imm(0x39), xmm1, xmm0)
+	vmovss(xmm0, mem(rcx, r12, 1))
+	vpermilps(imm(0x39), xmm0, xmm1)
+	vmovss(xmm1, mem(rcx, r13, 1))
+	vmovss(xmm2, mem(rdx))
+	vpermilps(imm(0x39), xmm2, xmm3)
+	vmovss(xmm3, mem(rdx, rsi, 1))
+	vpermilps(imm(0x39),
xmm3, xmm2) + vmovss(xmm2, mem(rdx, r12, 1)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, r13, 1)) + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c04:c74 + vmovups(ymm11, ymm0) + vextractf128(imm(1), ymm0, xmm2) + vmovss(xmm0, mem(rcx)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm1, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, r13, 1)) + vmovss(xmm2, mem(rdx)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, rsi, 1)) + vpermilps(imm(0x39), xmm3, xmm2) + vmovss(xmm2, mem(rdx, r12, 1)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, r13, 1)) + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c05:c75 + vmovups(ymm10, ymm0) + vextractf128(imm(1), ymm0, xmm2) + vmovss(xmm0, mem(rcx)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm1, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, r13, 1)) + vmovss(xmm2, mem(rdx)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, rsi, 1)) + vpermilps(imm(0x39), xmm3, xmm2) + vmovss(xmm2, mem(rdx, r12, 1)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, r13, 1)) + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c06:c76 + vmovups(ymm9, ymm0) + vextractf128(imm(1), ymm0, xmm2) + vmovss(xmm0, mem(rcx)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm1, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, r13, 1)) + vmovss(xmm2, mem(rdx)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, rsi, 1)) + vpermilps(imm(0x39), xmm3, xmm2) + vmovss(xmm2, mem(rdx, r12, 1)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, r13, 1)) + + add(rdi, rcx) // c += cs_c; + add(rdi, rdx) // c += cs_c; + + + // update c07:c77 + vmovups(ymm8, ymm0) + vextractf128(imm(1), ymm0, xmm2) + vmovss(xmm0, mem(rcx)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, rsi, 1)) + vpermilps(imm(0x39), xmm1, xmm0) + vmovss(xmm0, mem(rcx, r12, 1)) + vpermilps(imm(0x39), xmm0, xmm1) + vmovss(xmm1, mem(rcx, r13, 1)) + vmovss(xmm2, mem(rdx)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, rsi, 1)) + vpermilps(imm(0x39), xmm3, xmm2) + vmovss(xmm2, mem(rdx, r12, 1)) + vpermilps(imm(0x39), xmm2, xmm3) + vmovss(xmm3, mem(rdx, r13, 1)) + + + jmp(.SDONE) // jump to end. + + + + label(.SCOLSTORBZ) + + + vmovups(ymm15, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm14, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm13, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm12, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm11, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm10, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm9, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm8, mem(rcx)) // and store back to memory. + + + + + + label(.SDONE) + + vzeroupper() + : // output operands (none) : // input operands @@ -1047,632 +1050,632 @@ void bli_dgemm_sandybridge_asm_8x4 __asm__ volatile ( - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. 
- "movq %9, %%r15 \n\t" // load address of b_next. - //"movq %10, %%r14 \n\t" // load address of a_next. - "addq $-4 * 64, %%r15 \n\t" - " \n\t" - "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading - "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b. - "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double) - "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c; - " \n\t" - "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c - "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c - " \n\t" - "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - "addq $4 * 4 * 8, %%r15 \n\t" // b_next += 4*4 (unroll x nr) - " \n\t" - " \n\t" // iteration 0 - "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" - "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" - "prefetcht0 0 * 32(%%r15) \n\t" // prefetch b_next[0*4] - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 1 - "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "prefetcht0 18 * 32(%%rax) \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovapd 2 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 4 * 32(%%rax), %%ymm0 \n\t" - "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" 
// iteration 2 - "vmovapd 5 * 32(%%rax), %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "prefetcht0 20 * 32(%%rax) \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t" - "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" - "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" - "prefetcht0 2 * 32(%%r15) \n\t" // prefetch b_next[2*4] - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 3 - "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" - "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr) - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - //"prefetcht0 22 * 32(%%rax) \n\t" - "prefetcht0 14 * 32(%%rax) \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" - "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" - //"addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr) - //"addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. 
- " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" - "addq $8 * 1 * 8, %%rax \n\t" // a += 8 (1 x mr) - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "prefetcht0 14 * 32(%%rax) \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" - "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" - "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab11 ab10 ab13 ab12 - " \n\t" // ab22 ab23 ab20 ab21 - " \n\t" // ab33 ) ab32 ) ab31 ) ab30 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43 - " \n\t" // ab51 ab50 ab53 ab52 - " \n\t" // ab62 ab63 ab60 ab61 - " \n\t" // ab73 ) ab72 ) ab71 ) ab70 ) - " \n\t" - "vmovapd %%ymm15, %%ymm7 \n\t" - "vshufpd $0xa, %%ymm15, %%ymm13, %%ymm15 \n\t" - "vshufpd $0xa, %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm7 \n\t" - "vshufpd $0xa, %%ymm11, %%ymm9, %%ymm11 \n\t" - "vshufpd $0xa, %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm7 \n\t" - "vshufpd $0xa, %%ymm14, %%ymm12, %%ymm14 \n\t" - "vshufpd $0xa, %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm7 \n\t" - "vshufpd $0xa, %%ymm10, %%ymm8, %%ymm10 \n\t" - "vshufpd $0xa, %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab01 ( ab00 ( ab03 ( ab02 - " \n\t" // ab11 ab10 ab13 ab12 - " \n\t" // ab23 ab22 ab21 ab20 - " \n\t" // ab33 ) ab32 ) ab31 ) ab30 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab41 ( ab40 ( ab43 ( ab42 - " \n\t" // ab51 ab50 ab53 ab52 - " \n\t" // ab63 ab62 ab61 ab60 - " \n\t" // ab73 ) ab72 ) ab71 ) ab70 ) - " \n\t" - "vmovapd %%ymm15, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm15, %%ymm11, %%ymm15 \n\t" - "vperm2f128 $0x12, %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "vmovapd %%ymm13, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm13, %%ymm9, %%ymm13 \n\t" - "vperm2f128 $0x12, %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm14, %%ymm10, %%ymm14 \n\t" - "vperm2f128 $0x12, %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm12, %%ymm8, %%ymm12 \n\t" - "vperm2f128 $0x12, %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" // ymm9: ymm11: ymm13: ymm15: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab20 ab21 ab22 ab23 - " \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) - " \n\t" - " \n\t" // ymm8: ymm10: ymm12: ymm14: - " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43 - " \n\t" // ab50 ab51 ab52 ab53 - " \n\t" 
// ab60 ab61 ab62 ab63 - " \n\t" // ab70 ) ab71 ) ab72 ) ab73 ) - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate - "vbroadcastsd (%%rbx), %%ymm2 \n\t" // load beta and duplicate - " \n\t" - "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" // scale by alpha - "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" - "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" - "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" - "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; - " \n\t" - "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c; - "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomisd %%xmm0, %%xmm2 \n\t" // set ZF if beta == 0. - "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "jz .DCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORED: \n\t" - " \n\t" // update c00:c33 - " \n\t" - "vextractf128 $1, %%ymm9, %%xmm1 \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load c00 and c10, - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm9, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rcx) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" - "vmovlpd (%%rcx,%%r12), %%xmm0, %%xmm0 \n\t" // load c20 and c30, - "vmovhpd (%%rcx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rcx,%%r12) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm11, %%xmm1 \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load c01 and c11, - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm11, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rcx) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" - "vmovlpd (%%rcx,%%r12), %%xmm0, %%xmm0 \n\t" // load c21 and c31, - "vmovhpd (%%rcx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rcx,%%r12) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm13, %%xmm1 \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load c02 and c12, - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm13, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rcx) \n\t" // and store back to memory. 
- "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" - "vmovlpd (%%rcx,%%r12), %%xmm0, %%xmm0 \n\t" // load c22 and c32, - "vmovhpd (%%rcx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rcx,%%r12) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm15, %%xmm1 \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load c03 and c13, - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm15, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rcx) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" - "vmovlpd (%%rcx,%%r12), %%xmm0, %%xmm0 \n\t" // load c23 and c33, - "vmovhpd (%%rcx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rcx,%%r12) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rcx,%%r13) \n\t" - " \n\t" - " \n\t" // update c40:c73 - " \n\t" - "vextractf128 $1, %%ymm8, %%xmm1 \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load c40 and c50, - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm8, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rdx) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" - "vmovlpd (%%rdx,%%r12), %%xmm0, %%xmm0 \n\t" // load c60 and c70, - "vmovhpd (%%rdx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rdx,%%r12) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm10, %%xmm1 \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load c41 and c51, - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm10, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rdx) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" - "vmovlpd (%%rdx,%%r12), %%xmm0, %%xmm0 \n\t" // load c61 and c71, - "vmovhpd (%%rdx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rdx,%%r12) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm12, %%xmm1 \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load c42 and c52, - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm12, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rdx) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" - "vmovlpd (%%rdx,%%r12), %%xmm0, %%xmm0 \n\t" // load c62 and c72, - "vmovhpd (%%rdx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rdx,%%r12) \n\t" // and store back to memory. 
- "vmovhpd %%xmm0, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm14, %%xmm1 \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load c43 and c53, - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm14, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rdx) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" - "vmovlpd (%%rdx,%%r12), %%xmm0, %%xmm0 \n\t" // load c63 and c73, - "vmovhpd (%%rdx,%%r13), %%xmm0, %%xmm0 \n\t" - "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta, - "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result, - "vmovlpd %%xmm0, (%%rdx,%%r12) \n\t" // and store back to memory. - "vmovhpd %%xmm0, (%%rdx,%%r13) \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORED: \n\t" - " \n\t" // update c00:c33 - " \n\t" - "vmovupd (%%rcx), %%ymm0 \n\t" // load c00:c30, - "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovupd (%%rcx), %%ymm0 \n\t" // load c01:c31, - "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovupd (%%rcx), %%ymm0 \n\t" // load c02:c32, - "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovupd (%%rcx), %%ymm0 \n\t" // load c03:c33, - "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. - " \n\t" - " \n\t" // update c40:c73 - " \n\t" - "vmovupd (%%rdx), %%ymm0 \n\t" // load c40:c70, - "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovupd (%%rdx), %%ymm0 \n\t" // load c41:c71, - "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovupd (%%rdx), %%ymm0 \n\t" // load c42:c72, - "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovupd (%%rdx), %%ymm0 \n\t" // load c43:c73, - "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, - "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DBETAZERO: \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. 
- "jz .DCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORBZ: \n\t" - " \n\t" // update c00:c33 - " \n\t" - "vextractf128 $1, %%ymm9, %%xmm1 \n\t" - "vmovlpd %%xmm9, (%%rcx) \n\t" // store to c00:c30 - "vmovhpd %%xmm9, (%%rcx,%%rsi) \n\t" - "vmovlpd %%xmm1, (%%rcx,%%r12) \n\t" - "vmovhpd %%xmm1, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm11, %%xmm1 \n\t" - "vmovlpd %%xmm11, (%%rcx) \n\t" // store to c01:c31 - "vmovhpd %%xmm11, (%%rcx,%%rsi) \n\t" - "vmovlpd %%xmm1, (%%rcx,%%r12) \n\t" - "vmovhpd %%xmm1, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm13, %%xmm1 \n\t" - "vmovlpd %%xmm13, (%%rcx) \n\t" // store to c02:c32 - "vmovhpd %%xmm13, (%%rcx,%%rsi) \n\t" - "vmovlpd %%xmm1, (%%rcx,%%r12) \n\t" - "vmovhpd %%xmm1, (%%rcx,%%r13) \n\t" - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm15, %%xmm1 \n\t" - "vmovlpd %%xmm15, (%%rcx) \n\t" // store to c03:c33 - "vmovhpd %%xmm15, (%%rcx,%%rsi) \n\t" - "vmovlpd %%xmm1, (%%rcx,%%r12) \n\t" - "vmovhpd %%xmm1, (%%rcx,%%r13) \n\t" - " \n\t" - " \n\t" // update c40:c73 - " \n\t" - "vextractf128 $1, %%ymm8, %%xmm1 \n\t" - "vmovlpd %%xmm8, (%%rdx) \n\t" // store to c40:c70 - "vmovhpd %%xmm8, (%%rdx,%%rsi) \n\t" - "vmovlpd %%xmm1, (%%rdx,%%r12) \n\t" - "vmovhpd %%xmm1, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm10, %%xmm1 \n\t" - "vmovlpd %%xmm10, (%%rdx) \n\t" // store to c41:c71 - "vmovhpd %%xmm10, (%%rdx,%%rsi) \n\t" - "vmovlpd %%xmm1, (%%rdx,%%r12) \n\t" - "vmovhpd %%xmm1, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm12, %%xmm1 \n\t" - "vmovlpd %%xmm12, (%%rdx) \n\t" // store to c42:c72 - "vmovhpd %%xmm12, (%%rdx,%%rsi) \n\t" - "vmovlpd %%xmm1, (%%rdx,%%r12) \n\t" - "vmovhpd %%xmm1, (%%rdx,%%r13) \n\t" - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vextractf128 $1, %%ymm14, %%xmm1 \n\t" - "vmovlpd %%xmm14, (%%rdx) \n\t" // store to c43:c73 - "vmovhpd %%xmm14, (%%rdx,%%rsi) \n\t" - "vmovlpd %%xmm1, (%%rdx,%%r12) \n\t" - "vmovhpd %%xmm1, (%%rdx,%%r13) \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORBZ: \n\t" - " \n\t" // update c00:c33 - " \n\t" - "vmovupd %%ymm9, (%%rcx) \n\t" // store c00:c30 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm11, (%%rcx) \n\t" // store c01:c31 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm13, (%%rcx) \n\t" // store c02:c32 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm15, (%%rcx) \n\t" // store c03:c33 - " \n\t" - " \n\t" // update c40:c73 - " \n\t" - "vmovupd %%ymm8, (%%rdx) \n\t" // store c40:c70 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm10, (%%rdx) \n\t" // store c41:c71 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm12, (%%rdx) \n\t" // store c42:c72 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm14, (%%rdx) \n\t" // store c43:c73 - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DDONE: \n\t" - " \n\t" - "vzeroupper \n\t" - " \n\t" - "vzeroupper \n\t" - " \n\t" + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + mov(%9, r15) // load address of b_next. + //mov(%10, r14) // load address of a_next. 
+ sub(imm(4*64), r15) + + vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading + vmovapd(mem(rbx, 0*32), ymm2) // elements of a and b. + vpermilpd(imm(0x5), ymm2, ymm3) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; + + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c + prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c + + vxorpd(ymm8, ymm8, ymm8) + vxorpd(ymm9, ymm9, ymm9) + vxorpd(ymm10, ymm10, ymm10) + vxorpd(ymm11, ymm11, ymm11) + vxorpd(ymm12, ymm12, ymm12) + vxorpd(ymm13, ymm13, ymm13) + vxorpd(ymm14, ymm14, ymm14) + vxorpd(ymm15, ymm15, ymm15) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.DLOOPKITER) // MAIN LOOP + + add(imm(4*4*8), r15) // b_next += 4*4 (unroll x nr) + + // iteration 0 + vmovapd(mem(rax, 1*32), ymm1) + vmulpd(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm15, ymm6, ymm15) + vaddpd(ymm13, ymm7, ymm13) + + prefetch(0, mem(rax, 16*32)) + vmulpd(ymm1, ymm2, ymm6) + vmovapd(mem(rbx, 1*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vpermilpd(imm(0x5), ymm2, ymm3) + vaddpd(ymm14, ymm6, ymm14) + vaddpd(ymm12, ymm7, ymm12) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 2*32), ymm0) + vaddpd(ymm11, ymm6, ymm11) + vaddpd(ymm9, ymm7, ymm9) + prefetch(0, mem(r15, 0*32)) // prefetch b_next[0*4] + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddpd(ymm10, ymm6, ymm10) + vaddpd(ymm8, ymm7, ymm8) + + + // iteration 1 + vmovapd(mem(rax, 3*32), ymm1) + vmulpd(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm15, ymm6, ymm15) + vaddpd(ymm13, ymm7, ymm13) + + prefetch(0, mem(rax, 18*32)) + vmulpd(ymm1, ymm2, ymm6) + vmovapd(mem(rbx, 2*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vpermilpd(imm(0x5), ymm2, ymm3) + vaddpd(ymm14, ymm6, ymm14) + vaddpd(ymm12, ymm7, ymm12) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 4*32), ymm0) + vaddpd(ymm11, ymm6, ymm11) + vaddpd(ymm9, ymm7, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddpd(ymm10, ymm6, ymm10) + vaddpd(ymm8, ymm7, ymm8) + + + // iteration 2 + vmovapd(mem(rax, 5*32), ymm1) + vmulpd(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm15, ymm6, ymm15) + vaddpd(ymm13, ymm7, ymm13) + + prefetch(0, mem(rax, 20*32)) + vmulpd(ymm1, ymm2, ymm6) + vmovapd(mem(rbx, 3*32), ymm2) + add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) + vmulpd(ymm1, ymm3, ymm7) + vpermilpd(imm(0x5), ymm2, ymm3) + vaddpd(ymm14, ymm6, ymm14) + vaddpd(ymm12, ymm7, ymm12) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 6*32), ymm0) + vaddpd(ymm11, ymm6, ymm11) + vaddpd(ymm9, ymm7, ymm9) + prefetch(0, mem(r15, 2*32)) // prefetch b_next[2*4] + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddpd(ymm10, ymm6, ymm10) + vaddpd(ymm8, ymm7, ymm8) + + + // iteration 3 + vmovapd(mem(rax, 7*32), ymm1) + add(imm(4*8*8), rax) // a += 4*8 (unroll x mr) + vmulpd(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + 
vmulpd(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm15, ymm6, ymm15) + vaddpd(ymm13, ymm7, ymm13) + + //prefetch(0, mem(rax, 22*32)) + prefetch(0, mem(rax, 14*32)) + vmulpd(ymm1, ymm2, ymm6) + vmovapd(mem(rbx, 0*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vpermilpd(imm(0x5), ymm2, ymm3) + vaddpd(ymm14, ymm6, ymm14) + vaddpd(ymm12, ymm7, ymm12) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 0*32), ymm0) + vaddpd(ymm11, ymm6, ymm11) + vaddpd(ymm9, ymm7, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddpd(ymm10, ymm6, ymm10) + vaddpd(ymm8, ymm7, ymm8) + + + + //add(imm(4*8*8), rax) // a += 4*8 (unroll x mr) + //add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.DLOOPKLEFT) // EDGE LOOP + + vmovapd(mem(rax, 1*32), ymm1) + add(imm(8*1*8), rax) // a += 8 (1 x mr) + vmulpd(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm15, ymm6, ymm15) + vaddpd(ymm13, ymm7, ymm13) + + prefetch(0, mem(rax, 14*32)) + vmulpd(ymm1, ymm2, ymm6) + vmovapd(mem(rbx, 1*32), ymm2) + add(imm(4*1*8), rbx) // b += 4 (1 x nr) + vmulpd(ymm1, ymm3, ymm7) + vpermilpd(imm(0x5), ymm2, ymm3) + vaddpd(ymm14, ymm6, ymm14) + vaddpd(ymm12, ymm7, ymm12) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 0*32), ymm0) + vaddpd(ymm11, ymm6, ymm11) + vaddpd(ymm9, ymm7, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddpd(ymm10, ymm6, ymm10) + vaddpd(ymm8, ymm7, ymm8) + + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. 
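+ // End of the k loops. The vshufpd/vperm2f128 sequence under .DPOSTACCUM
+ // rearranges the accumulators from the rotated layout produced by the
+ // rank-1 updates into column order, so that afterwards each ymm register
+ // holds four consecutive rows of one column of the 8x4 microtile (see the
+ // register diagrams below).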
+ + + + label(.DPOSTACCUM) + + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab11 ab10 ab13 ab12 + // ab22 ab23 ab20 ab21 + // ab33 ) ab32 ) ab31 ) ab30 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab40 ( ab41 ( ab42 ( ab43 + // ab51 ab50 ab53 ab52 + // ab62 ab63 ab60 ab61 + // ab73 ) ab72 ) ab71 ) ab70 ) + + vmovapd(ymm15, ymm7) + vshufpd(imm(0xa), ymm15, ymm13, ymm15) + vshufpd(imm(0xa), ymm13, ymm7, ymm13) + + vmovapd(ymm11, ymm7) + vshufpd(imm(0xa), ymm11, ymm9, ymm11) + vshufpd(imm(0xa), ymm9, ymm7, ymm9) + + vmovapd(ymm14, ymm7) + vshufpd(imm(0xa), ymm14, ymm12, ymm14) + vshufpd(imm(0xa), ymm12, ymm7, ymm12) + + vmovapd(ymm10, ymm7) + vshufpd(imm(0xa), ymm10, ymm8, ymm10) + vshufpd(imm(0xa), ymm8, ymm7, ymm8) + + // ymm15: ymm13: ymm11: ymm9: + // ( ab01 ( ab00 ( ab03 ( ab02 + // ab11 ab10 ab13 ab12 + // ab23 ab22 ab21 ab20 + // ab33 ) ab32 ) ab31 ) ab30 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab41 ( ab40 ( ab43 ( ab42 + // ab51 ab50 ab53 ab52 + // ab63 ab62 ab61 ab60 + // ab73 ) ab72 ) ab71 ) ab70 ) + + vmovapd(ymm15, ymm7) + vperm2f128(imm(0x30), ymm15, ymm11, ymm15) + vperm2f128(imm(0x12), ymm7, ymm11, ymm11) + + vmovapd(ymm13, ymm7) + vperm2f128(imm(0x30), ymm13, ymm9, ymm13) + vperm2f128(imm(0x12), ymm7, ymm9, ymm9) + + vmovapd(ymm14, ymm7) + vperm2f128(imm(0x30), ymm14, ymm10, ymm14) + vperm2f128(imm(0x12), ymm7, ymm10, ymm10) + + vmovapd(ymm12, ymm7) + vperm2f128(imm(0x30), ymm12, ymm8, ymm12) + vperm2f128(imm(0x12), ymm7, ymm8, ymm8) + + // ymm9: ymm11: ymm13: ymm15: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ) ab31 ) ab32 ) ab33 ) + + // ymm8: ymm10: ymm12: ymm14: + // ( ab40 ( ab41 ( ab42 ( ab43 + // ab50 ab51 ab52 ab53 + // ab60 ab61 ab62 ab63 + // ab70 ) ab71 ) ab72 ) ab73 ) + + + mov(%4, rax) // load address of alpha + mov(%5, rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm2) // load beta and duplicate + + vmulpd(ymm0, ymm8, ymm8) // scale by alpha + vmulpd(ymm0, ymm9, ymm9) + vmulpd(ymm0, ymm10, ymm10) + vmulpd(ymm0, ymm11, ymm11) + vmulpd(ymm0, ymm12, ymm12) + vmulpd(ymm0, ymm13, ymm13) + vmulpd(ymm0, ymm14, ymm14) + vmulpd(ymm0, ymm15, ymm15) + + + + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; + lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c; + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm2) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + label(.DGENSTORED) + // update c00:c33 + + vextractf128(imm(1), ymm9, xmm1) + vmovlpd(mem(rcx), xmm0, xmm0) // load c00 and c10, + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm9, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rcx)) // and store back to memory. + vmovhpd(xmm0, mem(rcx, rsi, 1)) + vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c20 and c30, + vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm1, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory. 
+ vmovhpd(xmm0, mem(rcx, r13, 1)) + add(rdi, rcx) // c += cs_c; + + vextractf128(imm(1), ymm11, xmm1) + vmovlpd(mem(rcx), xmm0, xmm0) // load c01 and c11, + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm11, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rcx)) // and store back to memory. + vmovhpd(xmm0, mem(rcx, rsi, 1)) + vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c21 and c31, + vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm1, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory. + vmovhpd(xmm0, mem(rcx, r13, 1)) + add(rdi, rcx) // c += cs_c; + + vextractf128(imm(1), ymm13, xmm1) + vmovlpd(mem(rcx), xmm0, xmm0) // load c02 and c12, + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm13, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rcx)) // and store back to memory. + vmovhpd(xmm0, mem(rcx, rsi, 1)) + vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c22 and c32, + vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm1, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory. + vmovhpd(xmm0, mem(rcx, r13, 1)) + add(rdi, rcx) // c += cs_c; + + vextractf128(imm(1), ymm15, xmm1) + vmovlpd(mem(rcx), xmm0, xmm0) // load c03 and c13, + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm15, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rcx)) // and store back to memory. + vmovhpd(xmm0, mem(rcx, rsi, 1)) + vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c23 and c33, + vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm1, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory. + vmovhpd(xmm0, mem(rcx, r13, 1)) + + // update c40:c73 + + vextractf128(imm(1), ymm8, xmm1) + vmovlpd(mem(rdx), xmm0, xmm0) // load c40 and c50, + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm8, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rdx)) // and store back to memory. + vmovhpd(xmm0, mem(rdx, rsi, 1)) + vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c60 and c70, + vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm1, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory. + vmovhpd(xmm0, mem(rdx, r13, 1)) + add(rdi, rdx) // c += cs_c; + + vextractf128(imm(1), ymm10, xmm1) + vmovlpd(mem(rdx), xmm0, xmm0) // load c41 and c51, + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm10, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rdx)) // and store back to memory. + vmovhpd(xmm0, mem(rdx, rsi, 1)) + vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c61 and c71, + vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm1, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory. + vmovhpd(xmm0, mem(rdx, r13, 1)) + add(rdi, rdx) // c += cs_c; + + vextractf128(imm(1), ymm12, xmm1) + vmovlpd(mem(rdx), xmm0, xmm0) // load c42 and c52, + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm12, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rdx)) // and store back to memory. 
+ vmovhpd(xmm0, mem(rdx, rsi, 1)) + vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c62 and c72, + vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm1, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory. + vmovhpd(xmm0, mem(rdx, r13, 1)) + add(rdi, rdx) // c += cs_c; + + vextractf128(imm(1), ymm14, xmm1) + vmovlpd(mem(rdx), xmm0, xmm0) // load c43 and c53, + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm14, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rdx)) // and store back to memory. + vmovhpd(xmm0, mem(rdx, rsi, 1)) + vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c63 and c73, + vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) + vmulpd(xmm2, xmm0, xmm0) // scale by beta, + vaddpd(xmm1, xmm0, xmm0) // add the gemm result, + vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory. + vmovhpd(xmm0, mem(rdx, r13, 1)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + // update c00:c33 + + vmovupd(mem(rcx), ymm0) // load c00:c30, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm9, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovupd(mem(rcx), ymm0) // load c01:c31, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm11, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovupd(mem(rcx), ymm0) // load c02:c32, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm13, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovupd(mem(rcx), ymm0) // load c03:c33, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm15, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rcx)) // and store back to memory. + + // update c40:c73 + + vmovupd(mem(rdx), ymm0) // load c40:c70, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm8, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rdx)) // and store back to memory. + add(rdi, rdx) // c += cs_c; + + vmovupd(mem(rdx), ymm0) // load c41:c71, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm10, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rdx)) // and store back to memory. + add(rdi, rdx) // c += cs_c; + + vmovupd(mem(rdx), ymm0) // load c42:c72, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm12, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rdx)) // and store back to memory. + add(rdi, rdx) // c += cs_c; + + vmovupd(mem(rdx), ymm0) // load c43:c73, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm14, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rdx)) // and store back to memory. + + + jmp(.DDONE) // jump to end. + + + + + label(.DBETAZERO) + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DGENSTORBZ) + // update c00:c33 + + vextractf128(imm(1), ymm9, xmm1) + vmovlpd(xmm9, mem(rcx)) // store to c00:c30 + vmovhpd(xmm9, mem(rcx, rsi, 1)) + vmovlpd(xmm1, mem(rcx, r12, 1)) + vmovhpd(xmm1, mem(rcx, r13, 1)) + add(rdi, rcx) // c += cs_c; + + vextractf128(imm(1), ymm11, xmm1) + vmovlpd(xmm11, mem(rcx)) // store to c01:c31 + vmovhpd(xmm11, mem(rcx, rsi, 1)) + vmovlpd(xmm1, mem(rcx, r12, 1)) + vmovhpd(xmm1, mem(rcx, r13, 1)) + add(rdi, rcx) // c += cs_c; + + vextractf128(imm(1), ymm13, xmm1) + vmovlpd(xmm13, mem(rcx)) // store to c02:c32 + vmovhpd(xmm13, mem(rcx, rsi, 1)) + vmovlpd(xmm1, mem(rcx, r12, 1)) + vmovhpd(xmm1, mem(rcx, r13, 1)) + add(rdi, rcx) // c += cs_c; + + vextractf128(imm(1), ymm15, xmm1) + vmovlpd(xmm15, mem(rcx)) // store to c03:c33 + vmovhpd(xmm15, mem(rcx, rsi, 1)) + vmovlpd(xmm1, mem(rcx, r12, 1)) + vmovhpd(xmm1, mem(rcx, r13, 1)) + + // update c40:c73 + + vextractf128(imm(1), ymm8, xmm1) + vmovlpd(xmm8, mem(rdx)) // store to c40:c70 + vmovhpd(xmm8, mem(rdx, rsi, 1)) + vmovlpd(xmm1, mem(rdx, r12, 1)) + vmovhpd(xmm1, mem(rdx, r13, 1)) + add(rdi, rdx) // c += cs_c; + + vextractf128(imm(1), ymm10, xmm1) + vmovlpd(xmm10, mem(rdx)) // store to c41:c71 + vmovhpd(xmm10, mem(rdx, rsi, 1)) + vmovlpd(xmm1, mem(rdx, r12, 1)) + vmovhpd(xmm1, mem(rdx, r13, 1)) + add(rdi, rdx) // c += cs_c; + + vextractf128(imm(1), ymm12, xmm1) + vmovlpd(xmm12, mem(rdx)) // store to c42:c72 + vmovhpd(xmm12, mem(rdx, rsi, 1)) + vmovlpd(xmm1, mem(rdx, r12, 1)) + vmovhpd(xmm1, mem(rdx, r13, 1)) + add(rdi, rdx) // c += cs_c; + + vextractf128(imm(1), ymm14, xmm1) + vmovlpd(xmm14, mem(rdx)) // store to c43:c73 + vmovhpd(xmm14, mem(rdx, rsi, 1)) + vmovlpd(xmm1, mem(rdx, r12, 1)) + vmovhpd(xmm1, mem(rdx, r13, 1)) + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORBZ) + // update c00:c33 + + vmovupd(ymm9, mem(rcx)) // store c00:c30 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm11, mem(rcx)) // store c01:c31 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm13, mem(rcx)) // store c02:c32 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm15, mem(rcx)) // store c03:c33 + + // update c40:c73 + + vmovupd(ymm8, mem(rdx)) // store c40:c70 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm10, mem(rdx)) // store c41:c71 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm12, mem(rdx)) // store c42:c72 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm14, mem(rdx)) // store c43:c73 + + + + + + label(.DDONE) + + vzeroupper() + + vzeroupper() + : // output operands (none) : // input operands @@ -1722,918 +1725,918 @@ void bli_cgemm_sandybridge_asm_8x4 __asm__ volatile ( - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - "movq %9, %%r15 \n\t" // load address of b_next. - //"movq %10, %%r14 \n\t" // load address of a_next. 
- "addq $-4 * 64, %%r15 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading - "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(scomplex) - "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c; - " \n\t" - "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c - "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c - " \n\t" - "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" - "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".CLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - "addq $4 * 4 * 8, %%r15 \n\t" // b_next += 4*4 (unroll x nr) - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 8 * 32(%%rax) \n\t" - "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t" - "prefetcht0 0 * 32(%%r15) \n\t" // prefetch b_next[0*4] - " \n\t" - "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" - "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 1 - "prefetcht0 10 * 32(%%rax) \n\t" - "vmovaps 3 * 32(%%rax), %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, 
%%ymm2, %%ymm4 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovshdup 1 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovsldup 2 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovaps 4 * 32(%%rax), %%ymm0 \n\t" - "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 12 * 32(%%rax) \n\t" - "vmovaps 5 * 32(%%rax), %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovshdup 2 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t" - "prefetcht0 2 * 32(%%r15) \n\t" // prefetch b_next[2*4] - " \n\t" - "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovsldup 3 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovaps 6 * 32(%%rax), %%ymm0 \n\t" - 
"vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 3 - "prefetcht0 14 * 32(%%rax) \n\t" - "vmovaps 7 * 32(%%rax), %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovshdup 3 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovsldup 4 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovaps 8 * 32(%%rax), %%ymm0 \n\t" - "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - "addq $8 * 4 * 8, %%rax \n\t" // a += 8*4 (unroll x mr) - "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .CLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".CCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. 
- " \n\t" - " \n\t" - ".CLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 8 * 32(%%rax) \n\t" - "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" - "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t" - "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" - "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t" - "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t" - "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - "addq $8 * 1 * 8, %%rax \n\t" // a += 8 (1 x mr) - "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .CLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".CPOSTACCUM: \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab21 ab20 ab23 ab22 - " \n\t" // ab31 ab30 ab33 ab32 - " \n\t" // ab42 ab43 ab40 ab41 - " \n\t" // ab52 ab53 ab50 ab51 - " \n\t" // ab63 ab62 ab61 ab60 - " \n\t" // ab73 ) ab72 ) ab71 ) ab70 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83 - " \n\t" // ab90 ab91 ab92 ab93 - " \n\t" // aba1 aba0 aba3 aba2 - " \n\t" // abb1 abb0 abb3 abb2 - " \n\t" // abc2 abc3 abc0 abc1 - " \n\t" // abd2 abd3 abd0 abd1 - " \n\t" // abe3 abe2 abe1 abe0 - " \n\t" // abf3 abf2 abf1 abf0 ) - " \n\t" - "vmovaps %%ymm15, %%ymm7 \n\t" - "vshufps $0xe4, %%ymm13, %%ymm15, %%ymm15 \n\t" - "vshufps $0xe4, %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmovaps %%ymm11, %%ymm7 \n\t" - "vshufps $0xe4, %%ymm9, %%ymm11, %%ymm11 \n\t" - "vshufps $0xe4, %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmovaps %%ymm14, %%ymm7 \n\t" - "vshufps $0xe4, %%ymm12, %%ymm14, %%ymm14 \n\t" - "vshufps $0xe4, %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmovaps %%ymm10, %%ymm7 \n\t" - "vshufps $0xe4, %%ymm8, %%ymm10, %%ymm10 \n\t" - "vshufps $0xe4, %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab20 ab21 ab22 ab23 - " \n\t" // ab30 ab31 ab32 ab33 - " \n\t" // ab42 ab43 ab40 ab41 - " \n\t" // ab52 ab53 ab50 ab51 - " \n\t" // ab62 ab63 ab60 ab61 - " \n\t" // ab72 ) ab73 ) ab70 ) ab71 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83 - " \n\t" // ab90 ab91 ab92 ab93 - " \n\t" // aba0 aba1 aba2 aba3 - " \n\t" // abb0 abb1 abb2 abb3 - " \n\t" // abc2 abc3 abc0 abc1 - " \n\t" // abd2 abd3 abd0 abd1 - " \n\t" // abe2 abe3 abe0 abe1 - " \n\t" // abf2 ) abf3 ) abf0 ) abf1 ) - " \n\t" - "vmovaps %%ymm15, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm15, %%ymm11, %%ymm15 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "vmovaps %%ymm13, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm13, %%ymm9, %%ymm13 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmovaps %%ymm14, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm14, %%ymm10, %%ymm14 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmovaps %%ymm12, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm12, %%ymm8, %%ymm12 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab20 ab21 ab22 ab23 - " \n\t" // ab30 ab31 ab32 ab33 - " \n\t" // ab40 ab41 ab42 ab43 - " \n\t" // ab50 ab51 ab52 ab53 - " \n\t" // ab60 ab61 ab62 ab63 - " \n\t" // ab70 ) ab71 ) ab72 ) ab73 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83 - " \n\t" // ab90 ab91 ab92 ab93 - " \n\t" // aba0 aba1 aba2 aba3 - " \n\t" // abb0 abb1 abb2 abb3 - " \n\t" // abc0 abc1 abc2 abc3 - " \n\t" // abd0 abd1 abd2 abd3 - " \n\t" // abe0 abe1 abe2 abe3 - " \n\t" // abf0 ) abf1 ) abf2 ) abf3 ) - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // scale by alpha - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "vbroadcastss (%%rax), %%ymm7 \n\t" // load alpha_r and duplicate - "vbroadcastss 4(%%rax), %%ymm6 \n\t" // load alpha_i and duplicate - " \n\t" - "vpermilps $0xb1, %%ymm15, %%ymm3 \n\t" - "vmulps %%ymm7, %%ymm15, %%ymm15 \n\t" - "vmulps %%ymm6, 
%%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm15, %%ymm15 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm14, %%ymm2 \n\t" - "vmulps %%ymm7, %%ymm14, %%ymm14 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm14, %%ymm14 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm13, %%ymm1 \n\t" - "vmulps %%ymm7, %%ymm13, %%ymm13 \n\t" - "vmulps %%ymm6, %%ymm1, %%ymm1 \n\t" - "vaddsubps %%ymm1, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm12, %%ymm0 \n\t" - "vmulps %%ymm7, %%ymm12, %%ymm12 \n\t" - "vmulps %%ymm6, %%ymm0, %%ymm0 \n\t" - "vaddsubps %%ymm0, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm11, %%ymm3 \n\t" - "vmulps %%ymm7, %%ymm11, %%ymm11 \n\t" - "vmulps %%ymm6, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm11, %%ymm11 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm10, %%ymm2 \n\t" - "vmulps %%ymm7, %%ymm10, %%ymm10 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm9, %%ymm1 \n\t" - "vmulps %%ymm7, %%ymm9, %%ymm9 \n\t" - "vmulps %%ymm6, %%ymm1, %%ymm1 \n\t" - "vaddsubps %%ymm1, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm8, %%ymm0 \n\t" - "vmulps %%ymm7, %%ymm8, %%ymm8 \n\t" - "vmulps %%ymm6, %%ymm0, %%ymm0 \n\t" - "vaddsubps %%ymm0, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastss (%%rbx), %%ymm7 \n\t" // load beta_r and duplicate - "vbroadcastss 4(%%rbx), %%ymm6 \n\t" // load beta_i and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(scomplex) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; - " \n\t" - "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c; - "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomiss %%xmm0, %%xmm7 \n\t" // set ZF if beta_r == 0. - "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); - "vucomiss %%xmm0, %%xmm6 \n\t" // set ZF if beta_i == 0. - "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); - "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. - "jne .CBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. 
- "jz .CCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".CGENSTORED: \n\t" - " \n\t" - " \n\t" // update c00:c70 - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c00,10) into xmm0[0:1] - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c20,30) into xmm0[2:3] - "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c40,50) into xmm2[0:1] - "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c60,70) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c00,c10) - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c20,c30) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c40,c50) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c60,c70) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c80:cf0 - " \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c80,90) into xmm0[0:1] - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca0,b0) into xmm0[2:3] - "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc0,d0) into xmm2[0:1] - "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce0,f0) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c80,c90) - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca0,cb0) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc0,cd0) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce0,cf0) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c01:c71 - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c01,11) into xmm0[0:1] - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c21,31) into xmm0[2:3] - "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c41,51) into xmm2[0:1] - "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c61,71) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c01,c11) - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c21,c31) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c41,c51) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c61,c71) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c81:cf1 - " \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c81,91) into xmm0[0:1] - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca1,b1) into xmm0[2:3] - "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc1,d1) into xmm2[0:1] - "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce1,f1) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, 
%%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c81,c91) - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca1,cb1) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc1,cd1) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce1,cf1) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c72 - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c02,12) into xmm0[0:1] - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c22,32) into xmm0[2:3] - "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c42,52) into xmm2[0:1] - "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c62,72) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c02,c12) - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c22,c32) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c42,c52) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c62,c72) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c82:cf2 - " \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c82,92) into xmm0[0:1] - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca2,b2) into xmm0[2:3] - "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc2,d2) into xmm2[0:1] - "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce2,f2) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c82,c92) - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca2,cb2) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc2,cd2) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce2,cf2) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c73 - " \n\t" - "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c03,13) into xmm0[0:1] - "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c23,33) into xmm0[2:3] - "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c43,53) into xmm2[0:1] - "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c63,73) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c03,c13) - "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c23,c33) - "vmovlpd %%xmm2, (%%rcx,%%r12) 
\n\t" // store (c43,c53) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c63,c73) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c83:cf3 - " \n\t" - "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c83,93) into xmm0[0:1] - "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca3,b3) into xmm0[2:3] - "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc3,d3) into xmm2[0:1] - "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce3,f3) into xmm2[2:3] - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2) - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c83,c93) - "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca3,cb3) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc3,cd3) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce3,cf3) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CCOLSTORED: \n\t" - " \n\t" - " \n\t" // update c00:c70 - " \n\t" - "vmovups (%%rcx), %%ymm0 \n\t" // load c00:c70 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovups %%ymm0, (%%rcx) \n\t" // store c00:c70 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c80:cf0 - " \n\t" - "vmovups (%%rdx), %%ymm0 \n\t" // load c80:f0 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovups %%ymm0, (%%rdx) \n\t" // store c80:cf0 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c00:c70 - " \n\t" - "vmovups (%%rcx), %%ymm0 \n\t" // load c01:c71 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovups %%ymm0, (%%rcx) \n\t" // store c01:c71 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c81:cf1 - " \n\t" - "vmovups (%%rdx), %%ymm0 \n\t" // load c81:f1 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovups %%ymm0, (%%rdx) \n\t" // store c81:cf1 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c72 - " \n\t" - "vmovups (%%rcx), %%ymm0 \n\t" // load c02:c72 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovups %%ymm0, (%%rcx) \n\t" // store c02:c72 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c82:cf2 
- " \n\t" - "vmovups (%%rdx), %%ymm0 \n\t" // load c82:f2 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovups %%ymm0, (%%rdx) \n\t" // store c82:cf2 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c73 - " \n\t" - "vmovups (%%rcx), %%ymm0 \n\t" // load c03:c73 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovups %%ymm0, (%%rcx) \n\t" // store c03:c73 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c83:cf3 - " \n\t" - "vmovups (%%rdx), %%ymm0 \n\t" // load c83:f3 into ymm0 - "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovups %%ymm0, (%%rdx) \n\t" // store c83:cf3 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CBETAZERO: \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "jz .CCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".CGENSTORBZ: \n\t" - " \n\t" - " \n\t" // update c00:c70 - " \n\t" - "vextractf128 $1, %%ymm15, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm15, (%%rcx) \n\t" // store (c00,c10) - "vmovhpd %%xmm15, (%%rcx,%%rsi) \n\t" // store (c20,c30) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c40,c50) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c60,c70) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c80:cf0 - " \n\t" - "vextractf128 $1, %%ymm14, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm14, (%%rdx) \n\t" // store (c80,c90) - "vmovhpd %%xmm14, (%%rdx,%%rsi) \n\t" // store (ca0,cb0) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc0,cd0) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce0,cf0) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c01:c71 - " \n\t" - "vextractf128 $1, %%ymm13, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm13, (%%rcx) \n\t" // store (c01,c11) - "vmovhpd %%xmm13, (%%rcx,%%rsi) \n\t" // store (c21,c31) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c41,c51) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c61,c71) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c81:cf1 - " \n\t" - "vextractf128 $1, %%ymm12, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm12, (%%rdx) \n\t" // store (c81,c91) - "vmovhpd %%xmm12, (%%rdx,%%rsi) \n\t" // store (ca1,cb1) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc1,cd1) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce1,cf1) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c72 - " \n\t" - "vextractf128 $1, %%ymm11, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm11, (%%rcx) \n\t" // store (c02,c12) - "vmovhpd %%xmm11, (%%rcx,%%rsi) \n\t" // store (c22,c32) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c42,c52) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c62,c72) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c82:cf2 - " \n\t" - 
"vextractf128 $1, %%ymm10, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm10, (%%rdx) \n\t" // store (c82,c92) - "vmovhpd %%xmm10, (%%rdx,%%rsi) \n\t" // store (ca2,cb2) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc2,cd2) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce2,cf2) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c73 - " \n\t" - "vextractf128 $1, %%ymm9, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm9, (%%rcx) \n\t" // store (c03,c13) - "vmovhpd %%xmm9, (%%rcx,%%rsi) \n\t" // store (c23,c33) - "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c43,c53) - "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c63,c73) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c83:cf3 - " \n\t" - "vextractf128 $1, %%ymm8, %%xmm2 \n\t" // xmm2 := ymm0[4:7] - "vmovlpd %%xmm8, (%%rdx) \n\t" // store (c83,c93) - "vmovhpd %%xmm8, (%%rdx,%%rsi) \n\t" // store (ca3,cb3) - "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc3,cd3) - "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce3,cf3) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm15, (%%rcx) \n\t" // store c00:c70 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm14, (%%rdx) \n\t" // store c80:cf0 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm13, (%%rcx) \n\t" // store c01:c71 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm12, (%%rdx) \n\t" // store c81:cf1 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm11, (%%rcx) \n\t" // store c02:c72 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm10, (%%rdx) \n\t" // store c82:cf2 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm9, (%%rcx) \n\t" // store c03:c73 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovups %%ymm8, (%%rdx) \n\t" // store c83:cf3 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".CDONE: \n\t" - " \n\t" - "vzeroupper \n\t" - " \n\t" - "vzeroupper \n\t" - " \n\t" + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + mov(%9, r15) // load address of b_next. + //mov(%10, r14) // load address of a_next. + sub(imm(4*64), r15) + + vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading + vmovsldup(mem(rbx, 0*32), ymm2) + vpermilps(imm(0x4e), ymm2, ymm3) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) + lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; + + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c + prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c + + vxorps(ymm8, ymm8, ymm8) + vxorps(ymm9, ymm9, ymm9) + vxorps(ymm10, ymm10, ymm10) + vxorps(ymm11, ymm11, ymm11) + vxorps(ymm12, ymm12, ymm12) + vxorps(ymm13, ymm13, ymm13) + vxorps(ymm14, ymm14, ymm14) + vxorps(ymm15, ymm15, ymm15) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.CCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.CLOOPKITER) // MAIN LOOP + + add(imm(4*4*8), r15) // b_next += 4*4 (unroll x nr) + + // iteration 0 + prefetch(0, mem(rax, 8*32)) + vmovaps(mem(rax, 1*32), ymm1) + vmulps(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulps(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm6, ymm15, ymm15) + vaddps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovshdup(mem(rbx, 0*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddps(ymm6, ymm14, ymm14) + vaddps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vpermilps(imm(0xb1), ymm0, ymm0) + vaddps(ymm6, ymm11, ymm11) + vaddps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulps(ymm1, ymm5, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm6, ymm10, ymm10) + vaddps(ymm7, ymm8, ymm8) + prefetch(0, mem(r15, 0*32)) // prefetch b_next[0*4] + + vpermilps(imm(0xb1), ymm1, ymm1) + vmulps(ymm0, ymm2, ymm6) + vmulps(ymm0, ymm3, ymm7) + vaddsubps(ymm6, ymm15, ymm15) + vaddsubps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovsldup(mem(rbx, 1*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddsubps(ymm6, ymm14, ymm14) + vaddsubps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vmovaps(mem(rax, 2*32), ymm0) + vaddsubps(ymm6, ymm11, ymm11) + vaddsubps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddsubps(ymm6, ymm10, ymm10) + vaddsubps(ymm7, ymm8, ymm8) + + + // iteration 1 + prefetch(0, mem(rax, 10*32)) + vmovaps(mem(rax, 3*32), ymm1) + vmulps(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulps(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm6, ymm15, ymm15) + vaddps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovshdup(mem(rbx, 1*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddps(ymm6, ymm14, ymm14) + vaddps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vpermilps(imm(0xb1), ymm0, ymm0) + vaddps(ymm6, ymm11, ymm11) + vaddps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulps(ymm1, ymm5, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm6, ymm10, ymm10) + vaddps(ymm7, ymm8, ymm8) + + vpermilps(imm(0xb1), ymm1, ymm1) + vmulps(ymm0, ymm2, ymm6) + vmulps(ymm0, ymm3, ymm7) + vaddsubps(ymm6, ymm15, ymm15) + vaddsubps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovsldup(mem(rbx, 2*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddsubps(ymm6, ymm14, ymm14) + vaddsubps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vmovaps(mem(rax, 4*32), ymm0) + vaddsubps(ymm6, ymm11, ymm11) + vaddsubps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddsubps(ymm6, ymm10, ymm10) + vaddsubps(ymm7, ymm8, ymm8) + + + // iteration 2 + prefetch(0, mem(rax, 12*32)) + vmovaps(mem(rax, 5*32), ymm1) + vmulps(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulps(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm6, ymm15, ymm15) + vaddps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovshdup(mem(rbx, 2*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddps(ymm6, ymm14, ymm14) + vaddps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vpermilps(imm(0xb1), ymm0, ymm0) + vaddps(ymm6, ymm11, ymm11) 
+ vaddps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulps(ymm1, ymm5, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm6, ymm10, ymm10) + vaddps(ymm7, ymm8, ymm8) + prefetch(0, mem(r15, 2*32)) // prefetch b_next[2*4] + + vpermilps(imm(0xb1), ymm1, ymm1) + vmulps(ymm0, ymm2, ymm6) + vmulps(ymm0, ymm3, ymm7) + vaddsubps(ymm6, ymm15, ymm15) + vaddsubps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovsldup(mem(rbx, 3*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddsubps(ymm6, ymm14, ymm14) + vaddsubps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vmovaps(mem(rax, 6*32), ymm0) + vaddsubps(ymm6, ymm11, ymm11) + vaddsubps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddsubps(ymm6, ymm10, ymm10) + vaddsubps(ymm7, ymm8, ymm8) + + + // iteration 3 + prefetch(0, mem(rax, 14*32)) + vmovaps(mem(rax, 7*32), ymm1) + vmulps(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulps(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm6, ymm15, ymm15) + vaddps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovshdup(mem(rbx, 3*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddps(ymm6, ymm14, ymm14) + vaddps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vpermilps(imm(0xb1), ymm0, ymm0) + vaddps(ymm6, ymm11, ymm11) + vaddps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulps(ymm1, ymm5, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm6, ymm10, ymm10) + vaddps(ymm7, ymm8, ymm8) + + vpermilps(imm(0xb1), ymm1, ymm1) + vmulps(ymm0, ymm2, ymm6) + vmulps(ymm0, ymm3, ymm7) + vaddsubps(ymm6, ymm15, ymm15) + vaddsubps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovsldup(mem(rbx, 4*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddsubps(ymm6, ymm14, ymm14) + vaddsubps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vmovaps(mem(rax, 8*32), ymm0) + vaddsubps(ymm6, ymm11, ymm11) + vaddsubps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddsubps(ymm6, ymm10, ymm10) + vaddsubps(ymm7, ymm8, ymm8) + + + add(imm(8*4*8), rax) // a += 8*4 (unroll x mr) + add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) + + + dec(rsi) // i -= 1; + jne(.CLOOPKITER) // iterate again if i != 0. + + + + + + + label(.CCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.CPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
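As the operand comments note, %0 carries k_iter and %1 carries k_left: the main loop retires four rank-1 updates per trip while the edge loop below retires one, which is why their pointer bumps differ (add(imm(8*4*8), rax) versus add(imm(8*1*8), rax)). The plain-C reference below is hypothetical and assumes the caller passes k_iter = k/4 and k_left = k%4 for this mr = 8, nr = 4 scomplex kernel.

    #include <complex.h>

    typedef long dim_t;   /* stand-in for BLIS's dim_t */

    /* Scalar reference for the k-loop structure: a 4x-unrolled main loop
       plus a remainder ("edge") loop, both performing ab += a * b rank-1
       updates on the 8x4 micro-tile.                                     */
    void cgemm_8x4_ref(dim_t k_iter, dim_t k_left,
                       const float _Complex *a,  /* packed: 8 per k step  */
                       const float _Complex *b,  /* packed: 4 per k step  */
                       float _Complex ab[8][4])
    {
        for (dim_t i = 0; i < k_iter; ++i)       /* MAIN LOOP             */
            for (int u = 0; u < 4; ++u) {        /* unrolled by 4         */
                for (int m = 0; m < 8; ++m)
                    for (int n = 0; n < 4; ++n)
                        ab[m][n] += a[m] * b[n]; /* one rank-1 update     */
                a += 8;                          /* add(imm(8*4*8), rax)
                                                    covers all four steps */
                b += 4;                          /* add(imm(4*4*8), rbx)  */
            }

        for (dim_t i = 0; i < k_left; ++i) {     /* EDGE LOOP             */
            for (int m = 0; m < 8; ++m)
                for (int n = 0; n < 4; ++n)
                    ab[m][n] += a[m] * b[n];
            a += 8;                              /* add(imm(8*1*8), rax)  */
            b += 4;                              /* add(imm(4*1*8), rbx)  */
        }
    }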
+ + + label(.CLOOPKLEFT) // EDGE LOOP + + // iteration 0 + prefetch(0, mem(rax, 8*32)) + vmovaps(mem(rax, 1*32), ymm1) + vmulps(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulps(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm6, ymm15, ymm15) + vaddps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovshdup(mem(rbx, 0*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddps(ymm6, ymm14, ymm14) + vaddps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vpermilps(imm(0xb1), ymm0, ymm0) + vaddps(ymm6, ymm11, ymm11) + vaddps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulps(ymm1, ymm5, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddps(ymm6, ymm10, ymm10) + vaddps(ymm7, ymm8, ymm8) + + vpermilps(imm(0xb1), ymm1, ymm1) + vmulps(ymm0, ymm2, ymm6) + vmulps(ymm0, ymm3, ymm7) + vaddsubps(ymm6, ymm15, ymm15) + vaddsubps(ymm7, ymm13, ymm13) + + vmulps(ymm1, ymm2, ymm6) + vmovsldup(mem(rbx, 1*32), ymm2) + vmulps(ymm1, ymm3, ymm7) + vpermilps(imm(0x4e), ymm2, ymm3) + vaddsubps(ymm6, ymm14, ymm14) + vaddsubps(ymm7, ymm12, ymm12) + + vmulps(ymm0, ymm4, ymm6) + vmulps(ymm0, ymm5, ymm7) + vmovaps(mem(rax, 2*32), ymm0) + vaddsubps(ymm6, ymm11, ymm11) + vaddsubps(ymm7, ymm9, ymm9) + + vmulps(ymm1, ymm4, ymm6) + vmulps(ymm1, ymm5, ymm7) + vaddsubps(ymm6, ymm10, ymm10) + vaddsubps(ymm7, ymm8, ymm8) + + + add(imm(8*1*8), rax) // a += 8 (1 x mr) + add(imm(4*1*8), rbx) // b += 4 (1 x nr) + + + dec(rsi) // i -= 1; + jne(.CLOOPKLEFT) // iterate again if i != 0. + + + + label(.CPOSTACCUM) + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab21 ab20 ab23 ab22 + // ab31 ab30 ab33 ab32 + // ab42 ab43 ab40 ab41 + // ab52 ab53 ab50 ab51 + // ab63 ab62 ab61 ab60 + // ab73 ) ab72 ) ab71 ) ab70 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 ab93 + // aba1 aba0 aba3 aba2 + // abb1 abb0 abb3 abb2 + // abc2 abc3 abc0 abc1 + // abd2 abd3 abd0 abd1 + // abe3 abe2 abe1 abe0 + // abf3 abf2 abf1 abf0 ) + + vmovaps(ymm15, ymm7) + vshufps(imm(0xe4), ymm13, ymm15, ymm15) + vshufps(imm(0xe4), ymm7, ymm13, ymm13) + + vmovaps(ymm11, ymm7) + vshufps(imm(0xe4), ymm9, ymm11, ymm11) + vshufps(imm(0xe4), ymm7, ymm9, ymm9) + + vmovaps(ymm14, ymm7) + vshufps(imm(0xe4), ymm12, ymm14, ymm14) + vshufps(imm(0xe4), ymm7, ymm12, ymm12) + + vmovaps(ymm10, ymm7) + vshufps(imm(0xe4), ymm8, ymm10, ymm10) + vshufps(imm(0xe4), ymm7, ymm8, ymm8) + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ab31 ab32 ab33 + // ab42 ab43 ab40 ab41 + // ab52 ab53 ab50 ab51 + // ab62 ab63 ab60 ab61 + // ab72 ) ab73 ) ab70 ) ab71 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 ab93 + // aba0 aba1 aba2 aba3 + // abb0 abb1 abb2 abb3 + // abc2 abc3 abc0 abc1 + // abd2 abd3 abd0 abd1 + // abe2 abe3 abe0 abe1 + // abf2 ) abf3 ) abf0 ) abf1 ) + + vmovaps(ymm15, ymm7) + vperm2f128(imm(0x12), ymm15, ymm11, ymm15) + vperm2f128(imm(0x30), ymm7, ymm11, ymm11) + + vmovaps(ymm13, ymm7) + vperm2f128(imm(0x12), ymm13, ymm9, ymm13) + vperm2f128(imm(0x30), ymm7, ymm9, ymm9) + + vmovaps(ymm14, ymm7) + vperm2f128(imm(0x12), ymm14, ymm10, ymm14) + vperm2f128(imm(0x30), ymm7, ymm10, ymm10) + + vmovaps(ymm12, ymm7) + vperm2f128(imm(0x12), ymm12, ymm8, ymm12) + vperm2f128(imm(0x30), ymm7, ymm8, ymm8) + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( 
ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ab31 ab32 ab33 + // ab40 ab41 ab42 ab43 + // ab50 ab51 ab52 ab53 + // ab60 ab61 ab62 ab63 + // ab70 ) ab71 ) ab72 ) ab73 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 ab93 + // aba0 aba1 aba2 aba3 + // abb0 abb1 abb2 abb3 + // abc0 abc1 abc2 abc3 + // abd0 abd1 abd2 abd3 + // abe0 abe1 abe2 abe3 + // abf0 ) abf1 ) abf2 ) abf3 ) + + + + + // scale by alpha + + mov(%4, rax) // load address of alpha + vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate + vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate + + vpermilps(imm(0xb1), ymm15, ymm3) + vmulps(ymm7, ymm15, ymm15) + vmulps(ymm6, ymm3, ymm3) + vaddsubps(ymm3, ymm15, ymm15) + + vpermilps(imm(0xb1), ymm14, ymm2) + vmulps(ymm7, ymm14, ymm14) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm14, ymm14) + + vpermilps(imm(0xb1), ymm13, ymm1) + vmulps(ymm7, ymm13, ymm13) + vmulps(ymm6, ymm1, ymm1) + vaddsubps(ymm1, ymm13, ymm13) + + vpermilps(imm(0xb1), ymm12, ymm0) + vmulps(ymm7, ymm12, ymm12) + vmulps(ymm6, ymm0, ymm0) + vaddsubps(ymm0, ymm12, ymm12) + + vpermilps(imm(0xb1), ymm11, ymm3) + vmulps(ymm7, ymm11, ymm11) + vmulps(ymm6, ymm3, ymm3) + vaddsubps(ymm3, ymm11, ymm11) + + vpermilps(imm(0xb1), ymm10, ymm2) + vmulps(ymm7, ymm10, ymm10) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm10, ymm10) + + vpermilps(imm(0xb1), ymm9, ymm1) + vmulps(ymm7, ymm9, ymm9) + vmulps(ymm6, ymm1, ymm1) + vaddsubps(ymm1, ymm9, ymm9) + + vpermilps(imm(0xb1), ymm8, ymm0) + vmulps(ymm7, ymm8, ymm8) + vmulps(ymm6, ymm0, ymm0) + vaddsubps(ymm0, ymm8, ymm8) + + + + + mov(%5, rbx) // load address of beta + vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate + vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate + + + + + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex) + + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; + lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c; + + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm7) // set ZF if beta_r == 0. + sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); + vucomiss(xmm0, xmm6) // set ZF if beta_i == 0. + sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); + and(r8b, r9b) // set ZF if r8b & r9b == 1. + jne(.CBETAZERO) // if ZF = 0, jump to beta == 0 case + + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. 
+ jz(.CCOLSTORED) // jump to column storage case + + + + label(.CGENSTORED) + + // update c00:c70 + + vmovlpd(mem(rcx), xmm0, xmm0) // load (c00,10) into xmm0[0:1] + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c20,30) into xmm0[2:3] + vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c40,50) into xmm2[0:1] + vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c60,70) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rcx)) // store (c00,c10) + vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c20,c30) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70) + add(rdi, rcx) // c += cs_c; + + // update c80:cf0 + + vmovlpd(mem(rdx), xmm0, xmm0) // load (c80,90) into xmm0[0:1] + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca0,b0) into xmm0[2:3] + vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc0,d0) into xmm2[0:1] + vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce0,f0) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rdx)) // store (c80,c90) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca0,cb0) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0) + add(rdi, rdx) // c += cs_c; + + // update c01:c71 + + vmovlpd(mem(rcx), xmm0, xmm0) // load (c01,11) into xmm0[0:1] + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c21,31) into xmm0[2:3] + vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c41,51) into xmm2[0:1] + vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c61,71) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rcx)) // store (c01,c11) + vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c21,c31) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71) + add(rdi, rcx) // c += cs_c; + + // update c81:cf1 + + vmovlpd(mem(rdx), xmm0, xmm0) // load (c81,91) into xmm0[0:1] + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca1,b1) into xmm0[2:3] + vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc1,d1) into xmm2[0:1] + vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce1,f1) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rdx)) // store (c81,c91) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca1,cb1) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1) + add(rdi, rdx) // c += cs_c; + + // update c02:c72 + + 
vmovlpd(mem(rcx), xmm0, xmm0) // load (c02,12) into xmm0[0:1] + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c22,32) into xmm0[2:3] + vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c42,52) into xmm2[0:1] + vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c62,72) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rcx)) // store (c02,c12) + vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c22,c32) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72) + add(rdi, rcx) // c += cs_c; + + // update c82:cf2 + + vmovlpd(mem(rdx), xmm0, xmm0) // load (c82,92) into xmm0[0:1] + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca2,b2) into xmm0[2:3] + vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc2,d2) into xmm2[0:1] + vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce2,f2) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rdx)) // store (c82,c92) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca2,cb2) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2) + add(rdi, rdx) // c += cs_c; + + // update c03:c73 + + vmovlpd(mem(rcx), xmm0, xmm0) // load (c03,13) into xmm0[0:1] + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c23,33) into xmm0[2:3] + vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c43,53) into xmm2[0:1] + vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c63,73) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rcx)) // store (c03,c13) + vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c23,c33) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73) + add(rdi, rcx) // c += cs_c; + + // update c83:cf3 + + vmovlpd(mem(rdx), xmm0, xmm0) // load (c83,93) into xmm0[0:1] + vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca3,b3) into xmm0[2:3] + vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc3,d3) into xmm2[0:1] + vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce3,f3) into xmm2[2:3] + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm0, mem(rdx)) // store (c83,c93) + vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca3,cb3) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3) + add(rdi, rdx) // c += cs_c; + + + + jmp(.CDONE) // jump to end. 
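The .CGENSTORED path above handles general strides (rs_c != 1): each column of the 8x4 block is gathered two complex numbers at a time with vmovlpd/vmovhpd, scaled by beta through the same permute-multiply-addsub pattern used for alpha, combined with the accumulated product, and scattered back out. One column of that update in scalar form; the helper below is a sketch, with rs_c and cs_c in units of scomplex elements and ab already holding alpha*(A*B).

    #include <complex.h>

    typedef long inc_t;   /* stand-in for BLIS's inc_t */

    /* Scalar model of one .CGENSTORED column update: c_j = beta*c_j + ab_j */
    static void gen_stored_col(float _Complex *c, inc_t rs_c, inc_t cs_c,
                               int j, const float _Complex ab[8][4],
                               float _Complex beta)
    {
        for (int i = 0; i < 8; ++i) {
            float _Complex t = c[i*rs_c + j*cs_c]; /* vmovlpd/vmovhpd gather */
            t  = beta * t;                /* vpermilps + vmulps + vaddsubps  */
            t += ab[i][j];                /* vaddps with the scaled result   */
            c[i*rs_c + j*cs_c] = t;       /* vmovlpd/vmovhpd scatter         */
        }
    }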
+
+
+
+	label(.CCOLSTORED)
+
+	// update c00:c70
+
+	vmovups(mem(rcx), ymm0)            // load c00:c70 into ymm0
+	vpermilps(imm(0xb1), ymm0, ymm2)   // scale ymm0 by beta
+	vmulps(ymm7, ymm0, ymm0)
+	vmulps(ymm6, ymm2, ymm2)
+	vaddsubps(ymm2, ymm0, ymm0)
+	vaddps(ymm15, ymm0, ymm0)          // add the gemm result to ymm0
+	vmovups(ymm0, mem(rcx))            // store c00:c70
+	add(rdi, rcx)                      // c += cs_c;
+
+	// update c80:cf0
+
+	vmovups(mem(rdx), ymm0)            // load c80:cf0 into ymm0
+	vpermilps(imm(0xb1), ymm0, ymm2)   // scale ymm0 by beta
+	vmulps(ymm7, ymm0, ymm0)
+	vmulps(ymm6, ymm2, ymm2)
+	vaddsubps(ymm2, ymm0, ymm0)
+	vaddps(ymm14, ymm0, ymm0)          // add the gemm result to ymm0
+	vmovups(ymm0, mem(rdx))            // store c80:cf0
+	add(rdi, rdx)                      // c += cs_c;
+
+	// update c01:c71
+
+	vmovups(mem(rcx), ymm0)            // load c01:c71 into ymm0
+	vpermilps(imm(0xb1), ymm0, ymm2)   // scale ymm0 by beta
+	vmulps(ymm7, ymm0, ymm0)
+	vmulps(ymm6, ymm2, ymm2)
+	vaddsubps(ymm2, ymm0, ymm0)
+	vaddps(ymm13, ymm0, ymm0)          // add the gemm result to ymm0
+	vmovups(ymm0, mem(rcx))            // store c01:c71
+	add(rdi, rcx)                      // c += cs_c;
+
+	// update c81:cf1
+
+	vmovups(mem(rdx), ymm0)            // load c81:cf1 into ymm0
+	vpermilps(imm(0xb1), ymm0, ymm2)   // scale ymm0 by beta
+	vmulps(ymm7, ymm0, ymm0)
+	vmulps(ymm6, ymm2, ymm2)
+	vaddsubps(ymm2, ymm0, ymm0)
+	vaddps(ymm12, ymm0, ymm0)          // add the gemm result to ymm0
+	vmovups(ymm0, mem(rdx))            // store c81:cf1
+	add(rdi, rdx)                      // c += cs_c;
+
+	// update c02:c72
+
+	vmovups(mem(rcx), ymm0)            // load c02:c72 into ymm0
+	vpermilps(imm(0xb1), ymm0, ymm2)   // scale ymm0 by beta
+	vmulps(ymm7, ymm0, ymm0)
+	vmulps(ymm6, ymm2, ymm2)
+	vaddsubps(ymm2, ymm0, ymm0)
+	vaddps(ymm11, ymm0, ymm0)          // add the gemm result to ymm0
+	vmovups(ymm0, mem(rcx))            // store c02:c72
+	add(rdi, rcx)                      // c += cs_c;
+
+	// update c82:cf2
+
+	vmovups(mem(rdx), ymm0)            // load c82:cf2 into ymm0
+	vpermilps(imm(0xb1), ymm0, ymm2)   // scale ymm0 by beta
+	vmulps(ymm7, ymm0, ymm0)
+	vmulps(ymm6, ymm2, ymm2)
+	vaddsubps(ymm2, ymm0, ymm0)
+	vaddps(ymm10, ymm0, ymm0)          // add the gemm result to ymm0
+	vmovups(ymm0, mem(rdx))            // store c82:cf2
+	add(rdi, rdx)                      // c += cs_c;
+
+	// update c03:c73
+
+	vmovups(mem(rcx), ymm0)            // load c03:c73 into ymm0
+	vpermilps(imm(0xb1), ymm0, ymm2)   // scale ymm0 by beta
+	vmulps(ymm7, ymm0, ymm0)
+	vmulps(ymm6, ymm2, ymm2)
+	vaddsubps(ymm2, ymm0, ymm0)
+	vaddps(ymm9, ymm0, ymm0)           // add the gemm result to ymm0
+	vmovups(ymm0, mem(rcx))            // store c03:c73
+	add(rdi, rcx)                      // c += cs_c;
+
+	// update c83:cf3
+
+	vmovups(mem(rdx), ymm0)            // load c83:cf3 into ymm0
+	vpermilps(imm(0xb1), ymm0, ymm2)   // scale ymm0 by beta
+	vmulps(ymm7, ymm0, ymm0)
+	vmulps(ymm6, ymm2, ymm2)
+	vaddsubps(ymm2, ymm0, ymm0)
+	vaddps(ymm8, ymm0, ymm0)           // add the gemm result to ymm0
+	vmovups(ymm0, mem(rdx))            // store c83:cf3
+	add(rdi, rdx)                      // c += cs_c;
+
+
+	jmp(.CDONE)                        // jump to end.
+
+
+
+	label(.CBETAZERO)
+
+	cmp(imm(8), rsi)                   // set ZF if (8*rs_c) == 8.
+ jz(.CCOLSTORBZ) // jump to column storage case + + + + label(.CGENSTORBZ) + + // update c00:c70 + + vextractf128(imm(1), ymm15, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm15, mem(rcx)) // store (c00,c10) + vmovhpd(xmm15, mem(rcx, rsi, 1)) // store (c20,c30) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70) + add(rdi, rcx) // c += cs_c; + + // update c80:cf0 + + vextractf128(imm(1), ymm14, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm14, mem(rdx)) // store (c80,c90) + vmovhpd(xmm14, mem(rdx, rsi, 1)) // store (ca0,cb0) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0) + add(rdi, rdx) // c += cs_c; + + // update c01:c71 + + vextractf128(imm(1), ymm13, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm13, mem(rcx)) // store (c01,c11) + vmovhpd(xmm13, mem(rcx, rsi, 1)) // store (c21,c31) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71) + add(rdi, rcx) // c += cs_c; + + // update c81:cf1 + + vextractf128(imm(1), ymm12, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm12, mem(rdx)) // store (c81,c91) + vmovhpd(xmm12, mem(rdx, rsi, 1)) // store (ca1,cb1) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1) + add(rdi, rdx) // c += cs_c; + + // update c02:c72 + + vextractf128(imm(1), ymm11, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm11, mem(rcx)) // store (c02,c12) + vmovhpd(xmm11, mem(rcx, rsi, 1)) // store (c22,c32) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72) + add(rdi, rcx) // c += cs_c; + + // update c82:cf2 + + vextractf128(imm(1), ymm10, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm10, mem(rdx)) // store (c82,c92) + vmovhpd(xmm10, mem(rdx, rsi, 1)) // store (ca2,cb2) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2) + add(rdi, rdx) // c += cs_c; + + // update c03:c73 + + vextractf128(imm(1), ymm9, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm9, mem(rcx)) // store (c03,c13) + vmovhpd(xmm9, mem(rcx, rsi, 1)) // store (c23,c33) + vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53) + vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73) + add(rdi, rcx) // c += cs_c; + + // update c83:cf3 + + vextractf128(imm(1), ymm8, xmm2) // xmm2 := ymm0[4:7] + vmovlpd(xmm8, mem(rdx)) // store (c83,c93) + vmovhpd(xmm8, mem(rdx, rsi, 1)) // store (ca3,cb3) + vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3) + vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3) + add(rdi, rdx) // c += cs_c; + + + + jmp(.CDONE) // jump to end. + + + + label(.CCOLSTORBZ) + + + vmovups(ymm15, mem(rcx)) // store c00:c70 + add(rdi, rcx) // c += cs_c; + + vmovups(ymm14, mem(rdx)) // store c80:cf0 + add(rdi, rdx) // c += cs_c; + + vmovups(ymm13, mem(rcx)) // store c01:c71 + add(rdi, rcx) // c += cs_c; + + vmovups(ymm12, mem(rdx)) // store c81:cf1 + add(rdi, rdx) // c += cs_c; + + vmovups(ymm11, mem(rcx)) // store c02:c72 + add(rdi, rcx) // c += cs_c; + + vmovups(ymm10, mem(rdx)) // store c82:cf2 + add(rdi, rdx) // c += cs_c; + + vmovups(ymm9, mem(rcx)) // store c03:c73 + add(rdi, rcx) // c += cs_c; + + vmovups(ymm8, mem(rdx)) // store c83:cf3 + add(rdi, rdx) // c += cs_c; + + + + + + label(.CDONE) + + vzeroupper() + + vzeroupper() + : // output operands (none) : // input operands @@ -2685,805 +2688,805 @@ void bli_zgemm_sandybridge_asm_4x4 __asm__ volatile ( - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. 
- "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. - //"movq %10, %%r14 \n\t" // load address of a_next. - " \n\t" - "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading - "vmovddup 0 + 0 * 32(%%rbx), %%ymm2 \n\t" - "vmovddup 0 + 1 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(dcomplex) - "leaq (,%%rdi,2), %%rdi \n\t" - "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c; - " \n\t" - "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c - "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c - " \n\t" - "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .ZCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".ZLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddpd %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 8 + 0 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 8 + 1 * 32(%%rbx), %%ymm3 \n\t" - "vaddpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubpd %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 0 + 2 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 0 + 3 * 32(%%rbx), %%ymm3 \n\t" - "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" - "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 1 - "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 
$0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddpd %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "prefetcht0 18 * 32(%%rax) \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 8 + 2 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 8 + 3 * 32(%%rbx), %%ymm3 \n\t" - "vaddpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubpd %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 0 + 4 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 0 + 5 * 32(%%rbx), %%ymm3 \n\t" - "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 4 * 32(%%rax), %%ymm0 \n\t" - "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 2 - "vmovapd 5 * 32(%%rax), %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddpd %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "prefetcht0 20 * 32(%%rax) \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 8 + 4 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 8 + 5 * 32(%%rbx), %%ymm3 \n\t" - "vaddpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubpd %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 0 + 6 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 0 + 7 * 32(%%rbx), %%ymm3 \n\t" - "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" 
- "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 3 - "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddpd %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "prefetcht0 22 * 32(%%rax) \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 8 + 6 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 8 + 7 * 32(%%rbx), %%ymm3 \n\t" - "vaddpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubpd %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 0 + 8 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 0 + 9 * 32(%%rbx), %%ymm3 \n\t" - "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 8 * 32(%%rax), %%ymm0 \n\t" - "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - "addq $4 * 4 * 16, %%rbx \n\t" // b += 4*4 (unroll x nr) - "addq $4 * 4 * 16, %%rax \n\t" // a += 4*4 (unroll x mr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .ZLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".ZCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .ZPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. 
- " \n\t" - " \n\t" - ".ZLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - " \n\t" // iteration 0 - "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddpd %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 8 + 0 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 8 + 1 * 32(%%rbx), %%ymm3 \n\t" - "vaddpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t" - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm15, %%ymm15 \n\t" - "vaddsubpd %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovddup 0 + 2 * 32(%%rbx), %%ymm2 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vmovddup 0 + 3 * 32(%%rbx), %%ymm3 \n\t" - "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t" - "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" - "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t" - "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - "addq $4 * 1 * 16, %%rax \n\t" // a += 4 (1 x mr) - "addq $4 * 1 * 16, %%rbx \n\t" // b += 4 (1 x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .ZLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".ZPOSTACCUM: \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab21 ab20 ab23 ab22 - " \n\t" // ab31 ) ab30 ) ab33 ) ab32 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43 - " \n\t" // ab50 ab51 ab52 ab53 - " \n\t" // ab61 ab60 ab63 ab62 - " \n\t" // ab71 ) ab70 ) ab73 ) ab72 ) - " \n\t" - " \n\t" - "vmovapd %%ymm15, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm15, %%ymm13, %%ymm15 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm11, %%ymm9, %%ymm11 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm14, %%ymm12, %%ymm14 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm7 \n\t" - "vperm2f128 $0x12, %%ymm10, %%ymm8, %%ymm10 \n\t" - "vperm2f128 $0x30, %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab20 ab21 ab22 ab23 - " \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43 - " \n\t" // ab50 ab51 ab52 ab53 - " \n\t" // ab60 ab61 ab62 ab63 - " \n\t" // ab70 ) ab71 ) ab72 ) ab73 ) - " \n\t" - " \n\t" - " \n\t" // scale by alpha - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "vbroadcastsd (%%rax), %%ymm7 \n\t" // load alpha_r and duplicate - "vbroadcastsd 8(%%rax), %%ymm6 \n\t" // load alpha_i and duplicate - " \n\t" - "vpermilpd $0x5, %%ymm15, %%ymm3 \n\t" - "vmulpd %%ymm7, %%ymm15, %%ymm15 \n\t" - "vmulpd %%ymm6, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm15, %%ymm15 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm14, %%ymm2 \n\t" - "vmulpd %%ymm7, %%ymm14, %%ymm14 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm14, %%ymm14 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm13, %%ymm1 \n\t" - "vmulpd %%ymm7, %%ymm13, %%ymm13 \n\t" - "vmulpd %%ymm6, %%ymm1, %%ymm1 \n\t" - "vaddsubpd %%ymm1, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm12, %%ymm0 \n\t" - "vmulpd %%ymm7, %%ymm12, %%ymm12 \n\t" - "vmulpd %%ymm6, %%ymm0, %%ymm0 \n\t" - "vaddsubpd %%ymm0, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm11, %%ymm3 \n\t" - "vmulpd %%ymm7, %%ymm11, %%ymm11 \n\t" - "vmulpd %%ymm6, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm11, %%ymm11 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm10, %%ymm2 \n\t" - "vmulpd %%ymm7, %%ymm10, %%ymm10 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm9, %%ymm1 \n\t" - "vmulpd %%ymm7, %%ymm9, %%ymm9 \n\t" - "vmulpd %%ymm6, %%ymm1, %%ymm1 \n\t" - "vaddsubpd %%ymm1, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm8, %%ymm0 \n\t" - "vmulpd %%ymm7, %%ymm8, %%ymm8 \n\t" - "vmulpd %%ymm6, %%ymm0, %%ymm0 \n\t" - "vaddsubpd %%ymm0, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastsd (%%rbx), %%ymm7 \n\t" // load beta_r and duplicate - "vbroadcastsd 8(%%rbx), %%ymm6 \n\t" // load beta_i and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(dcomplex) - "leaq (,%%rsi,2), %%rsi \n\t" - "leaq (%%rcx,%%rsi,2), %%rdx \n\t" // 
load address of c + 2*rs_c; - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomisd %%xmm0, %%xmm7 \n\t" // set ZF if beta_r == 0. - "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); - "vucomisd %%xmm0, %%xmm6 \n\t" // set ZF if beta_i == 0. - "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); - "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. - "jne .ZBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. - "jz .ZCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".ZGENSTORED: \n\t" - " \n\t" // update c00:c30 - " \n\t" - "vmovupd (%%rcx), %%xmm0 \n\t" // load (c00,c10) into xmm0 - "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c20,c30) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rcx) \n\t" // store (c00,c10) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c20,c30) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c40:c70 - " \n\t" - "vmovupd (%%rdx), %%xmm0 \n\t" // load (c40,c50) into xmm0 - "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c60,c70) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rdx) \n\t" // store (c40,c50) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c60,c70) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c01:c31 - " \n\t" - "vmovupd (%%rcx), %%xmm0 \n\t" // load (c01,c11) into xmm0 - "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c21,c31) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rcx) \n\t" // store (c01,c11) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c21,c31) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c41:c71 - " \n\t" - "vmovupd (%%rdx), %%xmm0 \n\t" // load (c41,c51) into xmm0 - "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c61,c71) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rdx) \n\t" // store (c41,c51) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c61,c71) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" -
" \n\t" // update c02:c32 - " \n\t" - "vmovupd (%%rcx), %%xmm0 \n\t" // load (c02,c12) into xmm0 - "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c22,c32) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rcx) \n\t" // store (c02,c12) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c22,c32) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c42:c72 - " \n\t" - "vmovupd (%%rdx), %%xmm0 \n\t" // load (c42,c52) into xmm0 - "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c62,c72) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rdx) \n\t" // store (c42,c52) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c62,c72) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c33 - " \n\t" - "vmovupd (%%rcx), %%xmm0 \n\t" // load (c03,c13) into xmm0 - "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c23,c33) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rcx) \n\t" // store (c03,c13) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c23,c33) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c43:c73 - " \n\t" - "vmovupd (%%rdx), %%xmm0 \n\t" // load (c43,c53) into xmm0 - "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c63,c73) into xmm2 - "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2) - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3] - "vmovupd %%xmm0, (%%rdx) \n\t" // store (c43,c53) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c63,c73) - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".ZCOLSTORED: \n\t" - " \n\t" // update c00:c30 - " \n\t" - "vmovupd (%%rcx), %%ymm0 \n\t" // load c00:c30 into ymm0 - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovupd %%ymm0, (%%rcx) \n\t" // store c00:c30 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c40:c70 - " \n\t" - "vmovupd (%%rdx), %%ymm0 \n\t" // load c40:c70 into ymm0 - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovupd %%ymm0, (%%rdx) \n\t" // store c40:c70 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c01:c31 - " \n\t" - "vmovupd (%%rcx), %%ymm0 \n\t" // load c01:c31 into ymm0 - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovupd %%ymm0, (%%rcx) \n\t" // store c01:c31 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c41:c71 - " \n\t" - "vmovupd (%%rdx), %%ymm0 \n\t" // load c41:c71 into ymm0 - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovupd %%ymm0, (%%rdx) \n\t" // store c41:c71 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c32 - " \n\t" - "vmovupd (%%rcx), %%ymm0 \n\t" // load c02:c32 into ymm0 - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovupd %%ymm0, (%%rcx) \n\t" // store c02:c32 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c42:c72 - " \n\t" - "vmovupd (%%rdx), %%ymm0 \n\t" // load c42:c72 into ymm0 - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovupd %%ymm0, (%%rdx) \n\t" // store c42:c72 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c33 - " \n\t" - "vmovupd (%%rcx), %%ymm0 \n\t" // load c03:c33 into ymm0 - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovupd %%ymm0, (%%rcx) \n\t" // store c03:c33 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c43:c73 - " \n\t" - "vmovupd (%%rdx), %%ymm0 \n\t" // load c43:c73 into ymm0 - "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta - "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" - "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" - "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" - "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to 
ymm0 - "vmovupd %%ymm0, (%%rdx) \n\t" // store c43:c73 - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZBETAZERO: \n\t" - " \n\t" - "cmpq $16, %%rsi \n\t" // set ZF if (16*cs_c) == 16. - "jz .ZCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".ZGENSTORBZ: \n\t" - " \n\t" // update c00:c30 - " \n\t" - "vextractf128 $1, %%ymm15, %%xmm2 \n\t" - "vmovupd %%xmm15, (%%rcx) \n\t" // store (c00,c10) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c20,c30) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c40:c70 - " \n\t" - "vextractf128 $1, %%ymm14, %%xmm2 \n\t" - "vmovupd %%xmm14, (%%rdx) \n\t" // store (c40,c50) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c60,c70) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c01:c31 - " \n\t" - "vextractf128 $1, %%ymm13, %%xmm2 \n\t" - "vmovupd %%xmm13, (%%rcx) \n\t" // store (c01,c11) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c21,c31) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c41:c71 - " \n\t" - "vextractf128 $1, %%ymm12, %%xmm2 \n\t" - "vmovupd %%xmm12, (%%rdx) \n\t" // store (c41,c51) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c61,c71) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c02:c32 - " \n\t" - "vextractf128 $1, %%ymm11, %%xmm2 \n\t" - "vmovupd %%xmm11, (%%rcx) \n\t" // store (c02,c12) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c22,c32) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c42:c72 - " \n\t" - "vextractf128 $1, %%ymm10, %%xmm2 \n\t" - "vmovupd %%xmm10, (%%rdx) \n\t" // store (c42,c52) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c62,c72) - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c03:c33 - " \n\t" - "vextractf128 $1, %%ymm9, %%xmm2 \n\t" - "vmovupd %%xmm9, (%%rcx) \n\t" // store (c03,c13) - "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c23,c33) - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" // update c43:c73 - " \n\t" - "vextractf128 $1, %%ymm8, %%xmm2 \n\t" - "vmovupd %%xmm8, (%%rdx) \n\t" // store (c43,c53) - "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c63,c73) - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm15, (%%rcx) \n\t" // store c00:c30 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm14, (%%rdx) \n\t" // store c40:c70 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm13, (%%rcx) \n\t" // store c01:c31 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm12, (%%rdx) \n\t" // store c41:c71 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm11, (%%rcx) \n\t" // store c02:c32 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm10, (%%rdx) \n\t" // store c42:c72 - "addq %%rdi, %%rdx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm9, (%%rcx) \n\t" // store c03:c33 - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - "vmovupd %%ymm8, (%%rdx) \n\t" // store c43:c73 - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".ZDONE: \n\t" - " \n\t" - "vzeroupper \n\t" - " \n\t" - "vzeroupper \n\t" - " \n\t" + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + //mov(%9, r15) // load address of b_next. + //mov(%10, r14) // load address of a_next. 
+ + vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading + vmovddup(mem(rbx, 0+0*32), ymm2) + vmovddup(mem(rbx, 0+1*32), ymm3) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) + lea(mem(, rdi, 2), rdi) + lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; + + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c + prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c + + vxorpd(ymm8, ymm8, ymm8) + vxorpd(ymm9, ymm9, ymm9) + vxorpd(ymm10, ymm10, ymm10) + vxorpd(ymm11, ymm11, ymm11) + vxorpd(ymm12, ymm12, ymm12) + vxorpd(ymm13, ymm13, ymm13) + vxorpd(ymm14, ymm14, ymm14) + vxorpd(ymm15, ymm15, ymm15) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.ZCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.ZLOOPKITER) // MAIN LOOP + + + // iteration 0 + vmovapd(mem(rax, 1*32), ymm1) + vmulpd(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm6, ymm15, ymm15) + vaddpd(ymm7, ymm11, ymm11) + + prefetch(0, mem(rax, 16*32)) + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 8+0*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 8+1*32), ymm3) + vaddpd(ymm6, ymm14, ymm14) + vaddpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vpermilpd(imm(0x5), ymm0, ymm0) + vaddpd(ymm6, ymm13, ymm13) + vaddpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm1, ymm5, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm6, ymm12, ymm12) + vaddpd(ymm7, ymm8, ymm8) + + vpermilpd(imm(0x5), ymm1, ymm1) + vmulpd(ymm0, ymm2, ymm6) + vmulpd(ymm0, ymm3, ymm7) + vaddsubpd(ymm6, ymm15, ymm15) + vaddsubpd(ymm7, ymm11, ymm11) + + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 0+2*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 0+3*32), ymm3) + vaddsubpd(ymm6, ymm14, ymm14) + vaddsubpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 2*32), ymm0) + vaddsubpd(ymm6, ymm13, ymm13) + vaddsubpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddsubpd(ymm6, ymm12, ymm12) + vaddsubpd(ymm7, ymm8, ymm8) + + + // iteration 1 + vmovapd(mem(rax, 3*32), ymm1) + vmulpd(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm6, ymm15, ymm15) + vaddpd(ymm7, ymm11, ymm11) + + prefetch(0, mem(rax, 18*32)) + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 8+2*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 8+3*32), ymm3) + vaddpd(ymm6, ymm14, ymm14) + vaddpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vpermilpd(imm(0x5), ymm0, ymm0) + vaddpd(ymm6, ymm13, ymm13) + vaddpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm1, ymm5, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm6, ymm12, ymm12) + vaddpd(ymm7, ymm8, ymm8) + + vpermilpd(imm(0x5), ymm1, ymm1) + vmulpd(ymm0, ymm2, ymm6) + vmulpd(ymm0, ymm3, ymm7) + vaddsubpd(ymm6, ymm15, ymm15) + vaddsubpd(ymm7, ymm11, ymm11) + + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 0+4*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 0+5*32), ymm3) + vaddsubpd(ymm6, ymm14, ymm14) + vaddsubpd(ymm7, 
ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 4*32), ymm0) + vaddsubpd(ymm6, ymm13, ymm13) + vaddsubpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddsubpd(ymm6, ymm12, ymm12) + vaddsubpd(ymm7, ymm8, ymm8) + + + // iteration 2 + vmovapd(mem(rax, 5*32), ymm1) + vmulpd(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm6, ymm15, ymm15) + vaddpd(ymm7, ymm11, ymm11) + + prefetch(0, mem(rax, 20*32)) + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 8+4*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 8+5*32), ymm3) + vaddpd(ymm6, ymm14, ymm14) + vaddpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vpermilpd(imm(0x5), ymm0, ymm0) + vaddpd(ymm6, ymm13, ymm13) + vaddpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm1, ymm5, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm6, ymm12, ymm12) + vaddpd(ymm7, ymm8, ymm8) + + vpermilpd(imm(0x5), ymm1, ymm1) + vmulpd(ymm0, ymm2, ymm6) + vmulpd(ymm0, ymm3, ymm7) + vaddsubpd(ymm6, ymm15, ymm15) + vaddsubpd(ymm7, ymm11, ymm11) + + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 0+6*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 0+7*32), ymm3) + vaddsubpd(ymm6, ymm14, ymm14) + vaddsubpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 6*32), ymm0) + vaddsubpd(ymm6, ymm13, ymm13) + vaddsubpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddsubpd(ymm6, ymm12, ymm12) + vaddsubpd(ymm7, ymm8, ymm8) + + + // iteration 3 + vmovapd(mem(rax, 7*32), ymm1) + vmulpd(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm6, ymm15, ymm15) + vaddpd(ymm7, ymm11, ymm11) + + prefetch(0, mem(rax, 22*32)) + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 8+6*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 8+7*32), ymm3) + vaddpd(ymm6, ymm14, ymm14) + vaddpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vpermilpd(imm(0x5), ymm0, ymm0) + vaddpd(ymm6, ymm13, ymm13) + vaddpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm1, ymm5, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm6, ymm12, ymm12) + vaddpd(ymm7, ymm8, ymm8) + + vpermilpd(imm(0x5), ymm1, ymm1) + vmulpd(ymm0, ymm2, ymm6) + vmulpd(ymm0, ymm3, ymm7) + vaddsubpd(ymm6, ymm15, ymm15) + vaddsubpd(ymm7, ymm11, ymm11) + + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 0+8*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 0+9*32), ymm3) + vaddsubpd(ymm6, ymm14, ymm14) + vaddsubpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 8*32), ymm0) + vaddsubpd(ymm6, ymm13, ymm13) + vaddsubpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddsubpd(ymm6, ymm12, ymm12) + vaddsubpd(ymm7, ymm8, ymm8) + + + add(imm(4*4*16), rbx) // b += 4*4 (unroll x nr) + add(imm(4*4*16), rax) // a += 4*4 (unroll x mr) + + + dec(rsi) // i -= 1; + jne(.ZLOOPKITER) // iterate again if i != 0. + + + + + + + label(.ZCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+ + + label(.ZLOOPKLEFT) // EDGE LOOP + + // iteration 0 + vmovapd(mem(rax, 1*32), ymm1) + vmulpd(ymm0, ymm2, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm0, ymm3, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm6, ymm15, ymm15) + vaddpd(ymm7, ymm11, ymm11) + + prefetch(0, mem(rax, 16*32)) + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 8+0*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 8+1*32), ymm3) + vaddpd(ymm6, ymm14, ymm14) + vaddpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vpermilpd(imm(0x5), ymm0, ymm0) + vaddpd(ymm6, ymm13, ymm13) + vaddpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vperm2f128(imm(0x3), ymm2, ymm2, ymm4) + vmulpd(ymm1, ymm5, ymm7) + vperm2f128(imm(0x3), ymm3, ymm3, ymm5) + vaddpd(ymm6, ymm12, ymm12) + vaddpd(ymm7, ymm8, ymm8) + + vpermilpd(imm(0x5), ymm1, ymm1) + vmulpd(ymm0, ymm2, ymm6) + vmulpd(ymm0, ymm3, ymm7) + vaddsubpd(ymm6, ymm15, ymm15) + vaddsubpd(ymm7, ymm11, ymm11) + + vmulpd(ymm1, ymm2, ymm6) + vmovddup(mem(rbx, 0+2*32), ymm2) + vmulpd(ymm1, ymm3, ymm7) + vmovddup(mem(rbx, 0+3*32), ymm3) + vaddsubpd(ymm6, ymm14, ymm14) + vaddsubpd(ymm7, ymm10, ymm10) + + vmulpd(ymm0, ymm4, ymm6) + vmulpd(ymm0, ymm5, ymm7) + vmovapd(mem(rax, 2*32), ymm0) + vaddsubpd(ymm6, ymm13, ymm13) + vaddsubpd(ymm7, ymm9, ymm9) + + vmulpd(ymm1, ymm4, ymm6) + vmulpd(ymm1, ymm5, ymm7) + vaddsubpd(ymm6, ymm12, ymm12) + vaddsubpd(ymm7, ymm8, ymm8) + + + add(imm(4*1*16), rax) // a += 4 (1 x mr) + add(imm(4*1*16), rbx) // b += 4 (1 x nr) + + + dec(rsi) // i -= 1; + jne(.ZLOOPKLEFT) // iterate again if i != 0. + + + + label(.ZPOSTACCUM) + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab21 ab20 ab23 ab22 + // ab31 ) ab30 ) ab33 ) ab32 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab40 ( ab41 ( ab42 ( ab43 + // ab50 ab51 ab52 ab53 + // ab61 ab60 ab63 ab62 + // ab71 ) ab70 ) ab73 ) ab72 ) + + + vmovapd(ymm15, ymm7) + vperm2f128(imm(0x12), ymm15, ymm13, ymm15) + vperm2f128(imm(0x30), ymm7, ymm13, ymm13) + + vmovapd(ymm11, ymm7) + vperm2f128(imm(0x12), ymm11, ymm9, ymm11) + vperm2f128(imm(0x30), ymm7, ymm9, ymm9) + + vmovapd(ymm14, ymm7) + vperm2f128(imm(0x12), ymm14, ymm12, ymm14) + vperm2f128(imm(0x30), ymm7, ymm12, ymm12) + + vmovapd(ymm10, ymm7) + vperm2f128(imm(0x12), ymm10, ymm8, ymm10) + vperm2f128(imm(0x30), ymm7, ymm8, ymm8) + + + // ymm15: ymm13: ymm11: ymm9: + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ) ab31 ) ab32 ) ab33 ) + + // ymm14: ymm12: ymm10: ymm8: + // ( ab40 ( ab41 ( ab42 ( ab43 + // ab50 ab51 ab52 ab53 + // ab60 ab61 ab62 ab63 + // ab70 ) ab71 ) ab72 ) ab73 ) + + + // scale by alpha + + mov(%4, rax) // load address of alpha + vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate + vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate + + vpermilpd(imm(0x5), ymm15, ymm3) + vmulpd(ymm7, ymm15, ymm15) + vmulpd(ymm6, ymm3, ymm3) + vaddsubpd(ymm3, ymm15, ymm15) + + vpermilpd(imm(0x5), ymm14, ymm2) + vmulpd(ymm7, ymm14, ymm14) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm14, ymm14) + + vpermilpd(imm(0x5), ymm13, ymm1) + vmulpd(ymm7, ymm13, ymm13) + vmulpd(ymm6, ymm1, ymm1) + vaddsubpd(ymm1, ymm13, ymm13) + + vpermilpd(imm(0x5), ymm12, ymm0) + vmulpd(ymm7, ymm12, ymm12) + vmulpd(ymm6, ymm0, ymm0) + vaddsubpd(ymm0, ymm12, ymm12) + + vpermilpd(imm(0x5), ymm11, ymm3) + vmulpd(ymm7, ymm11, ymm11) + vmulpd(ymm6, ymm3, ymm3) + vaddsubpd(ymm3, ymm11, ymm11) + + vpermilpd(imm(0x5), ymm10, ymm2) + 
vmulpd(ymm7, ymm10, ymm10) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm10, ymm10) + + vpermilpd(imm(0x5), ymm9, ymm1) + vmulpd(ymm7, ymm9, ymm9) + vmulpd(ymm6, ymm1, ymm1) + vaddsubpd(ymm1, ymm9, ymm9) + + vpermilpd(imm(0x5), ymm8, ymm0) + vmulpd(ymm7, ymm8, ymm8) + vmulpd(ymm6, ymm0, ymm0) + vaddsubpd(ymm0, ymm8, ymm8) + + + + mov(%5, rbx) // load address of beta + vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate + vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate + + + + + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) + lea(mem(, rsi, 2), rsi) + lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm7) // set ZF if beta_r == 0. + sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); + vucomisd(xmm0, xmm6) // set ZF if beta_i == 0. + sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); + and(r8b, r9b) // set ZF if r8b & r9b == 1. + jne(.ZBETAZERO) // if ZF = 0, jump to beta == 0 case + + + cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16. + jz(.ZCOLSTORED) // jump to column storage case + + + + label(.ZGENSTORED) + // update c00:c30 + + vmovupd(mem(rcx), xmm0) // load (c00,c10) into xmm0 + vmovupd(mem(rcx, rsi, 1), xmm2) // load (c20,c30) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rcx)) // store (c00,c10) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30) + add(rdi, rcx) // c += cs_c; + + // update c40:c70 + + vmovupd(mem(rdx), xmm0) // load (c40,c50) into xmm0 + vmovupd(mem(rdx, rsi, 1), xmm2) // load (c60,c70) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rdx)) // store (c40,c50) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70) + add(rdi, rdx) // c += cs_c; + + // update c01:c31 + + vmovupd(mem(rcx), xmm0) // load (c01,c11) into xmm0 + vmovupd(mem(rcx, rsi, 1), xmm2) // load (c21,c31) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rcx)) // store (c01,c11) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31) + add(rdi, rcx) // c += cs_c; + + // update c41:c71 + + vmovupd(mem(rdx), xmm0) // load (c41,c51) into xmm0 + vmovupd(mem(rdx, rsi, 1), xmm2) // load (c61,c71) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rdx)) // store (c41,c51) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71) + add(rdi, rdx) // c += cs_c; + +
// update c02:c32 + + vmovupd(mem(rcx), xmm0) // load (c02,c12) into xmm0 + vmovupd(mem(rcx, rsi, 1), xmm2) // load (c22,c32) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rcx)) // store (c02,c12) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32) + add(rdi, rcx) // c += cs_c; + + // update c42:c72 + + vmovupd(mem(rdx), xmm0) // load (c42,c52) into xmm0 + vmovupd(mem(rdx, rsi, 1), xmm2) // load (c62,c72) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rdx)) // store (c42,c52) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72) + add(rdi, rdx) // c += cs_c; + + // update c03:c33 + + vmovupd(mem(rcx), xmm0) // load (c03,c13) into xmm0 + vmovupd(mem(rcx, rsi, 1), xmm2) // load (c23,c33) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rcx)) // store (c03,c13) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33) + add(rdi, rcx) // c += cs_c; + + // update c43:c73 + + vmovupd(mem(rdx), xmm0) // load (c43,c53) into xmm0 + vmovupd(mem(rdx, rsi, 1), xmm2) // load (c63,c73) into xmm2 + vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0 + vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] + vmovupd(xmm0, mem(rdx)) // store (c43,c53) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73) + + + + jmp(.ZDONE) // jump to end. 
+ + + label(.ZCOLSTORED) + // update c00:c30 + + vmovupd(mem(rcx), ymm0) // load c00:c30 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rcx)) // store c00:c30 + add(rdi, rcx) // c += cs_c; + + // update c40:c70 + + vmovupd(mem(rdx), ymm0) // load c40:c70 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rdx)) // store c40:c70 + add(rdi, rdx) // c += cs_c; + + // update c01:c31 + + vmovupd(mem(rcx), ymm0) // load c01:c31 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rcx)) // store c01:c31 + add(rdi, rcx) // c += cs_c; + + // update c41:c71 + + vmovupd(mem(rdx), ymm0) // load c41:c71 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rdx)) // store c41:c71 + add(rdi, rdx) // c += cs_c; + + // update c02:c32 + + vmovupd(mem(rcx), ymm0) // load c02:c32 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rcx)) // store c02:c32 + add(rdi, rcx) // c += cs_c; + + // update c42:c72 + + vmovupd(mem(rdx), ymm0) // load c42:c72 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rdx)) // store c42:c72 + add(rdi, rdx) // c += cs_c; + + // update c03:c33 + + vmovupd(mem(rcx), ymm0) // load c03:c33 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rcx)) // store c03:c33 + add(rdi, rcx) // c += cs_c; + + // update c43:c73 + + vmovupd(mem(rdx), ymm0) // load c43:c73 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rdx)) // store c43:c73 + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZBETAZERO) + + cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16.
+ jz(.ZCOLSTORBZ) // jump to column storage case + + + + label(.ZGENSTORBZ) + // update c00:c30 + + vextractf128(imm(1), ymm15, xmm2) + vmovupd(xmm15, mem(rcx)) // store (c00,c10) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30) + add(rdi, rcx) // c += cs_c; + + // update c40:c70 + + vextractf128(imm(1), ymm14, xmm2) + vmovupd(xmm14, mem(rdx)) // store (c40,c50) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70) + add(rdi, rdx) // c += cs_c; + + // update c01:c31 + + vextractf128(imm(1), ymm13, xmm2) + vmovupd(xmm13, mem(rcx)) // store (c01,c11) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31) + add(rdi, rcx) // c += cs_c; + + // update c41:c71 + + vextractf128(imm(1), ymm12, xmm2) + vmovupd(xmm12, mem(rdx)) // store (c41,c51) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71) + add(rdi, rdx) // c += cs_c; + + // update c02:c32 + + vextractf128(imm(1), ymm11, xmm2) + vmovupd(xmm11, mem(rcx)) // store (c02,c12) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32) + add(rdi, rcx) // c += cs_c; + + // update c42:c72 + + vextractf128(imm(1), ymm10, xmm2) + vmovupd(xmm10, mem(rdx)) // store (c42,c52) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72) + add(rdi, rdx) // c += cs_c; + + // update c03:c33 + + vextractf128(imm(1), ymm9, xmm2) + vmovupd(xmm9, mem(rcx)) // store (c03,c13) + vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33) + add(rdi, rcx) // c += cs_c; + + // update c43:c73 + + vextractf128(imm(1), ymm8, xmm2) + vmovupd(xmm8, mem(rdx)) // store (c43,c53) + vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73) + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZCOLSTORBZ) + + + vmovupd(ymm15, mem(rcx)) // store c00:c30 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm14, mem(rdx)) // store c40:c70 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm13, mem(rcx)) // store c01:c31 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm12, mem(rdx)) // store c41:c71 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm11, mem(rcx)) // store c02:c32 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm10, mem(rdx)) // store c42:c72 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm9, mem(rcx)) // store c03:c33 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm8, mem(rdx)) // store c43:c73 + + + + + + label(.ZDONE) + + vzeroupper() + + : // output operands (none) : // input operands @@ -3509,3 +3512,4 @@ void bli_zgemm_sandybridge_asm_4x4 ); } + diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c index fed8c97e5..54bdb9e1d 100644 --- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c +++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c @@ -34,7 +34,8 @@ #include "blis.h" -#include "bli_avx512_macros.h" +#define BLIS_ASM_SYNTAX_INTEL +#include "bli_x86_asm_macros.h" #define A_L1_PREFETCH_DIST 4 //should be multiple of 2 @@ -305,8 +306,7 @@ void bli_dgemm_skx_asm_16x12_l2( const int64_t rs_c = rs_c_; const int64_t cs_c = cs_c_; - __asm__ volatile - ( + BEGIN_ASM VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers VMOVAPD(YMM( 7), YMM(8)) @@ -525,6 +525,7 @@ void bli_dgemm_skx_asm_16x12_l2( VZEROUPPER() + END_ASM( : // output operands : // input operands [k] "m" (k), @@ -543,5 +544,5 @@ void bli_dgemm_skx_asm_16x12_l2( "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" - ); + ) } diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c index 83b3f1c83..a879ada00 100644 ---
a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c +++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c @@ -34,7 +34,8 @@ #include "blis.h" -#include "bli_avx512_macros.h" +#define BLIS_ASM_SYNTAX_INTEL +#include "bli_x86_asm_macros.h" #define CACHELINE_SIZE 64 //size of cache line in bytes @@ -335,8 +336,7 @@ void bli_sgemm_skx_asm_32x12_l2( const int64_t rs_c = rs_c_; const int64_t cs_c = cs_c_; - __asm__ volatile - ( + BEGIN_ASM VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers VMOVAPD(YMM( 7), YMM(8)) @@ -550,6 +550,7 @@ void bli_sgemm_skx_asm_32x12_l2( VZEROUPPER() + END_ASM( : // output operands : // input operands [k] "m" (k), @@ -568,5 +569,5 @@ void bli_sgemm_skx_asm_32x12_l2( "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" - ); + ) } diff --git a/kernels/zen/3/bli_gemm_zen_asm_d6x8.c b/kernels/zen/3/bli_gemm_zen_asm_d6x8.c index 33a9d20ca..c06ff6f90 100644 --- a/kernels/zen/3/bli_gemm_zen_asm_d6x8.c +++ b/kernels/zen/3/bli_gemm_zen_asm_d6x8.c @@ -35,15 +35,17 @@ #include "blis.h" +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" #define SGEMM_INPUT_GS_BETA_NZ \ - "vmovlps (%%rcx ), %%xmm0, %%xmm0 \n\t" \ - "vmovhps (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ - "vmovlps (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhps (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" \ - "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ - "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ + vmovlps(mem(rcx), xmm0, xmm0) \ + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) \ + vmovlps(mem(rcx, rsi, 2), xmm1, xmm1) \ + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) \ + vshufps(imm(0x88), xmm1, xmm0, xmm0) \ + vmovlps(mem(rcx, rsi, 4), xmm2, xmm2) \ + vmovhps(mem(rcx, r15, 1), xmm2, xmm2) \ /* We can't use vmovhps for loading the last element because that might result in reading beyond valid memory. (vmov[lh]psd load pairs of adjacent floats at a time.) So we need to use vmovss @@ -51,29 +53,29 @@ (ymm3 contains beta and ymm4 through ymm15 contain the microtile) and due to the way vmovss zeros out all bits above 31, we have to load element 7 before element 6.
*/ \ - "vmovss (%%rcx,%%r10 ), %%xmm1 \n\t" \ - "vpermilps $0xcf, %%xmm1, %%xmm1 \n\t" \ - "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ - /*"vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t"*/ \ - "vshufps $0x88, %%xmm1, %%xmm2, %%xmm2 \n\t" \ - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" + vmovss(mem(rcx, r10, 1), xmm1) \ + vpermilps(imm(0xcf), xmm1, xmm1) \ + vmovlps(mem(rcx, r13, 2), xmm1, xmm1) \ + /*vmovhps(mem(rcx, r10, 1), xmm1, xmm1)*/ \ + vshufps(imm(0x88), xmm1, xmm2, xmm2) \ + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) #define SGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" \ - "vmovss %%xmm0, (%%rcx ) \n\t" \ - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%rsi,1) \n\t" \ - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" \ - "vmovss %%xmm0, (%%rcx,%%rsi,2) \n\t" \ - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r13 ) \n\t" \ - "vmovss %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r15 ) \n\t" \ - "vpermilps $0x39, %%xmm1, %%xmm2 \n\t" \ - "vmovss %%xmm2, (%%rcx,%%r13,2) \n\t" \ - "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r10 ) \n\t" + vextractf128(imm(1), ymm0, xmm2) \ + vmovss(xmm0, mem(rcx)) \ + vpermilps(imm(0x39), xmm0, xmm1) \ + vmovss(xmm1, mem(rcx, rsi, 1)) \ + vpermilps(imm(0x39), xmm1, xmm0) \ + vmovss(xmm0, mem(rcx, rsi, 2)) \ + vpermilps(imm(0x39), xmm0, xmm1) \ + vmovss(xmm1, mem(rcx, r13, 1)) \ + vmovss(xmm2, mem(rcx, rsi, 4)) \ + vpermilps(imm(0x39), xmm2, xmm1) \ + vmovss(xmm1, mem(rcx, r15, 1)) \ + vpermilps(imm(0x39), xmm1, xmm2) \ + vmovss(xmm2, mem(rcx, r13, 2)) \ + vpermilps(imm(0x39), xmm2, xmm1) \ + vmovss(xmm1, mem(rcx, r10, 1)) void bli_sgemm_zen_asm_6x16 ( @@ -99,776 +101,776 @@ void bli_sgemm_zen_asm_6x16 __asm__ volatile ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. - " \n\t" - "addq $32 * 4, %%rbx \n\t" - " \n\t" // initialize loop by pre-loading - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %7, %%rdi \n\t" // load rs_c - "leaq (,%%rdi,4), %%rdi \n\t" // rs_c *= sizeof(float) - " \n\t" - "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; - "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*rs_c; - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c - "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c - "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c - "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*rs_c - "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*rs_c - "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. 
- " \n\t" - " \n\t" - ".SLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 64 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastss 6 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 7 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 8 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 10 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 76 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 12 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 14 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 16 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 17 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastss 18 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 19 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 20 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 21 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - 
"vbroadcastss 22 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 23 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 6 * 4, %%rax \n\t" // a += 4*6 (unroll x mr) - "addq $4 * 16 * 4, %%rbx \n\t" // b += 4*16 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".SLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 64 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 6 * 4, %%rax \n\t" // a += 1*6 (unroll x mr) - "addq $1 * 16 * 4, %%rbx \n\t" // b += 1*16 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".SPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate - "vbroadcastss (%%rbx), %%ymm3 \n\t" // load beta and duplicate - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha - "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm7, %%ymm7 \n\t" - "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" - "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" - "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t" - "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %8, %%rsi \n\t" // load cs_c - "leaq (,%%rsi,4), %%rsi \n\t" // rsi = cs_c * sizeof(float) - " \n\t" - "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // load address of c + 8*cs_c; - "leaq (%%rcx,%%rdi,4), %%r14 \n\t" // load address of c + 4*rs_c; - " \n\t" - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; - "leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; - "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomiss %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. - "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. - "jz .SROWSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - "cmpq $4, %%rdi \n\t" // set ZF if (4*cs_c) == 4. - "jz .SCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".SGENSTORED: \n\t" - " \n\t" - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + //mov(%9, r15) // load address of b_next. + + add(imm(32*4), rbx) + // initialize loop by pre-loading + vmovaps(mem(rbx, -4*32), ymm0) + vmovaps(mem(rbx, -3*32), ymm1) + + mov(%6, rcx) // load address of c + mov(%7, rdi) // load rs_c + lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + + lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; + lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c + + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.SLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 64*4)) + + vbroadcastss(mem(rax, 0*4), ymm2) + vbroadcastss(mem(rax, 1*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 2*4), ymm2) + vbroadcastss(mem(rax, 3*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 4*4), ymm2) + vbroadcastss(mem(rax, 5*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, -2*32), ymm0) + vmovaps(mem(rbx, -1*32), ymm1) + + // iteration 1 + vbroadcastss(mem(rax, 6*4), ymm2) + vbroadcastss(mem(rax, 7*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 8*4), ymm2) + vbroadcastss(mem(rax, 9*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 10*4), ymm2) + vbroadcastss(mem(rax, 11*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, 0*32), ymm0) + vmovaps(mem(rbx, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 76*4)) + + vbroadcastss(mem(rax, 12*4), ymm2) + vbroadcastss(mem(rax, 13*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 14*4), ymm2) + vbroadcastss(mem(rax, 15*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 16*4), ymm2) + vbroadcastss(mem(rax, 17*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, 2*32), ymm0) + vmovaps(mem(rbx, 3*32), ymm1) + + // iteration 3 + vbroadcastss(mem(rax, 18*4), ymm2) + vbroadcastss(mem(rax, 19*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 20*4), ymm2) + vbroadcastss(mem(rax, 21*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 22*4), ymm2) + vbroadcastss(mem(rax, 23*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + add(imm(4*6*4), rax) // a += 4*6 (unroll x mr) + add(imm(4*16*4), rbx) // b += 4*16 (unroll x nr) + + vmovaps(mem(rbx, -4*32), ymm0) + vmovaps(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + + + + + + label(.SCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+
+
+ label(.SLOOPKLEFT) // EDGE LOOP
+
+ prefetch(0, mem(rax, 64*4))
+
+ vbroadcastss(mem(rax, 0*4), ymm2)
+ vbroadcastss(mem(rax, 1*4), ymm3)
+ vfmadd231ps(ymm0, ymm2, ymm4)
+ vfmadd231ps(ymm1, ymm2, ymm5)
+ vfmadd231ps(ymm0, ymm3, ymm6)
+ vfmadd231ps(ymm1, ymm3, ymm7)
+
+ vbroadcastss(mem(rax, 2*4), ymm2)
+ vbroadcastss(mem(rax, 3*4), ymm3)
+ vfmadd231ps(ymm0, ymm2, ymm8)
+ vfmadd231ps(ymm1, ymm2, ymm9)
+ vfmadd231ps(ymm0, ymm3, ymm10)
+ vfmadd231ps(ymm1, ymm3, ymm11)
+
+ vbroadcastss(mem(rax, 4*4), ymm2)
+ vbroadcastss(mem(rax, 5*4), ymm3)
+ vfmadd231ps(ymm0, ymm2, ymm12)
+ vfmadd231ps(ymm1, ymm2, ymm13)
+ vfmadd231ps(ymm0, ymm3, ymm14)
+ vfmadd231ps(ymm1, ymm3, ymm15)
+
+ add(imm(1*6*4), rax) // a += 1*6 (unroll x mr)
+ add(imm(1*16*4), rbx) // b += 1*16 (unroll x nr)
+
+ vmovaps(mem(rbx, -4*32), ymm0)
+ vmovaps(mem(rbx, -3*32), ymm1)
+
+
+ dec(rsi) // i -= 1;
+ jne(.SLOOPKLEFT) // iterate again if i != 0.
+
+
+
+ label(.SPOSTACCUM)
+
+
+
+
+ mov(%4, rax) // load address of alpha
+ mov(%5, rbx) // load address of beta
+ vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
+ vbroadcastss(mem(rbx), ymm3) // load beta and duplicate
+
+ vmulps(ymm0, ymm4, ymm4) // scale by alpha
+ vmulps(ymm0, ymm5, ymm5)
+ vmulps(ymm0, ymm6, ymm6)
+ vmulps(ymm0, ymm7, ymm7)
+ vmulps(ymm0, ymm8, ymm8)
+ vmulps(ymm0, ymm9, ymm9)
+ vmulps(ymm0, ymm10, ymm10)
+ vmulps(ymm0, ymm11, ymm11)
+ vmulps(ymm0, ymm12, ymm12)
+ vmulps(ymm0, ymm13, ymm13)
+ vmulps(ymm0, ymm14, ymm14)
+ vmulps(ymm0, ymm15, ymm15)
+
+
+
+
+
+
+ mov(%8, rsi) // load cs_c
+ lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)
+
+ lea(mem(rcx, rsi, 8), rdx) // load address of c + 8*cs_c;
+ lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c;
+
+ lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c;
+ lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c;
+ lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c;
+
+
+ // now avoid loading C if beta == 0
+
+ vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
+ vucomiss(xmm0, xmm3) // set ZF if beta == 0.
+ je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
+
+
+ cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4.
+ jz(.SROWSTORED) // jump to row storage case
+
+
+ cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+ jz(.SCOLSTORED) // jump to column storage case + + + + label(.SGENSTORED) + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm4, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm6, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm6, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm8, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm8, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm10, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm12, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm12, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm14, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 8*cs_c - " \n\t" - " \n\t" + //add(rdi, rcx) // c += rs_c; + + + mov(rdx, rcx) // rcx = c + 8*cs_c + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm5, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm5, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm7, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm7, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm9, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm9, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm11, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm11, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm13, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm13, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm15, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm15, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. 
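
In the general-storage path just converted above, neither rows nor columns of C are contiguous, so the SGEMM_INPUT_GS_BETA_NZ / SGEMM_OUTPUT_GS_BETA_NZ helpers gather and scatter one strided row of C, and each vfmadd213ps folds beta times that row into the alpha-scaled accumulator. In scalar terms, as a sketch with illustrative names:

    // General-stride update for one 8-float group of a row of C:
    // gather, compute beta*c + ab, scatter back at stride cs_c.
    static void gen_stored_row8( float* c, long cs_c, float beta,
                                 const float ab[8] )
    {
        for ( int j = 0; j < 8; ++j )
            c[j*cs_c] = beta * c[j*cs_c] + ab[j];
    }
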
- " \n\t" - " \n\t" - " \n\t" - ".SROWSTORED: \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm4 \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm5 \n\t" - "vmovups %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm6 \n\t" - "vmovups %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm7 \n\t" - "vmovups %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm8 \n\t" - "vmovups %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm9 \n\t" - "vmovups %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm10 \n\t" - "vmovups %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm11 \n\t" - "vmovups %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm12 \n\t" - "vmovups %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm13 \n\t" - "vmovups %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm14 \n\t" - "vmovups %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm15 \n\t" - "vmovups %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vbroadcastss (%%rbx), %%ymm3 \n\t" - " \n\t" - "vunpcklps %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpcklps %%ymm10, %%ymm8, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vfmadd231ps (%%rcx ),%%xmm3, %%xmm0 \n\t" - "vfmadd231ps (%%rcx,%%rsi,4),%%xmm3, %%xmm2 \n\t" - "vmovups %%xmm0, (%%rcx ) \n\t" // store ( gamma00..gamma30 ) - "vmovups %%xmm2, (%%rcx,%%rsi,4) \n\t" // store ( gamma04..gamma34 ) - " \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm2 \n\t" - "vfmadd231ps (%%rcx,%%rsi,1),%%xmm3, %%xmm1 \n\t" - "vfmadd231ps (%%rcx,%%r15 ),%%xmm3, %%xmm2 \n\t" - "vmovups %%xmm1, (%%rcx,%%rsi,1) \n\t" // store ( gamma01..gamma31 ) - "vmovups %%xmm2, (%%rcx,%%r15 ) \n\t" // store ( gamma05..gamma35 ) - " \n\t" - " \n\t" - "vunpckhps %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpckhps %%ymm10, %%ymm8, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vfmadd231ps (%%rcx,%%rsi,2),%%xmm3, %%xmm0 \n\t" - "vfmadd231ps (%%rcx,%%r13,2),%%xmm3, %%xmm2 \n\t" - "vmovups %%xmm0, (%%rcx,%%rsi,2) \n\t" // store ( gamma02..gamma32 ) - "vmovups %%xmm2, (%%rcx,%%r13,2) \n\t" // store ( gamma06..gamma36 ) - " \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm2 \n\t" - "vfmadd231ps (%%rcx,%%r13 ),%%xmm3, %%xmm1 \n\t" - "vfmadd231ps (%%rcx,%%r10 ),%%xmm3, %%xmm2 \n\t" - "vmovups %%xmm1, (%%rcx,%%r13 ) \n\t" // store ( gamma03..gamma33 ) - "vmovups %%xmm2, (%%rcx,%%r10 ) \n\t" // store ( gamma07..gamma37 ) - " \n\t" - "leaq (%%rcx,%%rsi,8), %%rcx \n\t" // rcx += 8*cs_c - " \n\t" - "vunpcklps %%ymm14, %%ymm12, %%ymm0 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovlpd (%%r14 ), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%r14,%%rsi,1), %%xmm1, %%xmm1 \n\t" - "vfmadd231ps 
%%xmm1, %%xmm3, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%r14 ) \n\t" // store ( gamma40..gamma50 ) - "vmovhpd %%xmm0, (%%r14,%%rsi,1) \n\t" // store ( gamma41..gamma51 ) - "vmovlpd (%%r14,%%rsi,4), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%r14,%%r15 ), %%xmm1, %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm3, %%xmm2 \n\t" - "vmovlpd %%xmm2, (%%r14,%%rsi,4) \n\t" // store ( gamma44..gamma54 ) - "vmovhpd %%xmm2, (%%r14,%%r15 ) \n\t" // store ( gamma45..gamma55 ) - " \n\t" - "vunpckhps %%ymm14, %%ymm12, %%ymm0 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovlpd (%%r14,%%rsi,2), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%r14,%%r13 ), %%xmm1, %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm3, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%r14,%%rsi,2) \n\t" // store ( gamma42..gamma52 ) - "vmovhpd %%xmm0, (%%r14,%%r13 ) \n\t" // store ( gamma43..gamma53 ) - "vmovlpd (%%r14,%%r13,2), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%r14,%%r10 ), %%xmm1, %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm3, %%xmm2 \n\t" - "vmovlpd %%xmm2, (%%r14,%%r13,2) \n\t" // store ( gamma46..gamma56 ) - "vmovhpd %%xmm2, (%%r14,%%r10 ) \n\t" // store ( gamma47..gamma57 ) - " \n\t" - "leaq (%%r14,%%rsi,8), %%r14 \n\t" // r14 += 8*cs_c - " \n\t" - " \n\t" - " \n\t" - "vunpcklps %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpcklps %%ymm11, %%ymm9, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vfmadd231ps (%%rcx ),%%xmm3, %%xmm0 \n\t" - "vfmadd231ps (%%rcx,%%rsi,4),%%xmm3, %%xmm2 \n\t" - "vmovups %%xmm0, (%%rcx ) \n\t" // store ( gamma00..gamma30 ) - "vmovups %%xmm2, (%%rcx,%%rsi,4) \n\t" // store ( gamma04..gamma34 ) - " \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm2 \n\t" - "vfmadd231ps (%%rcx,%%rsi,1),%%xmm3, %%xmm1 \n\t" - "vfmadd231ps (%%rcx,%%r15 ),%%xmm3, %%xmm2 \n\t" - "vmovups %%xmm1, (%%rcx,%%rsi,1) \n\t" // store ( gamma01..gamma31 ) - "vmovups %%xmm2, (%%rcx,%%r15 ) \n\t" // store ( gamma05..gamma35 ) - " \n\t" - " \n\t" - "vunpckhps %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpckhps %%ymm11, %%ymm9, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vfmadd231ps (%%rcx,%%rsi,2),%%xmm3, %%xmm0 \n\t" - "vfmadd231ps (%%rcx,%%r13,2),%%xmm3, %%xmm2 \n\t" - "vmovups %%xmm0, (%%rcx,%%rsi,2) \n\t" // store ( gamma02..gamma32 ) - "vmovups %%xmm2, (%%rcx,%%r13,2) \n\t" // store ( gamma06..gamma36 ) - " \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm2 \n\t" - "vfmadd231ps (%%rcx,%%r13 ),%%xmm3, %%xmm1 \n\t" - "vfmadd231ps (%%rcx,%%r10 ),%%xmm3, %%xmm2 \n\t" - "vmovups %%xmm1, (%%rcx,%%r13 ) \n\t" // store ( gamma03..gamma33 ) - "vmovups %%xmm2, (%%rcx,%%r10 ) \n\t" // store ( gamma07..gamma37 ) - " \n\t" - //"leaq (%%rcx,%%rsi,8), %%rcx \n\t" // rcx += 8*cs_c - " \n\t" - "vunpcklps %%ymm15, %%ymm13, %%ymm0 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovlpd (%%r14 ), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%r14,%%rsi,1), %%xmm1, %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm3, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%r14 ) \n\t" // store ( gamma40..gamma50 ) - "vmovhpd %%xmm0, (%%r14,%%rsi,1) \n\t" // store ( gamma41..gamma51 ) - "vmovlpd (%%r14,%%rsi,4), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%r14,%%r15 ), %%xmm1, %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm3, %%xmm2 \n\t" - "vmovlpd %%xmm2, (%%r14,%%rsi,4) \n\t" // store ( gamma44..gamma54 ) - "vmovhpd %%xmm2, (%%r14,%%r15 ) \n\t" // store ( 
gamma45..gamma55 ) - " \n\t" - "vunpckhps %%ymm15, %%ymm13, %%ymm0 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovlpd (%%r14,%%rsi,2), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%r14,%%r13 ), %%xmm1, %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm3, %%xmm0 \n\t" - "vmovlpd %%xmm0, (%%r14,%%rsi,2) \n\t" // store ( gamma42..gamma52 ) - "vmovhpd %%xmm0, (%%r14,%%r13 ) \n\t" // store ( gamma43..gamma53 ) - "vmovlpd (%%r14,%%r13,2), %%xmm1, %%xmm1 \n\t" - "vmovhpd (%%r14,%%r10 ), %%xmm1, %%xmm1 \n\t" - "vfmadd231ps %%xmm1, %%xmm3, %%xmm2 \n\t" - "vmovlpd %%xmm2, (%%r14,%%r13,2) \n\t" // store ( gamma46..gamma56 ) - "vmovhpd %%xmm2, (%%r14,%%r10 ) \n\t" // store ( gamma47..gamma57 ) - " \n\t" - //"leaq (%%r14,%%rsi,8), %%r14 \n\t" // r14 += 8*cs_c - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SBETAZERO: \n\t" - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. - "jz .SROWSTORBZ \n\t" // jump to row storage case - " \n\t" - "cmpq $4, %%rdi \n\t" // set ZF if (4*cs_c) == 4. - "jz .SCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".SGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm4, %%ymm0 \n\t" + //add(rdi, rcx) // c += rs_c; + + + + jmp(.SDONE) // jump to end. + + + + label(.SROWSTORED) + + + vfmadd231ps(mem(rcx), ymm3, ymm4) + vmovups(ymm4, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm5) + vmovups(ymm5, mem(rdx)) + add(rdi, rdx) + + + vfmadd231ps(mem(rcx), ymm3, ymm6) + vmovups(ymm6, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm7) + vmovups(ymm7, mem(rdx)) + add(rdi, rdx) + + + vfmadd231ps(mem(rcx), ymm3, ymm8) + vmovups(ymm8, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm9) + vmovups(ymm9, mem(rdx)) + add(rdi, rdx) + + + vfmadd231ps(mem(rcx), ymm3, ymm10) + vmovups(ymm10, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm11) + vmovups(ymm11, mem(rdx)) + add(rdi, rdx) + + + vfmadd231ps(mem(rcx), ymm3, ymm12) + vmovups(ymm12, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm13) + vmovups(ymm13, mem(rdx)) + add(rdi, rdx) + + + vfmadd231ps(mem(rcx), ymm3, ymm14) + vmovups(ymm14, mem(rcx)) + //add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm15) + vmovups(ymm15, mem(rdx)) + //add(rdi, rdx) + + + + jmp(.SDONE) // jump to end. 
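
The row-stored case above fires when cs_c == 1, so each 16-float row of C is contiguous and is updated as two unaligned ymm halves (rcx walking the low half, rdx the half at c + 8*cs_c). A scalar sketch of the update, with illustrative names; ab is the accumulator, already scaled by alpha at .SPOSTACCUM:

    // Row-stored beta update: c := beta*c + ab, row by row.
    static void row_stored_6x16( float* c, long rs_c, float beta,
                                 const float ab[6][16] )
    {
        for ( int i = 0; i < 6; ++i )
            for ( int j = 0; j < 16; ++j )
                c[i*rs_c + j] = beta * c[i*rs_c + j] + ab[i][j];
    }
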
+ + + + label(.SCOLSTORED) + + + vbroadcastss(mem(rbx), ymm3) + + vunpcklps(ymm6, ymm4, ymm0) + vunpcklps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) + vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) + + + vunpckhps(ymm6, ymm4, ymm0) + vunpckhps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) + vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2) + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) + + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1) + vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2) + vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) + + lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + + vunpcklps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(mem(r14), xmm1, xmm1) + vmovhpd(mem(r14, rsi, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm0) + vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) + vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) + vmovlpd(mem(r14, rsi, 4), xmm1, xmm1) + vmovhpd(mem(r14, r15, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) + vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) + + vunpckhps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(mem(r14, rsi, 2), xmm1, xmm1) + vmovhpd(mem(r14, r13, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm0) + vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) + vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) + vmovlpd(mem(r14, r13, 2), xmm1, xmm1) + vmovhpd(mem(r14, r10, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) + vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) + + lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c + + + + vunpcklps(ymm7, ymm5, ymm0) + vunpcklps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) + vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) + + + vunpckhps(ymm7, ymm5, ymm0) + vunpckhps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx, rsi, 2), xmm3, 
xmm0)
+ vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2)
+ vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 )
+ vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 )
+
+ vextractf128(imm(0x1), ymm1, xmm2)
+ vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1)
+ vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2)
+ vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 )
+ vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 )
+
+ //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c
+
+ vunpcklps(ymm15, ymm13, ymm0)
+ vextractf128(imm(0x1), ymm0, xmm2)
+ vmovlpd(mem(r14), xmm1, xmm1)
+ vmovhpd(mem(r14, rsi, 1), xmm1, xmm1)
+ vfmadd231ps(xmm1, xmm3, xmm0)
+ vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 )
+ vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 )
+ vmovlpd(mem(r14, rsi, 4), xmm1, xmm1)
+ vmovhpd(mem(r14, r15, 1), xmm1, xmm1)
+ vfmadd231ps(xmm1, xmm3, xmm2)
+ vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 )
+ vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 )
+
+ vunpckhps(ymm15, ymm13, ymm0)
+ vextractf128(imm(0x1), ymm0, xmm2)
+ vmovlpd(mem(r14, rsi, 2), xmm1, xmm1)
+ vmovhpd(mem(r14, r13, 1), xmm1, xmm1)
+ vfmadd231ps(xmm1, xmm3, xmm0)
+ vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 )
+ vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 )
+ vmovlpd(mem(r14, r13, 2), xmm1, xmm1)
+ vmovhpd(mem(r14, r10, 1), xmm1, xmm1)
+ vfmadd231ps(xmm1, xmm3, xmm2)
+ vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 )
+ vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 )
+
+ //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c
+
+
+
+ jmp(.SDONE) // jump to end.
+
+
+
+ label(.SBETAZERO)
+
+ cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4.
+ jz(.SROWSTORBZ) // jump to row storage case
+
+ cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
+ jz(.SCOLSTORBZ) // jump to column storage case + + + + label(.SGENSTORBZ) + + + vmovaps(ymm4, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm6, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovaps(ymm6, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm8, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovaps(ymm8, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm10, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovaps(ymm10, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm12, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovaps(ymm12, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm14, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovaps(ymm14, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 8*cs_c - " \n\t" - " \n\t" - "vmovaps %%ymm5, %%ymm0 \n\t" + //add(rdi, rcx) // c += rs_c; + + + mov(rdx, rcx) // rcx = c + 8*cs_c + + + vmovaps(ymm5, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm7, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovaps(ymm7, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm9, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovaps(ymm9, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm11, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovaps(ymm11, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm13, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovaps(ymm13, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm15, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovaps(ymm15, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SROWSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovups %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vunpcklps %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpcklps %%ymm10, %%ymm8, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovups %%xmm0, (%%rcx ) \n\t" // store ( gamma00..gamma30 ) - "vmovups %%xmm2, (%%rcx,%%rsi,4) \n\t" // store ( gamma04..gamma34 ) - " \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm2 \n\t" - "vmovups %%xmm1, (%%rcx,%%rsi,1) \n\t" // store ( gamma01..gamma31 ) - "vmovups %%xmm2, (%%rcx,%%r15 ) \n\t" // store ( gamma05..gamma35 ) - " \n\t" - " \n\t" - "vunpckhps %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpckhps %%ymm10, %%ymm8, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovups %%xmm0, (%%rcx,%%rsi,2) \n\t" // store ( gamma02..gamma32 ) - "vmovups %%xmm2, (%%rcx,%%r13,2) \n\t" // store ( gamma06..gamma36 ) - " \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm2 \n\t" - "vmovups %%xmm1, (%%rcx,%%r13 ) \n\t" // store ( gamma03..gamma33 ) - "vmovups %%xmm2, (%%rcx,%%r10 ) \n\t" // store ( gamma07..gamma37 ) - " \n\t" - "leaq (%%rcx,%%rsi,8), %%rcx \n\t" // rcx += 8*cs_c - " \n\t" - "vunpcklps %%ymm14, %%ymm12, %%ymm0 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovlpd %%xmm0, (%%r14 ) \n\t" // store ( gamma40..gamma50 ) - "vmovhpd %%xmm0, (%%r14,%%rsi,1) \n\t" // store ( gamma41..gamma51 ) - "vmovlpd %%xmm2, (%%r14,%%rsi,4) \n\t" // store ( gamma44..gamma54 ) - "vmovhpd %%xmm2, (%%r14,%%r15 ) \n\t" // store ( gamma45..gamma55 ) - " \n\t" - "vunpckhps %%ymm14, %%ymm12, %%ymm0 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovlpd %%xmm0, (%%r14,%%rsi,2) \n\t" // store ( gamma42..gamma52 ) - "vmovhpd %%xmm0, (%%r14,%%r13 ) \n\t" // store ( gamma43..gamma53 ) - "vmovlpd %%xmm2, (%%r14,%%r13,2) \n\t" // store ( gamma46..gamma56 ) - "vmovhpd %%xmm2, (%%r14,%%r10 ) \n\t" // store ( gamma47..gamma57 ) - " \n\t" - "leaq (%%r14,%%rsi,8), %%r14 \n\t" // r14 += 8*cs_c - " \n\t" - " \n\t" - " \n\t" - "vunpcklps %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpcklps %%ymm11, %%ymm9, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovups %%xmm0, (%%rcx ) \n\t" // store ( gamma00..gamma30 ) - "vmovups %%xmm2, (%%rcx,%%rsi,4) \n\t" // store ( gamma04..gamma34 ) - " \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm2 \n\t" - "vmovups %%xmm1, (%%rcx,%%rsi,1) \n\t" // store ( gamma01..gamma31 ) - "vmovups %%xmm2, (%%rcx,%%r15 ) \n\t" // store ( gamma05..gamma35 ) - " \n\t" - " \n\t" - "vunpckhps %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpckhps %%ymm11, %%ymm9, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovups %%xmm0, (%%rcx,%%rsi,2) \n\t" // store ( gamma02..gamma32 ) - "vmovups %%xmm2, (%%rcx,%%r13,2) \n\t" // store ( gamma06..gamma36 ) - " \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm2 \n\t" - "vmovups %%xmm1, (%%rcx,%%r13 ) \n\t" // store ( gamma03..gamma33 ) - "vmovups %%xmm2, (%%rcx,%%r10 ) \n\t" // store ( gamma07..gamma37 ) - " \n\t" - //"leaq (%%rcx,%%rsi,8), %%rcx \n\t" // rcx += 8*cs_c - " \n\t" - 
"vunpcklps %%ymm15, %%ymm13, %%ymm0 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovlpd %%xmm0, (%%r14 ) \n\t" // store ( gamma40..gamma50 ) - "vmovhpd %%xmm0, (%%r14,%%rsi,1) \n\t" // store ( gamma41..gamma51 ) - "vmovlpd %%xmm2, (%%r14,%%rsi,4) \n\t" // store ( gamma44..gamma54 ) - "vmovhpd %%xmm2, (%%r14,%%r15 ) \n\t" // store ( gamma45..gamma55 ) - " \n\t" - "vunpckhps %%ymm15, %%ymm13, %%ymm0 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vmovlpd %%xmm0, (%%r14,%%rsi,2) \n\t" // store ( gamma42..gamma52 ) - "vmovhpd %%xmm0, (%%r14,%%r13 ) \n\t" // store ( gamma43..gamma53 ) - "vmovlpd %%xmm2, (%%r14,%%r13,2) \n\t" // store ( gamma46..gamma56 ) - "vmovhpd %%xmm2, (%%r14,%%r10 ) \n\t" // store ( gamma47..gamma57 ) - " \n\t" - //"leaq (%%r14,%%rsi,8), %%r14 \n\t" // r14 += 8*cs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SDONE: \n\t" - " \n\t" - " \n\t" + //add(rdi, rcx) // c += rs_c; + + + + jmp(.SDONE) // jump to end. + + + + label(.SROWSTORBZ) + + + vmovups(ymm4, mem(rcx)) + add(rdi, rcx) + vmovups(ymm5, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm6, mem(rcx)) + add(rdi, rcx) + vmovups(ymm7, mem(rdx)) + add(rdi, rdx) + + + vmovups(ymm8, mem(rcx)) + add(rdi, rcx) + vmovups(ymm9, mem(rdx)) + add(rdi, rdx) + + + vmovups(ymm10, mem(rcx)) + add(rdi, rcx) + vmovups(ymm11, mem(rdx)) + add(rdi, rdx) + + + vmovups(ymm12, mem(rcx)) + add(rdi, rcx) + vmovups(ymm13, mem(rdx)) + add(rdi, rdx) + + + vmovups(ymm14, mem(rcx)) + //add(rdi, rcx) + vmovups(ymm15, mem(rdx)) + //add(rdi, rdx) + + + + jmp(.SDONE) // jump to end. + + + + label(.SCOLSTORBZ) + + + vunpcklps(ymm6, ymm4, ymm0) + vunpcklps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) + + + vunpckhps(ymm6, ymm4, ymm0) + vunpckhps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) + + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) + + lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + + vunpcklps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) + vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) + vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) + vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) + + vunpckhps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) + vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) + vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) + vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) + + lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c + + + + vunpcklps(ymm7, ymm5, ymm0) + vunpcklps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + 
vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) + + + vunpckhps(ymm7, ymm5, ymm0) + vunpckhps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) + + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) + + //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + + vunpcklps(ymm15, ymm13, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) + vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) + vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) + vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) + + vunpckhps(ymm15, ymm13, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) + vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) + vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) + vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) + + //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c + + + + + + label(.SDONE) + + : // output operands (none) : // input operands @@ -898,28 +900,28 @@ void bli_sgemm_zen_asm_6x16 #define DGEMM_INPUT_GS_BETA_NZ \ - "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ - "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ - "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\ - "vmovlpd (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ - "vmovhpd (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ - "vmovlpd (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhpd (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ + vmovlpd(mem(rcx), xmm0, xmm0) \ + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ + vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) \ + vmovhpd(mem(rcx, r13, 1), xmm1, xmm1) \ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0) /*\ + vmovlpd(mem(rcx, rsi, 4), xmm2, xmm2) \ + vmovhpd(mem(rcx, r15, 1), xmm2, xmm2) \ + vmovlpd(mem(rcx, r13, 2), xmm1, xmm1) \ + vmovhpd(mem(rcx, r10, 1), xmm1, xmm1) \ + vperm2f128(imm(0x20), ymm1, ymm2, ymm2)*/ #define DGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ - "vmovlpd %%xmm0, (%%rcx ) \n\t" \ - "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ - "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ - "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ + vextractf128(imm(1), ymm0, xmm1) \ + vmovlpd(xmm0, mem(rcx)) \ + vmovhpd(xmm0, mem(rcx, rsi, 1)) \ + vmovlpd(xmm1, mem(rcx, rsi, 2)) \ + vmovhpd(xmm1, mem(rcx, r13, 1)) /*\ + vextractf128(imm(1), ymm2, xmm1) \ + vmovlpd(xmm2, mem(rcx, rsi, 4)) \ + vmovhpd(xmm2, mem(rcx, r15, 1)) \ + vmovlpd(xmm1, mem(rcx, r13, 2)) \ + vmovhpd(xmm1, mem(rcx, r10, 1))*/ void bli_dgemm_zen_asm_6x8 ( @@ -945,669 +947,669 @@ void 
bli_dgemm_zen_asm_6x8 __asm__ volatile ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. - " \n\t" - "addq $32 * 4, %%rbx \n\t" - " \n\t" // initialize loop by pre-loading - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %7, %%rdi \n\t" // load rs_c - "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(double) - " \n\t" - "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; - "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*rs_c; - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c - "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c - "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c - "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*rs_c - "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*rs_c - "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 64 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd -2 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 0 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 76 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd 
%%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 2 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd 3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) - "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 64 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) - "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate - "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha - "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" - "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" - "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" - "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" - "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %8, %%rsi \n\t" // load cs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(double) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*cs_c; - "leaq (%%rcx,%%rdi,4), %%r14 \n\t" // load address of c + 4*rs_c; - " \n\t" - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; - //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; - //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. - "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "jz .DROWSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - "cmpq $8, %%rdi \n\t" // set ZF if (8*rs_c) == 8. - "jz .DCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORED: \n\t" - " \n\t" - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + //mov(%9, r15) // load address of b_next. + + add(imm(32*4), rbx) + // initialize loop by pre-loading + vmovapd(mem(rbx, -4*32), ymm0) + vmovapd(mem(rbx, -3*32), ymm1) + + mov(%6, rcx) // load address of c + mov(%7, rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + + lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; + lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c + + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 64*8)) + + vbroadcastsd(mem(rax, 0*8), ymm2) + vbroadcastsd(mem(rax, 1*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 2*8), ymm2) + vbroadcastsd(mem(rax, 3*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 4*8), ymm2) + vbroadcastsd(mem(rax, 5*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, -2*32), ymm0) + vmovapd(mem(rbx, -1*32), ymm1) + + // iteration 1 + vbroadcastsd(mem(rax, 6*8), ymm2) + vbroadcastsd(mem(rax, 7*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 8*8), ymm2) + vbroadcastsd(mem(rax, 9*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 10*8), ymm2) + vbroadcastsd(mem(rax, 11*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, 0*32), ymm0) + vmovapd(mem(rbx, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 76*8)) + + vbroadcastsd(mem(rax, 12*8), ymm2) + vbroadcastsd(mem(rax, 13*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 14*8), ymm2) + vbroadcastsd(mem(rax, 15*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 16*8), ymm2) + vbroadcastsd(mem(rax, 17*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, 2*32), ymm0) + vmovapd(mem(rbx, 3*32), ymm1) + + // iteration 3 + vbroadcastsd(mem(rax, 18*8), ymm2) + vbroadcastsd(mem(rax, 19*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 20*8), ymm2) + vbroadcastsd(mem(rax, 21*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 22*8), ymm2) + vbroadcastsd(mem(rax, 23*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + add(imm(4*6*8), rax) // a += 4*6 (unroll x mr) + add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) + + vmovapd(mem(rbx, -4*32), ymm0) + vmovapd(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+ + + label(.DLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rax, 64*8)) + + vbroadcastsd(mem(rax, 0*8), ymm2) + vbroadcastsd(mem(rax, 1*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 2*8), ymm2) + vbroadcastsd(mem(rax, 3*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 4*8), ymm2) + vbroadcastsd(mem(rax, 5*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + add(imm(1*6*8), rax) // a += 1*6 (unroll x mr) + add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr) + + vmovapd(mem(rbx, -4*32), ymm0) + vmovapd(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + + mov(%4, rax) // load address of alpha + mov(%5, rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm7, ymm7) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm9, ymm9) + vmulpd(ymm0, ymm10, ymm10) + vmulpd(ymm0, ymm11, ymm11) + vmulpd(ymm0, ymm12, ymm12) + vmulpd(ymm0, ymm13, ymm13) + vmulpd(ymm0, ymm14, ymm14) + vmulpd(ymm0, ymm15, ymm15) + + + + + + + mov(%8, rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) + + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; + lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; + //lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; + //lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm3) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. + jz(.DROWSTORED) // jump to row storage case + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DGENSTORED) + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm4, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm6, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm8, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm10, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm12, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm14, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c - " \n\t" - " \n\t" + + + mov(rdx, rcx) // rcx = c + 4*cs_c + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm5, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm7, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm9, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm11, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm13, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += rs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm15, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".DROWSTORED: \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t" - "vmovupd %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t" - "vmovupd %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t" - "vmovupd %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t" - "vmovupd %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t" - "vmovupd %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t" - "vmovupd %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t" - "vmovupd %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t" - "vmovupd %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t" - "vmovupd %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t" - "vmovupd %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t" - "vmovupd %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t" - "vmovupd %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vunpcklpd %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpckhpd %%ymm6, %%ymm4, %%ymm1 \n\t" - "vunpcklpd %%ymm10, %%ymm8, %%ymm2 \n\t" - "vunpckhpd %%ymm10, %%ymm8, %%ymm3 \n\t" - "vinsertf128 $0x1, %%xmm2, %%ymm0, %%ymm4 \n\t" - "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm6 \n\t" - "vperm2f128 $0x31, %%ymm2, %%ymm0, %%ymm8 \n\t" - "vperm2f128 $0x31, %%ymm3, %%ymm1, %%ymm10 \n\t" - " \n\t" - "vbroadcastsd (%%rbx), %%ymm3 \n\t" - " \n\t" - "vfmadd231pd (%%rcx ),%%ymm3, %%ymm4 \n\t" - "vfmadd231pd (%%rcx,%%rsi ),%%ymm3, %%ymm6 \n\t" - "vfmadd231pd (%%rcx,%%rsi,2),%%ymm3, %%ymm8 \n\t" - "vfmadd231pd (%%rcx,%%r13 ),%%ymm3, %%ymm10 \n\t" - "vmovupd %%ymm4, (%%rcx ) \n\t" - "vmovupd %%ymm6, (%%rcx,%%rsi ) \n\t" - "vmovupd %%ymm8, (%%rcx,%%rsi,2) \n\t" - "vmovupd %%ymm10, (%%rcx,%%r13 ) \n\t" - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" - " \n\t" - "vunpcklpd %%ymm14, %%ymm12, %%ymm0 \n\t" - "vunpckhpd %%ymm14, %%ymm12, %%ymm1 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm4 \n\t" - " \n\t" - "vfmadd231pd (%%r14 ),%%xmm3, %%xmm0 \n\t" - "vfmadd231pd (%%r14,%%rsi ),%%xmm3, %%xmm1 \n\t" - "vfmadd231pd (%%r14,%%rsi,2),%%xmm3, %%xmm2 \n\t" - "vfmadd231pd (%%r14,%%r13 ),%%xmm3, %%xmm4 \n\t" - "vmovupd %%xmm0, (%%r14 ) \n\t" - "vmovupd %%xmm1, (%%r14,%%rsi ) \n\t" - "vmovupd %%xmm2, (%%r14,%%rsi,2) \n\t" - "vmovupd %%xmm4, (%%r14,%%r13 ) \n\t" - " \n\t" - "leaq (%%r14,%%rsi,4), %%r14 \n\t" - " \n\t" - " \n\t" - "vunpcklpd %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpckhpd %%ymm7, %%ymm5, %%ymm1 \n\t" - "vunpcklpd %%ymm11, %%ymm9, %%ymm2 \n\t" - "vunpckhpd %%ymm11, %%ymm9, %%ymm3 \n\t" - "vinsertf128 $0x1, %%xmm2, %%ymm0, %%ymm5 \n\t" - "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm7 \n\t" - "vperm2f128 $0x31, %%ymm2, %%ymm0, %%ymm9 \n\t" - "vperm2f128 $0x31, %%ymm3, %%ymm1, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd (%%rbx), %%ymm3 \n\t" - " \n\t" - "vfmadd231pd (%%rcx ),%%ymm3, %%ymm5 \n\t" - "vfmadd231pd (%%rcx,%%rsi ),%%ymm3, %%ymm7 
\n\t" - "vfmadd231pd (%%rcx,%%rsi,2),%%ymm3, %%ymm9 \n\t" - "vfmadd231pd (%%rcx,%%r13 ),%%ymm3, %%ymm11 \n\t" - "vmovupd %%ymm5, (%%rcx ) \n\t" - "vmovupd %%ymm7, (%%rcx,%%rsi ) \n\t" - "vmovupd %%ymm9, (%%rcx,%%rsi,2) \n\t" - "vmovupd %%ymm11, (%%rcx,%%r13 ) \n\t" - " \n\t" - //"leaq (%%rcx,%%rsi,4), %%rcx \n\t" - " \n\t" - "vunpcklpd %%ymm15, %%ymm13, %%ymm0 \n\t" - "vunpckhpd %%ymm15, %%ymm13, %%ymm1 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm4 \n\t" - " \n\t" - "vfmadd231pd (%%r14 ),%%xmm3, %%xmm0 \n\t" - "vfmadd231pd (%%r14,%%rsi ),%%xmm3, %%xmm1 \n\t" - "vfmadd231pd (%%r14,%%rsi,2),%%xmm3, %%xmm2 \n\t" - "vfmadd231pd (%%r14,%%r13 ),%%xmm3, %%xmm4 \n\t" - "vmovupd %%xmm0, (%%r14 ) \n\t" - "vmovupd %%xmm1, (%%r14,%%rsi ) \n\t" - "vmovupd %%xmm2, (%%r14,%%rsi,2) \n\t" - "vmovupd %%xmm4, (%%r14,%%r13 ) \n\t" - " \n\t" - //"leaq (%%r14,%%rsi,4), %%r14 \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DBETAZERO: \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "jz .DROWSTORBZ \n\t" // jump to row storage case - " \n\t" - "cmpq $8, %%rdi \n\t" // set ZF if (8*rs_c) == 8. - "jz .DCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm4, %%ymm0 \n\t" + + + + jmp(.DDONE) // jump to end. + + + + label(.DROWSTORED) + + + vfmadd231pd(mem(rcx), ymm3, ymm4) + vmovupd(ymm4, mem(rcx)) + add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm5) + vmovupd(ymm5, mem(rdx)) + add(rdi, rdx) + + + vfmadd231pd(mem(rcx), ymm3, ymm6) + vmovupd(ymm6, mem(rcx)) + add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm7) + vmovupd(ymm7, mem(rdx)) + add(rdi, rdx) + + + vfmadd231pd(mem(rcx), ymm3, ymm8) + vmovupd(ymm8, mem(rcx)) + add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm9) + vmovupd(ymm9, mem(rdx)) + add(rdi, rdx) + + + vfmadd231pd(mem(rcx), ymm3, ymm10) + vmovupd(ymm10, mem(rcx)) + add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm11) + vmovupd(ymm11, mem(rdx)) + add(rdi, rdx) + + + vfmadd231pd(mem(rcx), ymm3, ymm12) + vmovupd(ymm12, mem(rcx)) + add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm13) + vmovupd(ymm13, mem(rdx)) + add(rdi, rdx) + + + vfmadd231pd(mem(rcx), ymm3, ymm14) + vmovupd(ymm14, mem(rcx)) + //add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm15) + vmovupd(ymm15, mem(rdx)) + //add(rdi, rdx) + + + + jmp(.DDONE) // jump to end. 
+ + + + label(.DCOLSTORED) + + + vunpcklpd(ymm6, ymm4, ymm0) + vunpckhpd(ymm6, ymm4, ymm1) + vunpcklpd(ymm10, ymm8, ymm2) + vunpckhpd(ymm10, ymm8, ymm3) + vinsertf128(imm(0x1), xmm2, ymm0, ymm4) + vinsertf128(imm(0x1), xmm3, ymm1, ymm6) + vperm2f128(imm(0x31), ymm2, ymm0, ymm8) + vperm2f128(imm(0x31), ymm3, ymm1, ymm10) + + vbroadcastsd(mem(rbx), ymm3) + + vfmadd231pd(mem(rcx), ymm3, ymm4) + vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) + vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) + vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm10) + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm6, mem(rcx, rsi, 1)) + vmovupd(ymm8, mem(rcx, rsi, 2)) + vmovupd(ymm10, mem(rcx, r13, 1)) + + lea(mem(rcx, rsi, 4), rcx) + + vunpcklpd(ymm14, ymm12, ymm0) + vunpckhpd(ymm14, ymm12, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm4) + + vfmadd231pd(mem(r14), xmm3, xmm0) + vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) + vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) + vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4) + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm4, mem(r14, r13, 1)) + + lea(mem(r14, rsi, 4), r14) + + + vunpcklpd(ymm7, ymm5, ymm0) + vunpckhpd(ymm7, ymm5, ymm1) + vunpcklpd(ymm11, ymm9, ymm2) + vunpckhpd(ymm11, ymm9, ymm3) + vinsertf128(imm(0x1), xmm2, ymm0, ymm5) + vinsertf128(imm(0x1), xmm3, ymm1, ymm7) + vperm2f128(imm(0x31), ymm2, ymm0, ymm9) + vperm2f128(imm(0x31), ymm3, ymm1, ymm11) + + vbroadcastsd(mem(rbx), ymm3) + + vfmadd231pd(mem(rcx), ymm3, ymm5) + vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) + vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) + vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm11) + vmovupd(ymm5, mem(rcx)) + vmovupd(ymm7, mem(rcx, rsi, 1)) + vmovupd(ymm9, mem(rcx, rsi, 2)) + vmovupd(ymm11, mem(rcx, r13, 1)) + + //lea(mem(rcx, rsi, 4), rcx) + + vunpcklpd(ymm15, ymm13, ymm0) + vunpckhpd(ymm15, ymm13, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm4) + + vfmadd231pd(mem(r14), xmm3, xmm0) + vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) + vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) + vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4) + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm4, mem(r14, r13, 1)) + + //lea(mem(r14, rsi, 4), r14) + + + + jmp(.DDONE) // jump to end. + + + + label(.DBETAZERO) + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. + jz(.DROWSTORBZ) // jump to row storage case + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DGENSTORBZ) + + + vmovapd(ymm4, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm6, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovapd(ymm6, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm8, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovapd(ymm8, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovapd(ymm10, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovapd(ymm12, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovapd(ymm14, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c - " \n\t" - " \n\t" - "vmovapd %%ymm5, %%ymm0 \n\t" + + + mov(rdx, rcx) // rcx = c + 4*cs_c + + + vmovapd(ymm5, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm7, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovapd(ymm7, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm9, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovapd(ymm9, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovapd(ymm11, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm13, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovapd(ymm13, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm15, %%ymm0 \n\t" + add(rdi, rcx) // c += rs_c; + + + vmovapd(ymm15, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DROWSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vunpcklpd %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpckhpd %%ymm6, %%ymm4, %%ymm1 \n\t" - "vunpcklpd %%ymm10, %%ymm8, %%ymm2 \n\t" - "vunpckhpd %%ymm10, %%ymm8, %%ymm3 \n\t" - "vinsertf128 $0x1, %%xmm2, %%ymm0, %%ymm4 \n\t" - "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm6 \n\t" - "vperm2f128 $0x31, %%ymm2, %%ymm0, %%ymm8 \n\t" - "vperm2f128 $0x31, %%ymm3, %%ymm1, %%ymm10 \n\t" - " \n\t" - "vmovupd %%ymm4, (%%rcx ) \n\t" - "vmovupd %%ymm6, (%%rcx,%%rsi ) \n\t" - "vmovupd %%ymm8, (%%rcx,%%rsi,2) \n\t" - "vmovupd %%ymm10, (%%rcx,%%r13 ) \n\t" - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" - " \n\t" - "vunpcklpd %%ymm14, %%ymm12, %%ymm0 \n\t" - "vunpckhpd %%ymm14, %%ymm12, %%ymm1 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm4 \n\t" - " \n\t" - "vmovupd %%xmm0, (%%r14 ) \n\t" - "vmovupd %%xmm1, (%%r14,%%rsi ) \n\t" - "vmovupd %%xmm2, (%%r14,%%rsi,2) \n\t" - "vmovupd %%xmm4, (%%r14,%%r13 ) \n\t" - " \n\t" - "leaq (%%r14,%%rsi,4), %%r14 \n\t" - " \n\t" - " \n\t" - "vunpcklpd %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpckhpd %%ymm7, %%ymm5, %%ymm1 \n\t" - "vunpcklpd %%ymm11, %%ymm9, %%ymm2 \n\t" - "vunpckhpd %%ymm11, %%ymm9, %%ymm3 \n\t" - "vinsertf128 $0x1, %%xmm2, %%ymm0, %%ymm5 \n\t" - "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm7 \n\t" - "vperm2f128 $0x31, %%ymm2, %%ymm0, %%ymm9 \n\t" - "vperm2f128 $0x31, %%ymm3, %%ymm1, %%ymm11 \n\t" - " \n\t" - "vmovupd %%ymm5, (%%rcx ) \n\t" - "vmovupd %%ymm7, (%%rcx,%%rsi ) \n\t" - "vmovupd %%ymm9, (%%rcx,%%rsi,2) \n\t" - "vmovupd %%ymm11, (%%rcx,%%r13 ) \n\t" - " \n\t" - //"leaq (%%rcx,%%rsi,4), %%rcx \n\t" - " \n\t" - "vunpcklpd %%ymm15, %%ymm13, %%ymm0 \n\t" - "vunpckhpd %%ymm15, %%ymm13, %%ymm1 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm4 \n\t" - " \n\t" - "vmovupd %%xmm0, (%%r14 ) \n\t" - "vmovupd %%xmm1, (%%r14,%%rsi ) \n\t" - "vmovupd %%xmm2, (%%r14,%%rsi,2) \n\t" - "vmovupd %%xmm4, (%%r14,%%r13 ) \n\t" - " \n\t" - //"leaq (%%r14,%%rsi,4), %%r14 \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DDONE: \n\t" - " \n\t" - " \n\t" + + + + jmp(.DDONE) // jump to end. + + + + label(.DROWSTORBZ) + + + vmovupd(ymm4, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm5, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm6, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm7, mem(rdx)) + add(rdi, rdx) + + + vmovupd(ymm8, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm9, mem(rdx)) + add(rdi, rdx) + + + vmovupd(ymm10, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm11, mem(rdx)) + add(rdi, rdx) + + + vmovupd(ymm12, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm13, mem(rdx)) + add(rdi, rdx) + + + vmovupd(ymm14, mem(rcx)) + //add(rdi, rcx) + vmovupd(ymm15, mem(rdx)) + //add(rdi, rdx) + + + jmp(.DDONE) // jump to end. 
+ + + + label(.DCOLSTORBZ) + + + vunpcklpd(ymm6, ymm4, ymm0) + vunpckhpd(ymm6, ymm4, ymm1) + vunpcklpd(ymm10, ymm8, ymm2) + vunpckhpd(ymm10, ymm8, ymm3) + vinsertf128(imm(0x1), xmm2, ymm0, ymm4) + vinsertf128(imm(0x1), xmm3, ymm1, ymm6) + vperm2f128(imm(0x31), ymm2, ymm0, ymm8) + vperm2f128(imm(0x31), ymm3, ymm1, ymm10) + + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm6, mem(rcx, rsi, 1)) + vmovupd(ymm8, mem(rcx, rsi, 2)) + vmovupd(ymm10, mem(rcx, r13, 1)) + + lea(mem(rcx, rsi, 4), rcx) + + vunpcklpd(ymm14, ymm12, ymm0) + vunpckhpd(ymm14, ymm12, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm4) + + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm4, mem(r14, r13, 1)) + + lea(mem(r14, rsi, 4), r14) + + + vunpcklpd(ymm7, ymm5, ymm0) + vunpckhpd(ymm7, ymm5, ymm1) + vunpcklpd(ymm11, ymm9, ymm2) + vunpckhpd(ymm11, ymm9, ymm3) + vinsertf128(imm(0x1), xmm2, ymm0, ymm5) + vinsertf128(imm(0x1), xmm3, ymm1, ymm7) + vperm2f128(imm(0x31), ymm2, ymm0, ymm9) + vperm2f128(imm(0x31), ymm3, ymm1, ymm11) + + vmovupd(ymm5, mem(rcx)) + vmovupd(ymm7, mem(rcx, rsi, 1)) + vmovupd(ymm9, mem(rcx, rsi, 2)) + vmovupd(ymm11, mem(rcx, r13, 1)) + + //lea(mem(rcx, rsi, 4), rcx) + + vunpcklpd(ymm15, ymm13, ymm0) + vunpckhpd(ymm15, ymm13, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm4) + + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm4, mem(r14, r13, 1)) + + //lea(mem(r14, rsi, 4), r14) + + + + label(.DDONE) + + : // output operands (none) : // input operands @@ -1639,33 +1641,33 @@ void bli_dgemm_zen_asm_6x8 // assumes beta.r, beta.i have been broadcast into ymm1, ymm2. // outputs to ymm0 #define CGEMM_INPUT_SCALE_GS_BETA_NZ \ - "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ - "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ - "vmovlpd (%%rcx,%%rsi,2), %%xmm3, %%xmm3 \n\t" \ - "vmovhpd (%%rcx,%%r13 ), %%xmm3, %%xmm3 \n\t" \ - "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ - "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ - "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ - "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ - "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" + vmovlpd(mem(rcx), xmm0, xmm0) \ + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ + vmovlpd(mem(rcx, rsi, 2), xmm3, xmm3) \ + vmovhpd(mem(rcx, r13, 1), xmm3, xmm3) \ + vinsertf128(imm(1), xmm3, ymm0, ymm0) \ + vpermilps(imm(0xb1), ymm0, ymm3) \ + vmulps(ymm1, ymm0, ymm0) \ + vmulps(ymm2, ymm3, ymm3) \ + vaddsubps(ymm3, ymm0, ymm0) // assumes values to output are in ymm0 #define CGEMM_OUTPUT_GS \ - "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ - "vmovlpd %%xmm0, (%%rcx ) \n\t" \ - "vmovhpd %%xmm0, (%%rcx,%%rsi,1) \n\t" \ - "vmovlpd %%xmm3, (%%rcx,%%rsi,2) \n\t" \ - "vmovhpd %%xmm3, (%%rcx,%%r13 ) \n\t" + vextractf128(imm(1), ymm0, xmm3) \ + vmovlpd(xmm0, mem(rcx)) \ + vmovhpd(xmm0, mem(rcx, rsi, 1)) \ + vmovlpd(xmm3, mem(rcx, rsi, 2)) \ + vmovhpd(xmm3, mem(rcx, r13, 1)) #define CGEMM_INPUT_SCALE_RS_BETA_NZ \ - "vmovups (%%rcx), %%ymm0 \n\t" \ - "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ - "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ - "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ - "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" + vmovups(mem(rcx), ymm0) \ + vpermilps(imm(0xb1), ymm0, ymm3) \ + vmulps(ymm1, ymm0, ymm0) \ + vmulps(ymm2, ymm3, ymm3) \ + vaddsubps(ymm3, ymm0, ymm0) #define CGEMM_OUTPUT_RS \ - "vmovups %%ymm0, (%%rcx) \n\t" \ + vmovups(ymm0, mem(rcx)) \ void bli_cgemm_zen_asm_3x8 ( @@ -1691,455 +1693,455 @@ void bli_cgemm_zen_asm_3x8 __asm__ 
volatile ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. - " \n\t" - "addq $32 * 4, %%rbx \n\t" - " \n\t" // initialize loop by pre-loading - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %7, %%rdi \n\t" // load rs_c - "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(scomplex) - " \n\t" - "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*rs_c; - "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*rs_c; - " \n\t" - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c - "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*rs_c - "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".CLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 32 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastss 6 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 7 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 8 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 10 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 38 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastss 12 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 14 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" 
- "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 16 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 17 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastss 18 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 19 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 20 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 21 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 22 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 23 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 3 * 8, %%rax \n\t" // a += 4*3 (unroll x mr) - "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .CLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".CCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".CLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 32 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 3 * 8, %%rax \n\t" // a += 1*3 (unroll x mr) - "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .CLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".CPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" // permute even and odd elements - " \n\t" // of ymm6/7, ymm10/11, ymm/14/15 - "vpermilps $0xb1, %%ymm6, %%ymm6 \n\t" - "vpermilps $0xb1, %%ymm7, %%ymm7 \n\t" - "vpermilps $0xb1, %%ymm10, %%ymm10 \n\t" - "vpermilps $0xb1, %%ymm11, %%ymm11 \n\t" - "vpermilps $0xb1, %%ymm14, %%ymm14 \n\t" - "vpermilps $0xb1, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" // subtract/add even/odd elements - "vaddsubps %%ymm6, %%ymm4, %%ymm4 \n\t" - "vaddsubps %%ymm7, %%ymm5, %%ymm5 \n\t" - " \n\t" - "vaddsubps %%ymm10, %%ymm8, %%ymm8 \n\t" - "vaddsubps %%ymm11, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vaddsubps %%ymm14, %%ymm12, %%ymm12 \n\t" - "vaddsubps %%ymm15, %%ymm13, %%ymm13 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate - "vbroadcastss 4(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate - " \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm4, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm4, %%ymm4 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm5, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm5, %%ymm5 \n\t" - " \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm8, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm9, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm9, %%ymm9 \n\t" - " \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm12, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm13, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm13, %%ymm13 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastss (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate - "vbroadcastss 4(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %8, %%rsi \n\t" // load cs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(scomplex) - "leaq (,%%rsi,4), %%rdx \n\t" // rdx = 4*cs_c; - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomiss %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. - "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); - "vucomiss %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. - "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); - "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. - "jne .CBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "jz .CROWSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".CGENSTORED: \n\t" - " \n\t" - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + //mov(%9, r15) // load address of b_next. 
+ + add(imm(32*4), rbx) + // initialize loop by pre-loading + vmovaps(mem(rbx, -4*32), ymm0) + vmovaps(mem(rbx, -3*32), ymm1) + + mov(%6, rcx) // load address of c + mov(%7, rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(scomplex) + + lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*rs_c; + lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*rs_c; + + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r11, 7*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, 7*8)) // prefetch c + 2*rs_c + + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.CCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.CLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 32*8)) + + vbroadcastss(mem(rax, 0*4), ymm2) + vbroadcastss(mem(rax, 1*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 2*4), ymm2) + vbroadcastss(mem(rax, 3*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 4*4), ymm2) + vbroadcastss(mem(rax, 5*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, -2*32), ymm0) + vmovaps(mem(rbx, -1*32), ymm1) + + // iteration 1 + vbroadcastss(mem(rax, 6*4), ymm2) + vbroadcastss(mem(rax, 7*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 8*4), ymm2) + vbroadcastss(mem(rax, 9*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 10*4), ymm2) + vbroadcastss(mem(rax, 11*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, 0*32), ymm0) + vmovaps(mem(rbx, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 38*8)) + + vbroadcastss(mem(rax, 12*4), ymm2) + vbroadcastss(mem(rax, 13*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 14*4), ymm2) + vbroadcastss(mem(rax, 15*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 16*4), ymm2) + vbroadcastss(mem(rax, 17*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, 2*32), ymm0) + vmovaps(mem(rbx, 3*32), ymm1) + + // iteration 3 + vbroadcastss(mem(rax, 18*4), ymm2) + vbroadcastss(mem(rax, 19*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 20*4), ymm2) + vbroadcastss(mem(rax, 21*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 22*4), ymm2) + vbroadcastss(mem(rax, 23*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + add(imm(4*3*8), rax) // a += 4*3 (unroll x 
mr) + add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) + + vmovaps(mem(rbx, -4*32), ymm0) + vmovaps(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.CLOOPKITER) // iterate again if i != 0. + + + + + + + label(.CCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.CPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.CLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rax, 32*8)) + + vbroadcastss(mem(rax, 0*4), ymm2) + vbroadcastss(mem(rax, 1*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 2*4), ymm2) + vbroadcastss(mem(rax, 3*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 4*4), ymm2) + vbroadcastss(mem(rax, 5*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + add(imm(1*3*8), rax) // a += 1*3 (unroll x mr) + add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr) + + vmovaps(mem(rbx, -4*32), ymm0) + vmovaps(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.CLOOPKLEFT) // iterate again if i != 0. + + + + label(.CPOSTACCUM) + + + // permute even and odd elements + // of ymm6/7, ymm10/11, ymm/14/15 + vpermilps(imm(0xb1), ymm6, ymm6) + vpermilps(imm(0xb1), ymm7, ymm7) + vpermilps(imm(0xb1), ymm10, ymm10) + vpermilps(imm(0xb1), ymm11, ymm11) + vpermilps(imm(0xb1), ymm14, ymm14) + vpermilps(imm(0xb1), ymm15, ymm15) + + + // subtract/add even/odd elements + vaddsubps(ymm6, ymm4, ymm4) + vaddsubps(ymm7, ymm5, ymm5) + + vaddsubps(ymm10, ymm8, ymm8) + vaddsubps(ymm11, ymm9, ymm9) + + vaddsubps(ymm14, ymm12, ymm12) + vaddsubps(ymm15, ymm13, ymm13) + + + + + mov(%4, rax) // load address of alpha + vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate + vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate + + + vpermilps(imm(0xb1), ymm4, ymm3) + vmulps(ymm0, ymm4, ymm4) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm4, ymm4) + + vpermilps(imm(0xb1), ymm5, ymm3) + vmulps(ymm0, ymm5, ymm5) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm5, ymm5) + + + vpermilps(imm(0xb1), ymm8, ymm3) + vmulps(ymm0, ymm8, ymm8) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm8, ymm8) + + vpermilps(imm(0xb1), ymm9, ymm3) + vmulps(ymm0, ymm9, ymm9) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm9, ymm9) + + + vpermilps(imm(0xb1), ymm12, ymm3) + vmulps(ymm0, ymm12, ymm12) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm12, ymm12) + + vpermilps(imm(0xb1), ymm13, ymm3) + vmulps(ymm0, ymm13, ymm13) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm13, ymm13) + + + + + + mov(%5, rbx) // load address of beta + vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate + vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate + + + + + mov(%8, rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(scomplex) + lea(mem(, rsi, 4), rdx) // rdx = 4*cs_c; + lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; + + + + // now avoid loading C if beta == 0 + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. + sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); + vucomiss(xmm0, xmm2) // set ZF if beta_i == 0. + sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); + and(r8b, r9b) // set ZF if r8b & r9b == 1. 
+ jne(.CBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. + jz(.CROWSTORED) // jump to row storage case + + + + label(.CGENSTORED) + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" + vaddps(ymm4, ymm0, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*cs_c; + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" + vaddps(ymm5, ymm0, ymm0) CGEMM_OUTPUT_GS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c - " \n\t" - " \n\t" - " \n\t" + mov(r11, rcx) // rcx = c + 1*rs_c + + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" + vaddps(ymm8, ymm0, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*cs_c; + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" + vaddps(ymm9, ymm0, ymm0) CGEMM_OUTPUT_GS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c - " \n\t" - " \n\t" - " \n\t" + mov(r12, rcx) // rcx = c + 2*rs_c + + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + vaddps(ymm12, ymm0, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*cs_c; + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + vaddps(ymm13, ymm0, ymm0) CGEMM_OUTPUT_GS - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CROWSTORED: \n\t" - " \n\t" - " \n\t" + + + + jmp(.CDONE) // jump to end. + + + + label(.CROWSTORED) + + CGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" + vaddps(ymm4, ymm0, ymm0) CGEMM_OUTPUT_RS - "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*cs_c; + + CGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" + vaddps(ymm5, ymm0, ymm0) CGEMM_OUTPUT_RS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c - " \n\t" - " \n\t" - " \n\t" + mov(r11, rcx) // rcx = c + 1*rs_c + + + CGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" + vaddps(ymm8, ymm0, ymm0) CGEMM_OUTPUT_RS - "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*cs_c; + + CGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" + vaddps(ymm9, ymm0, ymm0) CGEMM_OUTPUT_RS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c - " \n\t" - " \n\t" - " \n\t" + mov(r12, rcx) // rcx = c + 2*rs_c + + + CGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + vaddps(ymm12, ymm0, ymm0) CGEMM_OUTPUT_RS - "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*cs_c; + + CGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + vaddps(ymm13, ymm0, ymm0) CGEMM_OUTPUT_RS - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CBETAZERO: \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "jz .CROWSTORBZ \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".CGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm4, %%ymm0 \n\t" + + + + jmp(.CDONE) // jump to end. + + + + label(.CBETAZERO) + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. 
+ jz(.CROWSTORBZ) // jump to row storage case + + + + label(.CGENSTORBZ) + + + vmovaps(ymm4, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm5, %%ymm0 \n\t" + add(rdx, rcx) // c += 2*cs_c; + + + vmovaps(ymm5, ymm0) CGEMM_OUTPUT_GS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c - " \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm8, %%ymm0 \n\t" + mov(r11, rcx) // rcx = c + 1*rs_c + + + + vmovaps(ymm8, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm9, %%ymm0 \n\t" + add(rdx, rcx) // c += 2*cs_c; + + + vmovaps(ymm9, ymm0) CGEMM_OUTPUT_GS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c - " \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm12, %%ymm0 \n\t" + mov(r12, rcx) // rcx = c + 2*rs_c + + + + vmovaps(ymm12, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm13, %%ymm0 \n\t" + add(rdx, rcx) // c += 2*cs_c; + + + vmovaps(ymm13, ymm0) CGEMM_OUTPUT_GS - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CROWSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" - "vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t" - " \n\t" - "vmovups %%ymm8, (%%r11) \n\t" - "vmovups %%ymm9, (%%r11,%%rdx,1) \n\t" - " \n\t" - "vmovups %%ymm12, (%%r12) \n\t" - "vmovups %%ymm13, (%%r12,%%rdx,1) \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".CDONE: \n\t" - " \n\t" - " \n\t" + + + + jmp(.CDONE) // jump to end. + + + + label(.CROWSTORBZ) + + + vmovups(ymm4, mem(rcx)) + vmovups(ymm5, mem(rcx, rdx, 1)) + + vmovups(ymm8, mem(r11)) + vmovups(ymm9, mem(r11, rdx, 1)) + + vmovups(ymm12, mem(r12)) + vmovups(ymm13, mem(r12, rdx, 1)) + + + + + + + label(.CDONE) + + : // output operands (none) : // input operands @@ -2171,29 +2173,29 @@ void bli_cgemm_zen_asm_3x8 // assumes beta.r, beta.i have been broadcast into ymm1, ymm2. // outputs to ymm0 #define ZGEMM_INPUT_SCALE_GS_BETA_NZ \ - "vmovupd (%%rcx), %%xmm0 \n\t" \ - "vmovupd (%%rcx,%%rsi), %%xmm3 \n\t" \ - "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ - "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ - "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ - "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ - "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" + vmovupd(mem(rcx), xmm0) \ + vmovupd(mem(rcx, rsi, 1), xmm3) \ + vinsertf128(imm(1), xmm3, ymm0, ymm0) \ + vpermilpd(imm(0x5), ymm0, ymm3) \ + vmulpd(ymm1, ymm0, ymm0) \ + vmulpd(ymm2, ymm3, ymm3) \ + vaddsubpd(ymm3, ymm0, ymm0) // assumes values to output are in ymm0 #define ZGEMM_OUTPUT_GS \ - "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ - "vmovupd %%xmm0, (%%rcx) \n\t" \ - "vmovupd %%xmm3, (%%rcx,%%rsi ) \n\t" \ + vextractf128(imm(1), ymm0, xmm3) \ + vmovupd(xmm0, mem(rcx)) \ + vmovupd(xmm3, mem(rcx, rsi, 1)) \ #define ZGEMM_INPUT_SCALE_RS_BETA_NZ \ - "vmovupd (%%rcx), %%ymm0 \n\t" \ - "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ - "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ - "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ - "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" + vmovupd(mem(rcx), ymm0) \ + vpermilpd(imm(0x5), ymm0, ymm3) \ + vmulpd(ymm1, ymm0, ymm0) \ + vmulpd(ymm2, ymm3, ymm3) \ + vaddsubpd(ymm3, ymm0, ymm0) #define ZGEMM_OUTPUT_RS \ - "vmovupd %%ymm0, (%%rcx) \n\t" \ + vmovupd(ymm0, mem(rcx)) \ void bli_zgemm_zen_asm_3x4 ( @@ -2219,455 +2221,455 @@ void bli_zgemm_zen_asm_3x4 __asm__ volatile ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. 
- //"movq %9, %%r15 \n\t" // load address of b_next. - " \n\t" - "addq $32 * 4, %%rbx \n\t" - " \n\t" // initialize loop by pre-loading - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %7, %%rdi \n\t" // load rs_c - "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(dcomplex) - "leaq (,%%rdi,2), %%rdi \n\t" - " \n\t" - "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*rs_c; - "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*rs_c; - " \n\t" - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c - "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*rs_c - "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .ZCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".ZLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 32 * 16(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd -2 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 0 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 38 * 16(%%rax) \n\t" - " \n\t" - "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - 
"vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 2 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd 3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 3 * 16, %%rax \n\t" // a += 4*3 (unroll x mr) - "addq $4 * 4 * 16, %%rbx \n\t" // b += 4*4 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .ZLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".ZCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .ZPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".ZLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 32 * 16(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 3 * 16, %%rax \n\t" // a += 1*3 (unroll x mr) - "addq $1 * 4 * 16, %%rbx \n\t" // b += 1*4 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .ZLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".ZPOSTACCUM: \n\t" - " \n\t" - " \n\t" // permute even and odd elements - " \n\t" // of ymm6/7, ymm10/11, ymm/14/15 - "vpermilpd $0x5, %%ymm6, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm7, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm10, %%ymm10 \n\t" - "vpermilpd $0x5, %%ymm11, %%ymm11 \n\t" - "vpermilpd $0x5, %%ymm14, %%ymm14 \n\t" - "vpermilpd $0x5, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" // subtract/add even/odd elements - "vaddsubpd %%ymm6, %%ymm4, %%ymm4 \n\t" - "vaddsubpd %%ymm7, %%ymm5, %%ymm5 \n\t" - " \n\t" - "vaddsubpd %%ymm10, %%ymm8, %%ymm8 \n\t" - "vaddsubpd %%ymm11, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vaddsubpd %%ymm14, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm15, %%ymm13, %%ymm13 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate - "vbroadcastsd 8(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate - " \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm4, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm4, %%ymm4 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm5, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm5, %%ymm5 \n\t" - " \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm8, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm9, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm9, %%ymm9 \n\t" - " \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm12, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm13, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm13, %%ymm13 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastsd (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate - "vbroadcastsd 8(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %8, %%rsi \n\t" // load cs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(dcomplex) - "leaq (,%%rsi,2), %%rsi \n\t" - "leaq (,%%rsi,2), %%rdx \n\t" // rdx = 2*cs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomisd %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. - "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); - "vucomisd %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. - "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); - "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. - "jne .ZBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $16, %%rsi \n\t" // set ZF if (16*cs_c) == 16. - "jz .ZROWSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".ZGENSTORED: \n\t" - " \n\t" - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + //mov(%9, r15) // load address of b_next. 
+ + add(imm(32*4), rbx) + // initialize loop by pre-loading + vmovapd(mem(rbx, -4*32), ymm0) + vmovapd(mem(rbx, -3*32), ymm1) + + mov(%6, rcx) // load address of c + mov(%7, rdi) // load rs_c + lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dcomplex) + lea(mem(, rdi, 2), rdi) + + lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*rs_c; + lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*rs_c; + + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c + prefetch(0, mem(r11, 7*8)) // prefetch c + 1*rs_c + prefetch(0, mem(r12, 7*8)) // prefetch c + 2*rs_c + + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.ZCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.ZLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 32*16)) + + vbroadcastsd(mem(rax, 0*8), ymm2) + vbroadcastsd(mem(rax, 1*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 2*8), ymm2) + vbroadcastsd(mem(rax, 3*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 4*8), ymm2) + vbroadcastsd(mem(rax, 5*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, -2*32), ymm0) + vmovapd(mem(rbx, -1*32), ymm1) + + // iteration 1 + vbroadcastsd(mem(rax, 6*8), ymm2) + vbroadcastsd(mem(rax, 7*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 8*8), ymm2) + vbroadcastsd(mem(rax, 9*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 10*8), ymm2) + vbroadcastsd(mem(rax, 11*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, 0*32), ymm0) + vmovapd(mem(rbx, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 38*16)) + + vbroadcastsd(mem(rax, 12*8), ymm2) + vbroadcastsd(mem(rax, 13*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 14*8), ymm2) + vbroadcastsd(mem(rax, 15*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 16*8), ymm2) + vbroadcastsd(mem(rax, 17*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, 2*32), ymm0) + vmovapd(mem(rbx, 3*32), ymm1) + + // iteration 3 + vbroadcastsd(mem(rax, 18*8), ymm2) + vbroadcastsd(mem(rax, 19*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 20*8), ymm2) + vbroadcastsd(mem(rax, 21*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 22*8), ymm2) + vbroadcastsd(mem(rax, 23*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + add(imm(4*3*16), 
rax) // a += 4*3 (unroll x mr) + add(imm(4*4*16), rbx) // b += 4*4 (unroll x nr) + + vmovapd(mem(rbx, -4*32), ymm0) + vmovapd(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.ZLOOPKITER) // iterate again if i != 0. + + + + + + + label(.ZCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.ZLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rax, 32*16)) + + vbroadcastsd(mem(rax, 0*8), ymm2) + vbroadcastsd(mem(rax, 1*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 2*8), ymm2) + vbroadcastsd(mem(rax, 3*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 4*8), ymm2) + vbroadcastsd(mem(rax, 5*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + add(imm(1*3*16), rax) // a += 1*3 (unroll x mr) + add(imm(1*4*16), rbx) // b += 1*4 (unroll x nr) + + vmovapd(mem(rbx, -4*32), ymm0) + vmovapd(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.ZLOOPKLEFT) // iterate again if i != 0. + + + + label(.ZPOSTACCUM) + + // permute even and odd elements + // of ymm6/7, ymm10/11, ymm/14/15 + vpermilpd(imm(0x5), ymm6, ymm6) + vpermilpd(imm(0x5), ymm7, ymm7) + vpermilpd(imm(0x5), ymm10, ymm10) + vpermilpd(imm(0x5), ymm11, ymm11) + vpermilpd(imm(0x5), ymm14, ymm14) + vpermilpd(imm(0x5), ymm15, ymm15) + + + // subtract/add even/odd elements + vaddsubpd(ymm6, ymm4, ymm4) + vaddsubpd(ymm7, ymm5, ymm5) + + vaddsubpd(ymm10, ymm8, ymm8) + vaddsubpd(ymm11, ymm9, ymm9) + + vaddsubpd(ymm14, ymm12, ymm12) + vaddsubpd(ymm15, ymm13, ymm13) + + + + + mov(%4, rax) // load address of alpha + vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate + vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate + + + vpermilpd(imm(0x5), ymm4, ymm3) + vmulpd(ymm0, ymm4, ymm4) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm4, ymm4) + + vpermilpd(imm(0x5), ymm5, ymm3) + vmulpd(ymm0, ymm5, ymm5) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm5, ymm5) + + + vpermilpd(imm(0x5), ymm8, ymm3) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm8, ymm8) + + vpermilpd(imm(0x5), ymm9, ymm3) + vmulpd(ymm0, ymm9, ymm9) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm9, ymm9) + + + vpermilpd(imm(0x5), ymm12, ymm3) + vmulpd(ymm0, ymm12, ymm12) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm12, ymm12) + + vpermilpd(imm(0x5), ymm13, ymm3) + vmulpd(ymm0, ymm13, ymm13) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm13, ymm13) + + + + + + mov(%5, rbx) // load address of beta + vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate + vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate + + + + + mov(%8, rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dcomplex) + lea(mem(, rsi, 2), rsi) + lea(mem(, rsi, 2), rdx) // rdx = 2*cs_c; + + + + // now avoid loading C if beta == 0 + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta_r == 0. + sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); + vucomisd(xmm0, xmm2) // set ZF if beta_i == 0. + sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); + and(r8b, r9b) // set ZF if r8b & r9b == 1. 
+ jne(.ZBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16. + jz(.ZROWSTORED) // jump to row storage case + + + + label(.ZGENSTORED) + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm4, ymm0, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*cs_c; + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm5, ymm0, ymm0) ZGEMM_OUTPUT_GS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c - " \n\t" - " \n\t" - " \n\t" + mov(r11, rcx) // rcx = c + 1*rs_c + + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm8, ymm0, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*cs_c; + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm9, ymm0, ymm0) ZGEMM_OUTPUT_GS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c - " \n\t" - " \n\t" - " \n\t" + mov(r12, rcx) // rcx = c + 2*rs_c + + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm12, ymm0, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*cs_c; + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm13, ymm0, ymm0) ZGEMM_OUTPUT_GS - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZROWSTORED: \n\t" - " \n\t" - " \n\t" + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZROWSTORED) + + ZGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm4, ymm0, ymm0) ZGEMM_OUTPUT_RS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*cs_c; + + ZGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm5, ymm0, ymm0) ZGEMM_OUTPUT_RS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c - " \n\t" - " \n\t" - " \n\t" + mov(r11, rcx) // rcx = c + 1*rs_c + + + ZGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm8, ymm0, ymm0) ZGEMM_OUTPUT_RS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*cs_c; + + ZGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm9, ymm0, ymm0) ZGEMM_OUTPUT_RS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c - " \n\t" - " \n\t" - " \n\t" + mov(r12, rcx) // rcx = c + 2*rs_c + + + ZGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm12, ymm0, ymm0) ZGEMM_OUTPUT_RS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*cs_c; + + ZGEMM_INPUT_SCALE_RS_BETA_NZ - "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm13, ymm0, ymm0) ZGEMM_OUTPUT_RS - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZBETAZERO: \n\t" - " \n\t" - "cmpq $16, %%rsi \n\t" // set ZF if (16*cs_c) == 16. - "jz .ZROWSTORBZ \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".ZGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm4, %%ymm0 \n\t" + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZBETAZERO) + + cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16. 
+ jz(.ZROWSTORBZ) // jump to row storage case + + + + label(.ZGENSTORBZ) + + + vmovapd(ymm4, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm5, %%ymm0 \n\t" + add(rdx, rcx) // c += 2*cs_c; + + + vmovapd(ymm5, ymm0) ZGEMM_OUTPUT_GS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c - " \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm8, %%ymm0 \n\t" + mov(r11, rcx) // rcx = c + 1*rs_c + + + + vmovapd(ymm8, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm9, %%ymm0 \n\t" + add(rdx, rcx) // c += 2*cs_c; + + + vmovapd(ymm9, ymm0) ZGEMM_OUTPUT_GS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c - " \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm0 \n\t" + mov(r12, rcx) // rcx = c + 2*rs_c + + + + vmovapd(ymm12, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm13, %%ymm0 \n\t" + add(rdx, rcx) // c += 2*cs_c; + + + vmovapd(ymm13, ymm0) ZGEMM_OUTPUT_GS - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZROWSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm4, (%%rcx) \n\t" - "vmovupd %%ymm5, (%%rcx,%%rdx,1) \n\t" - " \n\t" - "vmovupd %%ymm8, (%%r11) \n\t" - "vmovupd %%ymm9, (%%r11,%%rdx,1) \n\t" - " \n\t" - "vmovupd %%ymm12, (%%r12) \n\t" - "vmovupd %%ymm13, (%%r12,%%rdx,1) \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".ZDONE: \n\t" - " \n\t" - " \n\t" + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZROWSTORBZ) + + + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm5, mem(rcx, rdx, 1)) + + vmovupd(ymm8, mem(r11)) + vmovupd(ymm9, mem(r11, rdx, 1)) + + vmovupd(ymm12, mem(r12)) + vmovupd(ymm13, mem(r12, rdx, 1)) + + + + + + + label(.ZDONE) + + : // output operands (none) : // input operands @@ -2693,3 +2695,4 @@ void bli_zgemm_zen_asm_3x4 ); } + diff --git a/kernels/zen/3/bli_gemm_zen_asm_d8x6.c b/kernels/zen/3/bli_gemm_zen_asm_d8x6.c index d55214737..46179b1af 100644 --- a/kernels/zen/3/bli_gemm_zen_asm_d8x6.c +++ b/kernels/zen/3/bli_gemm_zen_asm_d8x6.c @@ -34,15 +34,17 @@ #include "blis.h" +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" #define SGEMM_INPUT_GS_BETA_NZ \ - "vmovlps (%%rcx ), %%xmm0, %%xmm0 \n\t" \ - "vmovhps (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ - "vmovlps (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhps (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ - "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" \ - "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ - "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ + vmovlps(mem(rcx), xmm0, xmm0) \ + vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) \ + vmovlps(mem(rcx, rsi, 2), xmm1, xmm1) \ + vmovhps(mem(rcx, r13, 1), xmm1, xmm1) \ + vshufps(imm(0x88), xmm1, xmm0, xmm0) \ + vmovlps(mem(rcx, rsi, 4), xmm2, xmm2) \ + vmovhps(mem(rcx, r15, 1), xmm2, xmm2) \ /* We can't use vmovhps for loading the last element becauase that might result in reading beyond valid memory. (vmov[lh]psd load pairs of adjacent floats at a time.) So we need to use vmovss @@ -50,29 +52,29 @@ (ymm3 contains beta and ymm4 through ymm15 contain the microtile) and due to the way vmovss zeros out all bits above 31, we have to load element 7 before element 6. 
*/ \ - "vmovss (%%rcx,%%r10 ), %%xmm1 \n\t" \ - "vpermilps $0xcf, %%xmm1, %%xmm1 \n\t" \ - "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ - /*"vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t"*/ \ - "vshufps $0x88, %%xmm1, %%xmm2, %%xmm2 \n\t" \ - "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" + vmovss(mem(rcx, r10, 1), xmm1) \ + vpermilps(imm(0xcf), xmm1, xmm1) \ + vmovlps(mem(rcx, r13, 2), xmm1, xmm1) \ + /*vmovhps(mem(rcx, r10, 1), xmm1, xmm1)*/ \ + vshufps(imm(0x88), xmm1, xmm2, xmm2) \ + vperm2f128(imm(0x20), ymm2, ymm0, ymm0) #define SGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" \ - "vmovss %%xmm0, (%%rcx ) \n\t" \ - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%rsi,1) \n\t" \ - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" \ - "vmovss %%xmm0, (%%rcx,%%rsi,2) \n\t" \ - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r13 ) \n\t" \ - "vmovss %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r15 ) \n\t" \ - "vpermilps $0x39, %%xmm1, %%xmm2 \n\t" \ - "vmovss %%xmm2, (%%rcx,%%r13,2) \n\t" \ - "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r10 ) \n\t" + vextractf128(imm(1), ymm0, xmm2) \ + vmovss(xmm0, mem(rcx)) \ + vpermilps(imm(0x39), xmm0, xmm1) \ + vmovss(xmm1, mem(rcx, rsi, 1)) \ + vpermilps(imm(0x39), xmm1, xmm0) \ + vmovss(xmm0, mem(rcx, rsi, 2)) \ + vpermilps(imm(0x39), xmm0, xmm1) \ + vmovss(xmm1, mem(rcx, r13, 1)) \ + vmovss(xmm2, mem(rcx, rsi, 4)) \ + vpermilps(imm(0x39), xmm2, xmm1) \ + vmovss(xmm1, mem(rcx, r15, 1)) \ + vpermilps(imm(0x39), xmm1, xmm2) \ + vmovss(xmm2, mem(rcx, r13, 2)) \ + vpermilps(imm(0x39), xmm2, xmm1) \ + vmovss(xmm1, mem(rcx, r10, 1)) void bli_sgemm_zen_asm_16x6 ( @@ -98,519 +100,519 @@ void bli_sgemm_zen_asm_16x6 __asm__ volatile ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. - " \n\t" - "addq $32 * 4, %%rax \n\t" - " \n\t" // initialize loop by pre-loading - "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float) - " \n\t" - "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*cs_c; - "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*cs_c; - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c - "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*cs_c - "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*cs_c - "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*cs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. 
- " \n\t" - " \n\t" - ".SLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 128 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps -2 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -1 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastss 6 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 7 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 8 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 9 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 10 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 11 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" - "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 152 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 12 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 13 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 14 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 15 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 16 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 17 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" - "vmovaps 3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastss 18 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 19 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 20 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 21 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" 
- "vbroadcastss 22 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 23 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 16 * 4, %%rax \n\t" // a += 4*16 (unroll x mr) - "addq $4 * 6 * 4, %%rbx \n\t" // b += 4*6 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".SLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 128 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 16 * 4, %%rax \n\t" // a += 1*16 (unroll x mr) - "addq $1 * 6 * 4, %%rbx \n\t" // b += 1*6 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKLEFT \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".SPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate - "vbroadcastss (%%rbx), %%ymm3 \n\t" // load beta and duplicate - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha - "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" - "vmulps %%ymm0, %%ymm7, %%ymm7 \n\t" - "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" - "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" - "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t" - "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float) - " \n\t" - "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // load address of c + 8*rs_c; - " \n\t" - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; - "leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*rs_c; - "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*rs_c; - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. 
- "vucomiss %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. - "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. - "jz .SCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".SGENSTORED: \n\t" - " \n\t" - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + //mov(%9, r15) // load address of b_next. + + add(imm(32*4), rax) + // initialize loop by pre-loading + vmovaps(mem(rax, -4*32), ymm0) + vmovaps(mem(rax, -3*32), ymm1) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) + + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c; + lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*cs_c + + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.SLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 128*4)) + + vbroadcastss(mem(rbx, 0*4), ymm2) + vbroadcastss(mem(rbx, 1*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rbx, 2*4), ymm2) + vbroadcastss(mem(rbx, 3*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rbx, 4*4), ymm2) + vbroadcastss(mem(rbx, 5*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rax, -2*32), ymm0) + vmovaps(mem(rax, -1*32), ymm1) + + // iteration 1 + vbroadcastss(mem(rbx, 6*4), ymm2) + vbroadcastss(mem(rbx, 7*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rbx, 8*4), ymm2) + vbroadcastss(mem(rbx, 9*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rbx, 10*4), ymm2) + vbroadcastss(mem(rbx, 11*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rax, 0*32), ymm0) + vmovaps(mem(rax, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 152*4)) + + vbroadcastss(mem(rbx, 12*4), ymm2) + vbroadcastss(mem(rbx, 13*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rbx, 14*4), ymm2) + vbroadcastss(mem(rbx, 15*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rbx, 16*4), ymm2) + vbroadcastss(mem(rbx, 17*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rax, 2*32), ymm0) + vmovaps(mem(rax, 3*32), ymm1) + + // iteration 3 
+ vbroadcastss(mem(rbx, 18*4), ymm2) + vbroadcastss(mem(rbx, 19*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rbx, 20*4), ymm2) + vbroadcastss(mem(rbx, 21*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rbx, 22*4), ymm2) + vbroadcastss(mem(rbx, 23*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + add(imm(4*16*4), rax) // a += 4*16 (unroll x mr) + add(imm(4*6*4), rbx) // b += 4*6 (unroll x nr) + + vmovaps(mem(rax, -4*32), ymm0) + vmovaps(mem(rax, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + + + + + + label(.SCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rax, 128*4)) + + vbroadcastss(mem(rbx, 0*4), ymm2) + vbroadcastss(mem(rbx, 1*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rbx, 2*4), ymm2) + vbroadcastss(mem(rbx, 3*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rbx, 4*4), ymm2) + vbroadcastss(mem(rbx, 5*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + add(imm(1*16*4), rax) // a += 1*16 (unroll x mr) + add(imm(1*6*4), rbx) // b += 1*6 (unroll x nr) + + vmovaps(mem(rax, -4*32), ymm0) + vmovaps(mem(rax, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + + + label(.SPOSTACCUM) + + + + + mov(%4, rax) // load address of alpha + mov(%5, rbx) // load address of beta + vbroadcastss(mem(rax), ymm0) // load alpha and duplicate + vbroadcastss(mem(rbx), ymm3) // load beta and duplicate + + vmulps(ymm0, ymm4, ymm4) // scale by alpha + vmulps(ymm0, ymm5, ymm5) + vmulps(ymm0, ymm6, ymm6) + vmulps(ymm0, ymm7, ymm7) + vmulps(ymm0, ymm8, ymm8) + vmulps(ymm0, ymm9, ymm9) + vmulps(ymm0, ymm10, ymm10) + vmulps(ymm0, ymm11, ymm11) + vmulps(ymm0, ymm12, ymm12) + vmulps(ymm0, ymm13, ymm13) + vmulps(ymm0, ymm14, ymm14) + vmulps(ymm0, ymm15, ymm15) + + + + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) + + lea(mem(rcx, rsi, 8), rdx) // load address of c + 8*rs_c; + + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; + lea(mem(rsi, rsi, 4), r15) // r15 = 5*rs_c; + lea(mem(r13, rsi, 4), r10) // r10 = 7*rs_c; + + + // now avoid loading C if beta == 0 + + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm3) // set ZF if beta == 0. + je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4. 
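+ // rsi was scaled to rs_c*sizeof(float) above, so equality with 4 means
+ // rs_c == 1, i.e. columns of C are contiguous (column-stored).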
+ jz(.SCOLSTORED) // jump to column storage case + + + + label(.SGENSTORED) + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm4, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm6, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm6, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm8, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm8, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm10, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm12, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm12, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm14, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 8*rs_c - " \n\t" - " \n\t" + //add(rdi, rcx) // c += cs_c; + + + mov(rdx, rcx) // rcx = c + 8*rs_c + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm5, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm5, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm7, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm7, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm9, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm9, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm11, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm11, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm13, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm13, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + SGEMM_INPUT_GS_BETA_NZ - "vfmadd213ps %%ymm15, %%ymm3, %%ymm0 \n\t" + vfmadd213ps(ymm15, ymm3, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm4 \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm5 \n\t" - "vmovups %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm6 \n\t" - "vmovups %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm7 \n\t" - "vmovups %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm8 \n\t" - "vmovups %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm9 \n\t" - "vmovups %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm10 \n\t" - "vmovups %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm11 \n\t" - "vmovups %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm12 \n\t" - "vmovups %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm13 \n\t" - "vmovups %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231ps (%%rcx), %%ymm3, %%ymm14 \n\t" - "vmovups %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vfmadd231ps (%%rdx), %%ymm3, %%ymm15 \n\t" - "vmovups %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SBETAZERO: \n\t" - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. - "jz .SCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".SGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm4, %%ymm0 \n\t" + //add(rdi, rcx) // c += cs_c; + + + + jmp(.SDONE) // jump to end. + + + + label(.SCOLSTORED) + + + vfmadd231ps(mem(rcx), ymm3, ymm4) + vmovups(ymm4, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm5) + vmovups(ymm5, mem(rdx)) + add(rdi, rdx) + + + vfmadd231ps(mem(rcx), ymm3, ymm6) + vmovups(ymm6, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm7) + vmovups(ymm7, mem(rdx)) + add(rdi, rdx) + + + vfmadd231ps(mem(rcx), ymm3, ymm8) + vmovups(ymm8, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm9) + vmovups(ymm9, mem(rdx)) + add(rdi, rdx) + + + vfmadd231ps(mem(rcx), ymm3, ymm10) + vmovups(ymm10, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm11) + vmovups(ymm11, mem(rdx)) + add(rdi, rdx) + + + vfmadd231ps(mem(rcx), ymm3, ymm12) + vmovups(ymm12, mem(rcx)) + add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm13) + vmovups(ymm13, mem(rdx)) + add(rdi, rdx) + + + vfmadd231ps(mem(rcx), ymm3, ymm14) + vmovups(ymm14, mem(rcx)) + //add(rdi, rcx) + vfmadd231ps(mem(rdx), ymm3, ymm15) + vmovups(ymm15, mem(rdx)) + //add(rdi, rdx) + + + + + jmp(.SDONE) // jump to end. + + + + label(.SBETAZERO) + + cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4. 
+ jz(.SCOLSTORBZ) // jump to column storage case + + + + label(.SGENSTORBZ) + + + vmovaps(ymm4, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm6, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovaps(ymm6, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm8, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovaps(ymm8, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm10, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovaps(ymm10, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm12, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovaps(ymm12, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm14, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovaps(ymm14, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 8*rs_c - " \n\t" - " \n\t" - "vmovaps %%ymm5, %%ymm0 \n\t" + //add(rdi, rcx) // c += cs_c; + + + mov(rdx, rcx) // rcx = c + 8*rs_c + + + vmovaps(ymm5, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm7, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovaps(ymm7, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm9, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovaps(ymm9, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm11, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovaps(ymm11, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm13, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovaps(ymm13, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm15, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovaps(ymm15, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovups %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SDONE: \n\t" - " \n\t" - " \n\t" + //add(rdi, rcx) // c += cs_c; + + + + jmp(.SDONE) // jump to end. 
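+ // .SCOLSTORBZ below: beta == 0 with column-stored C, so each of the six
+ // columns is written with two plain vmovups stores (rcx covers rows 0-7,
+ // rdx rows 8-15 at c + 8*rs_c).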
+ + + + label(.SCOLSTORBZ) + + + vmovups(ymm4, mem(rcx)) + add(rdi, rcx) + vmovups(ymm5, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm6, mem(rcx)) + add(rdi, rcx) + vmovups(ymm7, mem(rdx)) + add(rdi, rdx) + + + vmovups(ymm8, mem(rcx)) + add(rdi, rcx) + vmovups(ymm9, mem(rdx)) + add(rdi, rdx) + + + vmovups(ymm10, mem(rcx)) + add(rdi, rcx) + vmovups(ymm11, mem(rdx)) + add(rdi, rdx) + + + vmovups(ymm12, mem(rcx)) + add(rdi, rcx) + vmovups(ymm13, mem(rdx)) + add(rdi, rdx) + + + vmovups(ymm14, mem(rcx)) + //add(rdi, rcx) + vmovups(ymm15, mem(rdx)) + //add(rdi, rdx) + + + + + + + + label(.SDONE) + + : // output operands (none) : // input operands @@ -637,28 +639,28 @@ void bli_sgemm_zen_asm_16x6 } #define DGEMM_INPUT_GS_BETA_NZ \ - "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ - "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ - "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\ - "vmovlpd (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ - "vmovhpd (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ - "vmovlpd (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhpd (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ + vmovlpd(mem(rcx), xmm0, xmm0) \ + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ + vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) \ + vmovhpd(mem(rcx, r13, 1), xmm1, xmm1) \ + vperm2f128(imm(0x20), ymm1, ymm0, ymm0) /*\ + vmovlpd(mem(rcx, rsi, 4), xmm2, xmm2) \ + vmovhpd(mem(rcx, r15, 1), xmm2, xmm2) \ + vmovlpd(mem(rcx, r13, 2), xmm1, xmm1) \ + vmovhpd(mem(rcx, r10, 1), xmm1, xmm1) \ + vperm2f128(imm(0x20), ymm1, ymm2, ymm2)*/ #define DGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ - "vmovlpd %%xmm0, (%%rcx ) \n\t" \ - "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ - "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ - "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ + vextractf128(imm(1), ymm0, xmm1) \ + vmovlpd(xmm0, mem(rcx)) \ + vmovhpd(xmm0, mem(rcx, rsi, 1)) \ + vmovlpd(xmm1, mem(rcx, rsi, 2)) \ + vmovhpd(xmm1, mem(rcx, r13, 1)) /*\ + vextractf128(imm(1), ymm2, xmm1) \ + vmovlpd(xmm2, mem(rcx, rsi, 4)) \ + vmovhpd(xmm2, mem(rcx, r15, 1)) \ + vmovlpd(xmm1, mem(rcx, r13, 2)) \ + vmovhpd(xmm1, mem(rcx, r10, 1))*/ void bli_dgemm_zen_asm_8x6 ( @@ -684,518 +686,518 @@ void bli_dgemm_zen_asm_8x6 __asm__ volatile ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. 
- " \n\t" - "addq $32 * 4, %%rax \n\t" - " \n\t" // initialize loop by pre-loading - "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double) - " \n\t" - "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*cs_c; - "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*cs_c; - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c - "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*cs_c - "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*cs_c - "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*cs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 64 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd -2 * 32(%%rax), %%ymm0 \n\t" - "vmovapd -1 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastsd 6 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 7 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 8 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 10 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" - "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 76 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 12 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 14 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 15 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - 
"vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 16 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" - "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastsd 18 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 20 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 22 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 23 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr) - "addq $4 * 6 * 8, %%rbx \n\t" // b += 4*6 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 64 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr) - "addq $1 * 6 * 8, %%rbx \n\t" // b += 1*6 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate - "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha - "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" - "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" - "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" - "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" - "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; - " \n\t" - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; - //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*rs_c; - //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*rs_c; - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. - "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "jz .DCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORED: \n\t" - " \n\t" - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + //mov(%9, r15) // load address of b_next. + + add(imm(32*4), rax) + // initialize loop by pre-loading + vmovapd(mem(rax, -4*32), ymm0) + vmovapd(mem(rax, -3*32), ymm1) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) + + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c; + lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*cs_c + + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 64*8)) + + vbroadcastsd(mem(rbx, 0*8), ymm2) + vbroadcastsd(mem(rbx, 1*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rbx, 2*8), ymm2) + vbroadcastsd(mem(rbx, 3*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rbx, 4*8), ymm2) + vbroadcastsd(mem(rbx, 5*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rax, -2*32), ymm0) + vmovapd(mem(rax, -1*32), ymm1) + + // iteration 1 + vbroadcastsd(mem(rbx, 6*8), ymm2) + vbroadcastsd(mem(rbx, 7*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rbx, 8*8), ymm2) + vbroadcastsd(mem(rbx, 9*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rbx, 10*8), ymm2) + vbroadcastsd(mem(rbx, 11*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rax, 0*32), ymm0) + vmovapd(mem(rax, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 76*8)) + + vbroadcastsd(mem(rbx, 12*8), ymm2) + vbroadcastsd(mem(rbx, 13*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rbx, 14*8), ymm2) + vbroadcastsd(mem(rbx, 15*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rbx, 16*8), ymm2) + vbroadcastsd(mem(rbx, 17*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rax, 2*32), ymm0) + vmovapd(mem(rax, 3*32), ymm1) + + // iteration 3 + vbroadcastsd(mem(rbx, 18*8), ymm2) + vbroadcastsd(mem(rbx, 19*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rbx, 20*8), ymm2) + vbroadcastsd(mem(rbx, 21*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rbx, 22*8), ymm2) + vbroadcastsd(mem(rbx, 23*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + add(imm(4*8*8), rax) // a += 4*8 (unroll x mr) + add(imm(4*6*8), rbx) // b += 4*6 (unroll x nr) + + vmovapd(mem(rax, -4*32), ymm0) + vmovapd(mem(rax, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
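+ // edge loop: one rank-1 update per pass, same FMA body as the main loop,
+ // advancing a by mr and b by nr elements each time.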
+ + + label(.DLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rax, 64*8)) + + vbroadcastsd(mem(rbx, 0*8), ymm2) + vbroadcastsd(mem(rbx, 1*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rbx, 2*8), ymm2) + vbroadcastsd(mem(rbx, 3*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rbx, 4*8), ymm2) + vbroadcastsd(mem(rbx, 5*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + add(imm(1*8*8), rax) // a += 1*8 (unroll x mr) + add(imm(1*6*8), rbx) // b += 1*6 (unroll x nr) + + vmovapd(mem(rax, -4*32), ymm0) + vmovapd(mem(rax, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + + + + mov(%4, rax) // load address of alpha + mov(%5, rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate + + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) + vmulpd(ymm0, ymm6, ymm6) + vmulpd(ymm0, ymm7, ymm7) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm0, ymm9, ymm9) + vmulpd(ymm0, ymm10, ymm10) + vmulpd(ymm0, ymm11, ymm11) + vmulpd(ymm0, ymm12, ymm12) + vmulpd(ymm0, ymm13, ymm13) + vmulpd(ymm0, ymm14, ymm14) + vmulpd(ymm0, ymm15, ymm15) + + + + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) + + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; + + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; + //lea(mem(rsi, rsi, 4), r15) // r15 = 5*rs_c; + //lea(mem(r13, rsi, 4), r10) // r10 = 7*rs_c; + + + // now avoid loading C if beta == 0 + + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm3) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8. 
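+ // rsi holds rs_c*sizeof(double), so equality with 8 means rs_c == 1 and
+ // the contiguous column-stored path applies.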
+ jz(.DCOLSTORED) // jump to column storage case + + + + label(.DGENSTORED) + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm4, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm6, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm8, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm10, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm12, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm14, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c - " \n\t" - " \n\t" + //add(rdi, rcx) // c += cs_c; + + + mov(rdx, rcx) // rcx = c + 4*rs_c + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm5, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm7, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm9, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm11, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm13, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" + add(rdi, rcx) // c += cs_c; + + DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" + vfmadd213pd(ymm15, ymm3, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t" - "vmovupd %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t" - "vmovupd %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t" - "vmovupd %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t" - "vmovupd %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t" - "vmovupd %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t" - "vmovupd %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t" - "vmovupd %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t" - "vmovupd %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t" - "vmovupd %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t" - "vmovupd %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t" - "vmovupd %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t" - "vmovupd %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DBETAZERO: \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "jz .DCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm4, %%ymm0 \n\t" + //add(rdi, rcx) // c += cs_c; + + + + jmp(.DDONE) // jump to end. + + + + label(.DCOLSTORED) + + + vfmadd231pd(mem(rcx), ymm3, ymm4) + vmovupd(ymm4, mem(rcx)) + add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm5) + vmovupd(ymm5, mem(rdx)) + add(rdi, rdx) + + + vfmadd231pd(mem(rcx), ymm3, ymm6) + vmovupd(ymm6, mem(rcx)) + add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm7) + vmovupd(ymm7, mem(rdx)) + add(rdi, rdx) + + + vfmadd231pd(mem(rcx), ymm3, ymm8) + vmovupd(ymm8, mem(rcx)) + add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm9) + vmovupd(ymm9, mem(rdx)) + add(rdi, rdx) + + + vfmadd231pd(mem(rcx), ymm3, ymm10) + vmovupd(ymm10, mem(rcx)) + add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm11) + vmovupd(ymm11, mem(rdx)) + add(rdi, rdx) + + + vfmadd231pd(mem(rcx), ymm3, ymm12) + vmovupd(ymm12, mem(rcx)) + add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm13) + vmovupd(ymm13, mem(rdx)) + add(rdi, rdx) + + + vfmadd231pd(mem(rcx), ymm3, ymm14) + vmovupd(ymm14, mem(rcx)) + //add(rdi, rcx) + vfmadd231pd(mem(rdx), ymm3, ymm15) + vmovupd(ymm15, mem(rdx)) + //add(rdi, rdx) + + + + jmp(.DDONE) // jump to end. + + + + label(.DBETAZERO) + + cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case + + + + label(.DGENSTORBZ) + + + vmovapd(ymm4, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm6, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovapd(ymm6, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm8, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovapd(ymm8, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovapd(ymm10, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovapd(ymm12, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovapd(ymm14, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c - " \n\t" - " \n\t" - "vmovapd %%ymm5, %%ymm0 \n\t" + //add(rdi, rcx) // c += cs_c; + + + mov(rdx, rcx) // rcx = c + 4*rs_c + + + vmovapd(ymm5, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm7, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovapd(ymm7, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm9, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovapd(ymm9, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovapd(ymm11, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm13, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovapd(ymm13, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm15, %%ymm0 \n\t" + add(rdi, rcx) // c += cs_c; + + + vmovapd(ymm15, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DDONE: \n\t" - " \n\t" - " \n\t" + //add(rdi, rcx) // c += cs_c; + + + + jmp(.DDONE) // jump to end. 
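+ // .DCOLSTORBZ below: each column of the 8x6 tile is stored with two
+ // 4-double vmovupd writes (rcx covers rows 0-3, rdx rows 4-7 at c + 4*rs_c).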
+ + + + label(.DCOLSTORBZ) + + + vmovupd(ymm4, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm5, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm6, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm7, mem(rdx)) + add(rdi, rdx) + + + vmovupd(ymm8, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm9, mem(rdx)) + add(rdi, rdx) + + + vmovupd(ymm10, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm11, mem(rdx)) + add(rdi, rdx) + + + vmovupd(ymm12, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm13, mem(rdx)) + add(rdi, rdx) + + + vmovupd(ymm14, mem(rcx)) + //add(rdi, rcx) + vmovupd(ymm15, mem(rdx)) + //add(rdi, rdx) + + + + + + + + label(.DDONE) + + : // output operands (none) : // input operands @@ -1227,33 +1229,33 @@ void bli_dgemm_zen_asm_8x6 // assumes beta.r, beta.i have been broadcast into ymm1, ymm2. // outputs to ymm0 #define CGEMM_INPUT_SCALE_GS_BETA_NZ \ - "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ - "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ - "vmovlpd (%%rcx,%%rsi,2), %%xmm3, %%xmm3 \n\t" \ - "vmovhpd (%%rcx,%%r13 ), %%xmm3, %%xmm3 \n\t" \ - "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ - "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ - "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ - "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ - "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" + vmovlpd(mem(rcx), xmm0, xmm0) \ + vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ + vmovlpd(mem(rcx, rsi, 2), xmm3, xmm3) \ + vmovhpd(mem(rcx, r13, 1), xmm3, xmm3) \ + vinsertf128(imm(1), xmm3, ymm0, ymm0) \ + vpermilps(imm(0xb1), ymm0, ymm3) \ + vmulps(ymm1, ymm0, ymm0) \ + vmulps(ymm2, ymm3, ymm3) \ + vaddsubps(ymm3, ymm0, ymm0) // assumes values to output are in ymm0 #define CGEMM_OUTPUT_GS \ - "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ - "vmovlpd %%xmm0, (%%rcx ) \n\t" \ - "vmovhpd %%xmm0, (%%rcx,%%rsi,1) \n\t" \ - "vmovlpd %%xmm3, (%%rcx,%%rsi,2) \n\t" \ - "vmovhpd %%xmm3, (%%rcx,%%r13 ) \n\t" + vextractf128(imm(1), ymm0, xmm3) \ + vmovlpd(xmm0, mem(rcx)) \ + vmovhpd(xmm0, mem(rcx, rsi, 1)) \ + vmovlpd(xmm3, mem(rcx, rsi, 2)) \ + vmovhpd(xmm3, mem(rcx, r13, 1)) #define CGEMM_INPUT_SCALE_CS_BETA_NZ \ - "vmovups (%%rcx), %%ymm0 \n\t" \ - "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ - "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ - "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ - "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" + vmovups(mem(rcx), ymm0) \ + vpermilps(imm(0xb1), ymm0, ymm3) \ + vmulps(ymm1, ymm0, ymm0) \ + vmulps(ymm2, ymm3, ymm3) \ + vaddsubps(ymm3, ymm0, ymm0) #define CGEMM_OUTPUT_CS \ - "vmovups %%ymm0, (%%rcx) \n\t" \ + vmovups(ymm0, mem(rcx)) \ void bli_cgemm_zen_asm_8x3 ( @@ -1279,455 +1281,455 @@ void bli_cgemm_zen_asm_8x3 __asm__ volatile ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. 
- " \n\t" - "addq $32 * 4, %%rax \n\t" - " \n\t" // initialize loop by pre-loading - "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(scomplex) - " \n\t" - "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*cs_c; - "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*cs_c; - " \n\t" - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*cs_c - "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*cs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".CLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 32 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps -2 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -1 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastss 6 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 7 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 8 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 9 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 10 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 11 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" - "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 38 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastss 12 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 13 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 14 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 15 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 16 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 17 * 4(%%rbx), %%ymm3 \n\t" - 
"vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" - "vmovaps 3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastss 18 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 19 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 20 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 21 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 22 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 23 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr) - "addq $4 * 3 * 8, %%rbx \n\t" // b += 4*3 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .CLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".CCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".CLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 32 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rbx), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rbx), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr) - "addq $1 * 3 * 8, %%rbx \n\t" // b += 1*3 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .CLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".CPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" // permute even and odd elements - " \n\t" // of ymm6/7, ymm10/11, ymm/14/15 - "vpermilps $0xb1, %%ymm6, %%ymm6 \n\t" - "vpermilps $0xb1, %%ymm7, %%ymm7 \n\t" - "vpermilps $0xb1, %%ymm10, %%ymm10 \n\t" - "vpermilps $0xb1, %%ymm11, %%ymm11 \n\t" - "vpermilps $0xb1, %%ymm14, %%ymm14 \n\t" - "vpermilps $0xb1, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" // subtract/add even/odd elements - "vaddsubps %%ymm6, %%ymm4, %%ymm4 \n\t" - "vaddsubps %%ymm7, %%ymm5, %%ymm5 \n\t" - " \n\t" - "vaddsubps %%ymm10, %%ymm8, %%ymm8 \n\t" - "vaddsubps %%ymm11, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vaddsubps %%ymm14, %%ymm12, %%ymm12 \n\t" - "vaddsubps %%ymm15, %%ymm13, %%ymm13 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate - "vbroadcastss 4(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate - " \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm4, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm4, %%ymm4 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm5, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm5, %%ymm5 \n\t" - " \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm8, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm9, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm9, %%ymm9 \n\t" - " \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm12, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vpermilps $0xb1, %%ymm13, %%ymm3 \n\t" - "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubps %%ymm3, %%ymm13, %%ymm13 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastss (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate - "vbroadcastss 4(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(scomplex) - "leaq (,%%rsi,4), %%rdx \n\t" // rdx = 4*rs_c; - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomiss %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. - "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); - "vucomiss %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. - "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); - "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. - "jne .CBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "jz .CCOLSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".CGENSTORED: \n\t" - " \n\t" - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + //mov(%9, r15) // load address of b_next. 
+ + add(imm(32*4), rax) + // initialize loop by pre-loading + vmovaps(mem(rax, -4*32), ymm0) + vmovaps(mem(rax, -3*32), ymm1) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) + + lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*cs_c; + lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*cs_c; + + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r11, 7*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, 7*8)) // prefetch c + 2*cs_c + + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.CCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.CLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 32*8)) + + vbroadcastss(mem(rbx, 0*4), ymm2) + vbroadcastss(mem(rbx, 1*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rbx, 2*4), ymm2) + vbroadcastss(mem(rbx, 3*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rbx, 4*4), ymm2) + vbroadcastss(mem(rbx, 5*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rax, -2*32), ymm0) + vmovaps(mem(rax, -1*32), ymm1) + + // iteration 1 + vbroadcastss(mem(rbx, 6*4), ymm2) + vbroadcastss(mem(rbx, 7*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rbx, 8*4), ymm2) + vbroadcastss(mem(rbx, 9*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rbx, 10*4), ymm2) + vbroadcastss(mem(rbx, 11*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rax, 0*32), ymm0) + vmovaps(mem(rax, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 38*8)) + + vbroadcastss(mem(rbx, 12*4), ymm2) + vbroadcastss(mem(rbx, 13*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rbx, 14*4), ymm2) + vbroadcastss(mem(rbx, 15*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rbx, 16*4), ymm2) + vbroadcastss(mem(rbx, 17*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rax, 2*32), ymm0) + vmovaps(mem(rax, 3*32), ymm1) + + // iteration 3 + vbroadcastss(mem(rbx, 18*4), ymm2) + vbroadcastss(mem(rbx, 19*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rbx, 20*4), ymm2) + vbroadcastss(mem(rbx, 21*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rbx, 22*4), ymm2) + vbroadcastss(mem(rbx, 23*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + add(imm(4*8*8), rax) // a += 4*8 (unroll x 
mr) + add(imm(4*3*8), rbx) // b += 4*3 (unroll x nr) + + vmovaps(mem(rax, -4*32), ymm0) + vmovaps(mem(rax, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.CLOOPKITER) // iterate again if i != 0. + + + + + + + label(.CCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.CPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.CLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rax, 32*8)) + + vbroadcastss(mem(rbx, 0*4), ymm2) + vbroadcastss(mem(rbx, 1*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rbx, 2*4), ymm2) + vbroadcastss(mem(rbx, 3*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rbx, 4*4), ymm2) + vbroadcastss(mem(rbx, 5*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + add(imm(1*8*8), rax) // a += 1*8 (unroll x mr) + add(imm(1*3*8), rbx) // b += 1*3 (unroll x nr) + + vmovaps(mem(rax, -4*32), ymm0) + vmovaps(mem(rax, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.CLOOPKLEFT) // iterate again if i != 0. + + + + label(.CPOSTACCUM) + + + // permute even and odd elements + // of ymm6/7, ymm10/11, ymm/14/15 + vpermilps(imm(0xb1), ymm6, ymm6) + vpermilps(imm(0xb1), ymm7, ymm7) + vpermilps(imm(0xb1), ymm10, ymm10) + vpermilps(imm(0xb1), ymm11, ymm11) + vpermilps(imm(0xb1), ymm14, ymm14) + vpermilps(imm(0xb1), ymm15, ymm15) + + + // subtract/add even/odd elements + vaddsubps(ymm6, ymm4, ymm4) + vaddsubps(ymm7, ymm5, ymm5) + + vaddsubps(ymm10, ymm8, ymm8) + vaddsubps(ymm11, ymm9, ymm9) + + vaddsubps(ymm14, ymm12, ymm12) + vaddsubps(ymm15, ymm13, ymm13) + + + + + mov(%4, rax) // load address of alpha + vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate + vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate + + + vpermilps(imm(0xb1), ymm4, ymm3) + vmulps(ymm0, ymm4, ymm4) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm4, ymm4) + + vpermilps(imm(0xb1), ymm5, ymm3) + vmulps(ymm0, ymm5, ymm5) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm5, ymm5) + + + vpermilps(imm(0xb1), ymm8, ymm3) + vmulps(ymm0, ymm8, ymm8) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm8, ymm8) + + vpermilps(imm(0xb1), ymm9, ymm3) + vmulps(ymm0, ymm9, ymm9) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm9, ymm9) + + + vpermilps(imm(0xb1), ymm12, ymm3) + vmulps(ymm0, ymm12, ymm12) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm12, ymm12) + + vpermilps(imm(0xb1), ymm13, ymm3) + vmulps(ymm0, ymm13, ymm13) + vmulps(ymm1, ymm3, ymm3) + vaddsubps(ymm3, ymm13, ymm13) + + + + + + mov(%5, rbx) // load address of beta + vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate + vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate + + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex) + lea(mem(, rsi, 4), rdx) // rdx = 4*rs_c; + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; + + + + // now avoid loading C if beta == 0 + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. + sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); + vucomiss(xmm0, xmm2) // set ZF if beta_i == 0. + sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); + and(r8b, r9b) // set ZF if r8b & r9b == 1. 
+ jne(.CBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. + jz(.CCOLSTORED) // jump to row storage case + + + + label(.CGENSTORED) + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" + vaddps(ymm4, ymm0, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*rs_c; + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" + vaddps(ymm5, ymm0, ymm0) CGEMM_OUTPUT_GS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c - " \n\t" - " \n\t" - " \n\t" + mov(r11, rcx) // rcx = c + 1*cs_c + + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" + vaddps(ymm8, ymm0, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*rs_c; + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" + vaddps(ymm9, ymm0, ymm0) CGEMM_OUTPUT_GS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c - " \n\t" - " \n\t" - " \n\t" + mov(r12, rcx) // rcx = c + 2*cs_c + + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + vaddps(ymm12, ymm0, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*rs_c; + + CGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + vaddps(ymm13, ymm0, ymm0) CGEMM_OUTPUT_GS - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CCOLSTORED: \n\t" - " \n\t" - " \n\t" + + + + jmp(.CDONE) // jump to end. + + + + label(.CCOLSTORED) + + CGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" + vaddps(ymm4, ymm0, ymm0) CGEMM_OUTPUT_CS - "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*rs_c; + + CGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" + vaddps(ymm5, ymm0, ymm0) CGEMM_OUTPUT_CS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c - " \n\t" - " \n\t" - " \n\t" + mov(r11, rcx) // rcx = c + 1*cs_c + + + CGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" + vaddps(ymm8, ymm0, ymm0) CGEMM_OUTPUT_CS - "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*rs_c; + + CGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" + vaddps(ymm9, ymm0, ymm0) CGEMM_OUTPUT_CS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c - " \n\t" - " \n\t" - " \n\t" + mov(r12, rcx) // rcx = c + 2*cs_c + + + CGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + vaddps(ymm12, ymm0, ymm0) CGEMM_OUTPUT_CS - "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 4*rs_c; + + CGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + vaddps(ymm13, ymm0, ymm0) CGEMM_OUTPUT_CS - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CBETAZERO: \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "jz .CCOLSTORBZ \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".CGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm4, %%ymm0 \n\t" + + + + jmp(.CDONE) // jump to end. + + + + label(.CBETAZERO) + + cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8. 
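The flag juggling above is terse enough to deserve a gloss: vucomiss sets ZF when the compared beta component is zero, the two sete bytes are ANDed, and jne is taken when that byte is nonzero, i.e. when both components were zero, so .CBETAZERO is reached exactly when beta == 0 and C is never loaded. The cmp that follows tests rsi, already scaled to rs_c * sizeof(scomplex), against 8, i.e. rs_c == 1. A scalar model with hypothetical names:

    typedef struct { float real, imag; } scomplex_s; /* hypothetical local type */

    static int beta_case_model( const scomplex_s* beta, long rs_c )
    {
        /* vucomiss + sete per component, then and + jne on the flag bytes: */
        int beta_is_zero = ( beta->real == 0.0f ) && ( beta->imag == 0.0f );

        if ( beta_is_zero ) return 2;  /* .CBETAZERO: skip reading C          */
        if ( rs_c == 1    ) return 1;  /* .CCOLSTORED: columns are contiguous */
        return 0;                      /* .CGENSTORED: general-stride path    */
    }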
+ jz(.CCOLSTORBZ) // jump to row storage case + + + + label(.CGENSTORBZ) + + + vmovaps(ymm4, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm5, %%ymm0 \n\t" + add(rdx, rcx) // c += 4*rs_c; + + + vmovaps(ymm5, ymm0) CGEMM_OUTPUT_GS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c - " \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm8, %%ymm0 \n\t" + mov(r11, rcx) // rcx = c + 1*cs_c + + + + vmovaps(ymm8, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm9, %%ymm0 \n\t" + add(rdx, rcx) // c += 4*rs_c; + + + vmovaps(ymm9, ymm0) CGEMM_OUTPUT_GS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c - " \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm12, %%ymm0 \n\t" + mov(r12, rcx) // rcx = c + 2*cs_c + + + + vmovaps(ymm12, ymm0) CGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm13, %%ymm0 \n\t" + add(rdx, rcx) // c += 4*rs_c; + + + vmovaps(ymm13, ymm0) CGEMM_OUTPUT_GS - " \n\t" - " \n\t" - " \n\t" - "jmp .CDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".CCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" - "vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t" - " \n\t" - "vmovups %%ymm8, (%%r11) \n\t" - "vmovups %%ymm9, (%%r11,%%rdx,1) \n\t" - " \n\t" - "vmovups %%ymm12, (%%r12) \n\t" - "vmovups %%ymm13, (%%r12,%%rdx,1) \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".CDONE: \n\t" - " \n\t" - " \n\t" + + + + jmp(.CDONE) // jump to end. + + + + label(.CCOLSTORBZ) + + + vmovups(ymm4, mem(rcx)) + vmovups(ymm5, mem(rcx, rdx, 1)) + + vmovups(ymm8, mem(r11)) + vmovups(ymm9, mem(r11, rdx, 1)) + + vmovups(ymm12, mem(r12)) + vmovups(ymm13, mem(r12, rdx, 1)) + + + + + + + label(.CDONE) + + : // output operands (none) : // input operands @@ -1759,29 +1761,29 @@ void bli_cgemm_zen_asm_8x3 // assumes beta.r, beta.i have been broadcast into ymm1, ymm2. // outputs to ymm0 #define ZGEMM_INPUT_SCALE_GS_BETA_NZ \ - "vmovupd (%%rcx), %%xmm0 \n\t" \ - "vmovupd (%%rcx,%%rsi), %%xmm3 \n\t" \ - "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ - "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ - "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ - "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ - "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" + vmovupd(mem(rcx), xmm0) \ + vmovupd(mem(rcx, rsi, 1), xmm3) \ + vinsertf128(imm(1), xmm3, ymm0, ymm0) \ + vpermilpd(imm(0x5), ymm0, ymm3) \ + vmulpd(ymm1, ymm0, ymm0) \ + vmulpd(ymm2, ymm3, ymm3) \ + vaddsubpd(ymm3, ymm0, ymm0) // assumes values to output are in ymm0 #define ZGEMM_OUTPUT_GS \ - "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ - "vmovupd %%xmm0, (%%rcx) \n\t" \ - "vmovupd %%xmm3, (%%rcx,%%rsi ) \n\t" \ + vextractf128(imm(1), ymm0, xmm3) \ + vmovupd(xmm0, mem(rcx)) \ + vmovupd(xmm3, mem(rcx, rsi, 1)) \ #define ZGEMM_INPUT_SCALE_CS_BETA_NZ \ - "vmovups (%%rcx), %%ymm0 \n\t" \ - "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ - "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ - "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ - "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" + vmovups(mem(rcx), ymm0) \ + vpermilpd(imm(0x5), ymm0, ymm3) \ + vmulpd(ymm1, ymm0, ymm0) \ + vmulpd(ymm2, ymm3, ymm3) \ + vaddsubpd(ymm3, ymm0, ymm0) #define ZGEMM_OUTPUT_CS \ - "vmovupd %%ymm0, (%%rcx) \n\t" \ + vmovupd(ymm0, mem(rcx)) \ void bli_zgemm_zen_asm_4x3 ( @@ -1807,455 +1809,455 @@ void bli_zgemm_zen_asm_4x3 __asm__ volatile ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. 
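The ZGEMM macros converted just above follow the cgemm pattern, with one twist in ZGEMM_OUTPUT_GS: a ymm register holds only two dcomplex values, so vextractf128 splits it and the halves land rs_c apart. A scalar model, with a hypothetical type name:

    typedef struct { double re, im; } dcomplex_s;   /* hypothetical local type */

    /* Model of ZGEMM_OUTPUT_GS: ymm0 = { y[0], y[1] }, stored rs_c apart. */
    static void zgemm_output_gs_model( const dcomplex_s y[2],
                                       dcomplex_s* c, long rs_c )
    {
        c[ 0    ] = y[0];   /* vmovupd(xmm0, mem(rcx))         */
        c[ rs_c ] = y[1];   /* vmovupd(xmm3, mem(rcx, rsi, 1)) */
    }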
- //"movq %9, %%r15 \n\t" // load address of b_next. - " \n\t" - "addq $32 * 4, %%rax \n\t" - " \n\t" // initialize loop by pre-loading - "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(dcomplex) - "leaq (,%%rdi,2), %%rdi \n\t" - " \n\t" - "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*cs_c; - "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*cs_c; - " \n\t" - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*cs_c - "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*cs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .ZCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".ZLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 32 * 16(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd -2 * 32(%%rax), %%ymm0 \n\t" - "vmovapd -1 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastsd 6 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 7 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 8 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 10 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" - "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 38 * 16(%%rax) \n\t" - " \n\t" - "vbroadcastsd 12 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 14 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 15 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - 
"vbroadcastsd 16 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" - "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastsd 18 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 20 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 22 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 23 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 4 * 16, %%rax \n\t" // a += 4*4 (unroll x mr) - "addq $4 * 3 * 16, %%rbx \n\t" // b += 4*3 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .ZLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".ZCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .ZPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".ZLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 32 * 16(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 4 * 16, %%rax \n\t" // a += 1*4 (unroll x mr) - "addq $1 * 3 * 16, %%rbx \n\t" // b += 1*3 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .ZLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".ZPOSTACCUM: \n\t" - " \n\t" - " \n\t" // permute even and odd elements - " \n\t" // of ymm6/7, ymm10/11, ymm/14/15 - "vpermilpd $0x5, %%ymm6, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm7, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm10, %%ymm10 \n\t" - "vpermilpd $0x5, %%ymm11, %%ymm11 \n\t" - "vpermilpd $0x5, %%ymm14, %%ymm14 \n\t" - "vpermilpd $0x5, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" // subtract/add even/odd elements - "vaddsubpd %%ymm6, %%ymm4, %%ymm4 \n\t" - "vaddsubpd %%ymm7, %%ymm5, %%ymm5 \n\t" - " \n\t" - "vaddsubpd %%ymm10, %%ymm8, %%ymm8 \n\t" - "vaddsubpd %%ymm11, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vaddsubpd %%ymm14, %%ymm12, %%ymm12 \n\t" - "vaddsubpd %%ymm15, %%ymm13, %%ymm13 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate - "vbroadcastsd 8(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate - " \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm4, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm4, %%ymm4 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm5, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm5, %%ymm5 \n\t" - " \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm8, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm9, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm9, %%ymm9 \n\t" - " \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm12, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm12, %%ymm12 \n\t" - " \n\t" - "vpermilpd $0x5, %%ymm13, %%ymm3 \n\t" - "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" - "vaddsubpd %%ymm3, %%ymm13, %%ymm13 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastsd (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate - "vbroadcastsd 8(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(dcomplex) - "leaq (,%%rsi,2), %%rsi \n\t" - "leaq (,%%rsi,2), %%rdx \n\t" // rdx = 2*rs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomisd %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. - "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); - "vucomisd %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. - "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); - "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. - "jne .ZBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. - "jz .ZCOLSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".ZGENSTORED: \n\t" - " \n\t" - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + //mov(%9, r15) // load address of b_next. 
+ + add(imm(32*4), rax) + // initialize loop by pre-loading + vmovapd(mem(rax, -4*32), ymm0) + vmovapd(mem(rax, -3*32), ymm1) + + mov(%6, rcx) // load address of c + mov(%8, rdi) // load cs_c + lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) + lea(mem(, rdi, 2), rdi) + + lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*cs_c; + lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*cs_c; + + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c + prefetch(0, mem(r11, 7*8)) // prefetch c + 1*cs_c + prefetch(0, mem(r12, 7*8)) // prefetch c + 2*cs_c + + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.ZCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. + + + label(.ZLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 32*16)) + + vbroadcastsd(mem(rbx, 0*8), ymm2) + vbroadcastsd(mem(rbx, 1*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rbx, 2*8), ymm2) + vbroadcastsd(mem(rbx, 3*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rbx, 4*8), ymm2) + vbroadcastsd(mem(rbx, 5*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rax, -2*32), ymm0) + vmovapd(mem(rax, -1*32), ymm1) + + // iteration 1 + vbroadcastsd(mem(rbx, 6*8), ymm2) + vbroadcastsd(mem(rbx, 7*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rbx, 8*8), ymm2) + vbroadcastsd(mem(rbx, 9*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rbx, 10*8), ymm2) + vbroadcastsd(mem(rbx, 11*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rax, 0*32), ymm0) + vmovapd(mem(rax, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 38*16)) + + vbroadcastsd(mem(rbx, 12*8), ymm2) + vbroadcastsd(mem(rbx, 13*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rbx, 14*8), ymm2) + vbroadcastsd(mem(rbx, 15*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rbx, 16*8), ymm2) + vbroadcastsd(mem(rbx, 17*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rax, 2*32), ymm0) + vmovapd(mem(rax, 3*32), ymm1) + + // iteration 3 + vbroadcastsd(mem(rbx, 18*8), ymm2) + vbroadcastsd(mem(rbx, 19*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rbx, 20*8), ymm2) + vbroadcastsd(mem(rbx, 21*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rbx, 22*8), ymm2) + vbroadcastsd(mem(rbx, 23*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + add(imm(4*4*16), 
rax) // a += 4*4 (unroll x mr) + add(imm(4*3*16), rbx) // b += 4*3 (unroll x nr) + + vmovapd(mem(rax, -4*32), ymm0) + vmovapd(mem(rax, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.ZLOOPKITER) // iterate again if i != 0. + + + + + + + label(.ZCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. + + + label(.ZLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rax, 32*16)) + + vbroadcastsd(mem(rbx, 0*8), ymm2) + vbroadcastsd(mem(rbx, 1*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rbx, 2*8), ymm2) + vbroadcastsd(mem(rbx, 3*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rbx, 4*8), ymm2) + vbroadcastsd(mem(rbx, 5*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + add(imm(1*4*16), rax) // a += 1*4 (unroll x mr) + add(imm(1*3*16), rbx) // b += 1*3 (unroll x nr) + + vmovapd(mem(rax, -4*32), ymm0) + vmovapd(mem(rax, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.ZLOOPKLEFT) // iterate again if i != 0. + + + + label(.ZPOSTACCUM) + + // permute even and odd elements + // of ymm6/7, ymm10/11, ymm/14/15 + vpermilpd(imm(0x5), ymm6, ymm6) + vpermilpd(imm(0x5), ymm7, ymm7) + vpermilpd(imm(0x5), ymm10, ymm10) + vpermilpd(imm(0x5), ymm11, ymm11) + vpermilpd(imm(0x5), ymm14, ymm14) + vpermilpd(imm(0x5), ymm15, ymm15) + + + // subtract/add even/odd elements + vaddsubpd(ymm6, ymm4, ymm4) + vaddsubpd(ymm7, ymm5, ymm5) + + vaddsubpd(ymm10, ymm8, ymm8) + vaddsubpd(ymm11, ymm9, ymm9) + + vaddsubpd(ymm14, ymm12, ymm12) + vaddsubpd(ymm15, ymm13, ymm13) + + + + + mov(%4, rax) // load address of alpha + vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate + vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate + + + vpermilpd(imm(0x5), ymm4, ymm3) + vmulpd(ymm0, ymm4, ymm4) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm4, ymm4) + + vpermilpd(imm(0x5), ymm5, ymm3) + vmulpd(ymm0, ymm5, ymm5) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm5, ymm5) + + + vpermilpd(imm(0x5), ymm8, ymm3) + vmulpd(ymm0, ymm8, ymm8) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm8, ymm8) + + vpermilpd(imm(0x5), ymm9, ymm3) + vmulpd(ymm0, ymm9, ymm9) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm9, ymm9) + + + vpermilpd(imm(0x5), ymm12, ymm3) + vmulpd(ymm0, ymm12, ymm12) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm12, ymm12) + + vpermilpd(imm(0x5), ymm13, ymm3) + vmulpd(ymm0, ymm13, ymm13) + vmulpd(ymm1, ymm3, ymm3) + vaddsubpd(ymm3, ymm13, ymm13) + + + + + + mov(%5, rbx) // load address of beta + vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate + vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate + + + + + mov(%7, rsi) // load rs_c + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) + lea(mem(, rsi, 2), rsi) + lea(mem(, rsi, 2), rdx) // rdx = 2*rs_c; + + + + // now avoid loading C if beta == 0 + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm1) // set ZF if beta_r == 0. + sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); + vucomisd(xmm0, xmm2) // set ZF if beta_i == 0. + sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); + and(r8b, r9b) // set ZF if r8b & r9b == 1. 
+ jne(.ZBETAZERO) // if ZF = 1, jump to beta == 0 case + + + cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16. + jz(.ZCOLSTORED) // jump to row storage case + + + + label(.ZGENSTORED) + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm4, ymm0, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*rs_c; + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm5, ymm0, ymm0) ZGEMM_OUTPUT_GS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c - " \n\t" - " \n\t" - " \n\t" + mov(r11, rcx) // rcx = c + 1*cs_c + + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm8, ymm0, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*rs_c; + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm9, ymm0, ymm0) ZGEMM_OUTPUT_GS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c - " \n\t" - " \n\t" - " \n\t" + mov(r12, rcx) // rcx = c + 2*cs_c + + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm12, ymm0, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*rs_c; + + ZGEMM_INPUT_SCALE_GS_BETA_NZ - "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm13, ymm0, ymm0) ZGEMM_OUTPUT_GS - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZCOLSTORED: \n\t" - " \n\t" - " \n\t" + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZCOLSTORED) + + ZGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm4, ymm0, ymm0) ZGEMM_OUTPUT_CS - "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*rs_c; + + ZGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm5, ymm0, ymm0) ZGEMM_OUTPUT_CS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c - " \n\t" - " \n\t" - " \n\t" + mov(r11, rcx) // rcx = c + 1*cs_c + + + ZGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm8, ymm0, ymm0) ZGEMM_OUTPUT_CS - "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*rs_c; + + ZGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm9, ymm0, ymm0) ZGEMM_OUTPUT_CS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c - " \n\t" - " \n\t" - " \n\t" + mov(r12, rcx) // rcx = c + 2*cs_c + + + ZGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm12, ymm0, ymm0) ZGEMM_OUTPUT_CS - "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; - " \n\t" - " \n\t" + add(rdx, rcx) // c += 2*rs_c; + + ZGEMM_INPUT_SCALE_CS_BETA_NZ - "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" + vaddpd(ymm13, ymm0, ymm0) ZGEMM_OUTPUT_CS - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZBETAZERO: \n\t" - " \n\t" - "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. - "jz .ZCOLSTORBZ \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".ZGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm4, %%ymm0 \n\t" + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZBETAZERO) + + cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16. 
+ jz(.ZCOLSTORBZ) // jump to row storage case + + + + label(.ZGENSTORBZ) + + + vmovapd(ymm4, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm5, %%ymm0 \n\t" + add(rdx, rcx) // c += 2*rs_c; + + + vmovapd(ymm5, ymm0) ZGEMM_OUTPUT_GS - "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c - " \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm8, %%ymm0 \n\t" + mov(r11, rcx) // rcx = c + 1*cs_c + + + + vmovapd(ymm8, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm9, %%ymm0 \n\t" + add(rdx, rcx) // c += 2*rs_c; + + + vmovapd(ymm9, ymm0) ZGEMM_OUTPUT_GS - "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c - " \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm0 \n\t" + mov(r12, rcx) // rcx = c + 2*cs_c + + + + vmovapd(ymm12, ymm0) ZGEMM_OUTPUT_GS - "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm13, %%ymm0 \n\t" + add(rdx, rcx) // c += 2*rs_c; + + + vmovapd(ymm13, ymm0) ZGEMM_OUTPUT_GS - " \n\t" - " \n\t" - " \n\t" - "jmp .ZDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".ZCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm4, (%%rcx) \n\t" - "vmovupd %%ymm5, (%%rcx,%%rdx,1) \n\t" - " \n\t" - "vmovupd %%ymm8, (%%r11) \n\t" - "vmovupd %%ymm9, (%%r11,%%rdx,1) \n\t" - " \n\t" - "vmovupd %%ymm12, (%%r12) \n\t" - "vmovupd %%ymm13, (%%r12,%%rdx,1) \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".ZDONE: \n\t" - " \n\t" - " \n\t" + + + + jmp(.ZDONE) // jump to end. + + + + label(.ZCOLSTORBZ) + + + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm5, mem(rcx, rdx, 1)) + + vmovupd(ymm8, mem(r11)) + vmovupd(ymm9, mem(r11, rdx, 1)) + + vmovupd(ymm12, mem(r12)) + vmovupd(ymm13, mem(r12, rdx, 1)) + + + + + + + label(.ZDONE) + + : // output operands (none) : // input operands @@ -2281,3 +2283,4 @@ void bli_zgemm_zen_asm_4x3 ); } + diff --git a/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c b/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c index 900c041b6..f8717384c 100644 --- a/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c +++ b/kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c @@ -35,22 +35,25 @@ #include "blis.h" +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + #define SGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" \ - "vmovss %%xmm0, (%%rcx ) \n\t" \ - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%rsi,1) \n\t" \ - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" \ - "vmovss %%xmm0, (%%rcx,%%rsi,2) \n\t" \ - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r13 ) \n\t" \ - "vmovss %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r15 ) \n\t" \ - "vpermilps $0x39, %%xmm1, %%xmm2 \n\t" \ - "vmovss %%xmm2, (%%rcx,%%r13,2) \n\t" \ - "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r10 ) \n\t" + vextractf128(imm(1), ymm0, xmm2) \ + vmovss(xmm0, mem(rcx)) \ + vpermilps(imm(0x39), xmm0, xmm1) \ + vmovss(xmm1, mem(rcx, rsi, 1)) \ + vpermilps(imm(0x39), xmm1, xmm0) \ + vmovss(xmm0, mem(rcx, rsi, 2)) \ + vpermilps(imm(0x39), xmm0, xmm1) \ + vmovss(xmm1, mem(rcx, r13, 1)) \ + vmovss(xmm2, mem(rcx, rsi, 4)) \ + vpermilps(imm(0x39), xmm2, xmm1) \ + vmovss(xmm1, mem(rcx, r15, 1)) \ + vpermilps(imm(0x39), xmm1, xmm2) \ + vmovss(xmm2, mem(rcx, r13, 2)) \ + vpermilps(imm(0x39), xmm2, xmm1) \ + vmovss(xmm1, mem(rcx, r10, 1)) void bli_sgemmtrsm_l_zen_asm_6x16 @@ -80,694 +83,694 @@ void bli_sgemmtrsm_l_zen_asm_6x16 __asm__ volatile ( - " \n\t" - "vzeroall \n\t" // zero all 
xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - " \n\t" - "addq $32 * 4, %%rbx \n\t" - " \n\t" // initialize loop by pre-loading - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - "movq %7, %%rcx \n\t" // load address of b11 - "movq $16, %%rdi \n\t" // set rs_b = PACKNR = 16 - "leaq (,%%rdi,4), %%rdi \n\t" // rs_b *= sizeof(float) - " \n\t" - " \n\t" // NOTE: c11, rs_c, and cs_c aren't - " \n\t" // needed for a while, but we load - " \n\t" // them now to avoid stalling later. - "movq %8, %%r8 \n\t" // load address of c11 - "movq %9, %%r9 \n\t" // load rs_c - "leaq (,%%r9 ,4), %%r9 \n\t" // rs_c *= sizeof(float) - "movq %10, %%r10 \n\t" // load cs_c - "leaq (,%%r10,4), %%r10 \n\t" // cs_c *= sizeof(float) - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".SLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 64 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastss 6 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 7 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 8 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 10 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 76 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 12 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 14 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, 
%%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 16 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 17 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastss 18 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 19 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 20 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 21 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 22 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 23 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 6 * 4, %%rax \n\t" // a += 4*6 (unroll x mr) - "addq $4 * 16 * 4, %%rbx \n\t" // b += 4*16 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".SLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 64 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 6 * 4, %%rax \n\t" // a += 1*6 (unroll x mr) - "addq $1 * 16 * 4, %%rbx \n\t" // b += 1*16 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".SPOSTACCUM: \n\t" - " \n\t" - " \n\t" // ymm4..ymm15 = -a10 * b01 - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of alpha - "vbroadcastss (%%rbx), %%ymm3 \n\t" // load alpha and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq $1, %%rsi \n\t" // load cs_b = 1 - "leaq (,%%rsi,4), %%rsi \n\t" // cs_b *= sizeof(float) - " \n\t" - "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // load address of b11 + 8*cs_b - " \n\t" - "movq %%rcx, %%r11 \n\t" // save rcx = b11 for later - "movq %%rdx, %%r14 \n\t" // save rdx = b11+8*cs_b for later - " \n\t" - " \n\t" - " \n\t" // b11 := alpha * b11 - a10 * b01 - "vfmsub231ps (%%rcx), %%ymm3, %%ymm4 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm5 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231ps (%%rcx), %%ymm3, %%ymm6 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm7 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231ps (%%rcx), %%ymm3, %%ymm8 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm9 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231ps (%%rcx), %%ymm3, %%ymm10 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm11 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231ps (%%rcx), %%ymm3, %%ymm12 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm13 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231ps (%%rcx), %%ymm3, %%ymm14 \n\t" - //"addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm15 \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // prefetch c11 - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + + add(imm(32*4), rbx) + // initialize loop by pre-loading + vmovaps(mem(rbx, -4*32), ymm0) + vmovaps(mem(rbx, -3*32), ymm1) + + mov(%7, rcx) // load address of b11 + mov(imm(16), rdi) // set rs_b = PACKNR = 16 + lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float) + + // NOTE: c11, rs_c, and cs_c aren't + // needed for a while, but we load + // them now to avoid stalling later. + mov(%8, r8) // load address of c11 + mov(%9, r9) // load rs_c + lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float) + mov(%10, r10) // load cs_c + lea(mem(, r10, 4), r10) // cs_c *= sizeof(float) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.SLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 64*4)) + + vbroadcastss(mem(rax, 0*4), ymm2) + vbroadcastss(mem(rax, 1*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 2*4), ymm2) + vbroadcastss(mem(rax, 3*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 4*4), ymm2) + vbroadcastss(mem(rax, 5*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, -2*32), ymm0) + vmovaps(mem(rbx, -1*32), ymm1) + + // iteration 1 + vbroadcastss(mem(rax, 6*4), ymm2) + vbroadcastss(mem(rax, 7*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 8*4), ymm2) + vbroadcastss(mem(rax, 9*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 10*4), ymm2) + vbroadcastss(mem(rax, 11*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, 0*32), ymm0) + vmovaps(mem(rbx, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 76*4)) + + vbroadcastss(mem(rax, 12*4), ymm2) + vbroadcastss(mem(rax, 13*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 14*4), ymm2) + vbroadcastss(mem(rax, 15*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 16*4), ymm2) + vbroadcastss(mem(rax, 17*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, 2*32), ymm0) + vmovaps(mem(rbx, 3*32), ymm1) + + // iteration 3 + vbroadcastss(mem(rax, 18*4), ymm2) + vbroadcastss(mem(rax, 19*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 20*4), ymm2) + vbroadcastss(mem(rax, 21*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 22*4), ymm2) + vbroadcastss(mem(rax, 23*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + add(imm(4*6*4), rax) // a += 4*6 (unroll x mr) + add(imm(4*16*4), rbx) // b += 4*16 (unroll x nr) + + vmovaps(mem(rbx, -4*32), ymm0) + vmovaps(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + + + + + + label(.SCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+ + + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rax, 64*4)) + + vbroadcastss(mem(rax, 0*4), ymm2) + vbroadcastss(mem(rax, 1*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 2*4), ymm2) + vbroadcastss(mem(rax, 3*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 4*4), ymm2) + vbroadcastss(mem(rax, 5*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + add(imm(1*6*4), rax) // a += 1*6 (unroll x mr) + add(imm(1*16*4), rbx) // b += 1*16 (unroll x nr) + + vmovaps(mem(rbx, -4*32), ymm0) + vmovaps(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + + + label(.SPOSTACCUM) + + // ymm4..ymm15 = -a10 * b01 + + + + mov(%5, rbx) // load address of alpha + vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate + + + + + mov(imm(1), rsi) // load cs_b = 1 + lea(mem(, rsi, 4), rsi) // cs_b *= sizeof(float) + + lea(mem(rcx, rsi, 8), rdx) // load address of b11 + 8*cs_b + + mov(rcx, r11) // save rcx = b11 for later + mov(rdx, r14) // save rdx = b11+8*cs_b for later + + + // b11 := alpha * b11 - a10 * b01 + vfmsub231ps(mem(rcx), ymm3, ymm4) + add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm5) + add(rdi, rdx) + + vfmsub231ps(mem(rcx), ymm3, ymm6) + add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm7) + add(rdi, rdx) + + vfmsub231ps(mem(rcx), ymm3, ymm8) + add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm9) + add(rdi, rdx) + + vfmsub231ps(mem(rcx), ymm3, ymm10) + add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm11) + add(rdi, rdx) + + vfmsub231ps(mem(rcx), ymm3, ymm12) + add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm13) + add(rdi, rdx) + + vfmsub231ps(mem(rcx), ymm3, ymm14) + //add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm15) + //add(rdi, rdx) + + + + // prefetch c11 + #if 0 - "movq %%r8, %%rcx \n\t" // load address of c11 from r8 - " \n\t" // Note: r9 = rs_c * sizeof(float) - " \n\t" - "leaq (%%r9 ,%%r9 ,2), %%r13 \n\t" // r13 = 3*rs_c; - "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c11 + 3*rs_c; - " \n\t" - "prefetcht0 0 * 8(%%rcx) \n\t" // prefetch c11 + 0*rs_c - "prefetcht0 0 * 8(%%rcx,%%r9 ) \n\t" // prefetch c11 + 1*rs_c - "prefetcht0 0 * 8(%%rcx,%%r9 ,2) \n\t" // prefetch c11 + 2*rs_c - "prefetcht0 0 * 8(%%rdx) \n\t" // prefetch c11 + 3*rs_c - "prefetcht0 0 * 8(%%rdx,%%r9 ) \n\t" // prefetch c11 + 4*rs_c - "prefetcht0 0 * 8(%%rdx,%%r9 ,2) \n\t" // prefetch c11 + 5*rs_c + mov(r8, rcx) // load address of c11 from r8 + // Note: r9 = rs_c * sizeof(float) + + lea(mem(r9 , r9 , 2), r13) // r13 = 3*rs_c; + lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c; + + prefetch(0, mem(rcx, 0*8)) // prefetch c11 + 0*rs_c + prefetch(0, mem(rcx, r9, 1, 0*8)) // prefetch c11 + 1*rs_c + prefetch(0, mem(rcx, r9 , 2, 0*8)) // prefetch c11 + 2*rs_c + prefetch(0, mem(rdx, 0*8)) // prefetch c11 + 3*rs_c + prefetch(0, mem(rdx, r9, 1, 0*8)) // prefetch c11 + 4*rs_c + prefetch(0, mem(rdx, r9 , 2, 0*8)) // prefetch c11 + 5*rs_c #endif - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // trsm computation begins here - " \n\t" - " \n\t" // Note: contents of b11 are stored as - " \n\t" // ymm4 ymm5 = ( beta00..07 ) ( beta08..0F ) - " \n\t" // ymm6 ymm7 = ( beta10..17 ) ( beta18..1F ) - " \n\t" // ymm8 ymm9 = ( beta20..27 ) ( beta28..2F ) - " \n\t" // ymm10 ymm11 = ( 
beta30..37 ) ( beta38..3F ) - " \n\t" // ymm12 ymm13 = ( beta40..47 ) ( beta48..4F ) - " \n\t" // ymm14 ymm15 = ( beta50..57 ) ( beta58..5F ) - " \n\t" - " \n\t" - "movq %6, %%rax \n\t" // load address of a11 - " \n\t" - "movq %%r11, %%rcx \n\t" // recall address of b11 - "movq %%r14, %%rdx \n\t" // recall address of b11+8*cs_b - " \n\t" // Note: rdi = rs_b - " \n\t" - " \n\t" // iteration 0 ------------- - " \n\t" - "vbroadcastss (0+0*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = (1/alpha00) - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" // ymm4 *= (1/alpha00) - "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" // ymm5 *= (1/alpha00) - " \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" // store ( beta00..beta07 ) = ymm4 - "vmovups %%ymm5, (%%rdx) \n\t" // store ( beta08..beta0F ) = ymm5 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" // iteration 1 ------------- - " \n\t" - "vbroadcastss (1+0*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha10 - "vbroadcastss (1+1*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = (1/alpha11) - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm2 \n\t" // ymm2 = alpha10 * ymm4 - "vmulps %%ymm0, %%ymm5, %%ymm3 \n\t" // ymm3 = alpha10 * ymm5 - " \n\t" - "vsubps %%ymm2, %%ymm6, %%ymm6 \n\t" // ymm6 -= ymm2 - "vsubps %%ymm3, %%ymm7, %%ymm7 \n\t" // ymm7 -= ymm3 - " \n\t" - "vmulps %%ymm6, %%ymm1, %%ymm6 \n\t" // ymm6 *= (1/alpha11) - "vmulps %%ymm7, %%ymm1, %%ymm7 \n\t" // ymm7 *= (1/alpha11) - " \n\t" - "vmovups %%ymm6, (%%rcx) \n\t" // store ( beta10..beta17 ) = ymm6 - "vmovups %%ymm7, (%%rdx) \n\t" // store ( beta18..beta1F ) = ymm7 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" // iteration 2 ------------- - " \n\t" - "vbroadcastss (2+0*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha20 - "vbroadcastss (2+1*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha21 - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm2 \n\t" // ymm2 = alpha20 * ymm4 - "vmulps %%ymm0, %%ymm5, %%ymm3 \n\t" // ymm3 = alpha20 * ymm5 - " \n\t" - "vbroadcastss (2+2*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = (1/alpha22) - " \n\t" - "vfmadd231ps %%ymm1, %%ymm6, %%ymm2 \n\t" // ymm2 += alpha21 * ymm6 - "vfmadd231ps %%ymm1, %%ymm7, %%ymm3 \n\t" // ymm3 += alpha21 * ymm7 - " \n\t" - "vsubps %%ymm2, %%ymm8, %%ymm8 \n\t" // ymm8 -= ymm2 - "vsubps %%ymm3, %%ymm9, %%ymm9 \n\t" // ymm9 -= ymm3 - " \n\t" - "vmulps %%ymm8, %%ymm0, %%ymm8 \n\t" // ymm8 *= (1/alpha22) - "vmulps %%ymm9, %%ymm0, %%ymm9 \n\t" // ymm9 *= (1/alpha22) - " \n\t" - "vmovups %%ymm8, (%%rcx) \n\t" // store ( beta20..beta27 ) = ymm8 - "vmovups %%ymm9, (%%rdx) \n\t" // store ( beta28..beta2F ) = ymm9 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" // iteration 3 ------------- - " \n\t" - "vbroadcastss (3+0*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha30 - "vbroadcastss (3+1*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha31 - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm2 \n\t" // ymm2 = alpha30 * ymm4 - "vmulps %%ymm0, %%ymm5, %%ymm3 \n\t" // ymm3 = alpha30 * ymm5 - " \n\t" - "vbroadcastss (3+2*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha32 - " \n\t" - "vfmadd231ps %%ymm1, %%ymm6, %%ymm2 \n\t" // ymm2 += alpha31 * ymm6 - "vfmadd231ps %%ymm1, %%ymm7, %%ymm3 \n\t" // ymm3 += alpha31 * ymm7 - " \n\t" - "vbroadcastss (3+3*6)*4(%%rax), %%ymm1 \n\t" // ymm0 = (1/alpha33) - " \n\t" - "vfmadd231ps %%ymm0, %%ymm8, %%ymm2 \n\t" // ymm2 += alpha32 * ymm8 - "vfmadd231ps %%ymm0, %%ymm9, %%ymm3 \n\t" // ymm3 += alpha32 * ymm9 - " \n\t" - "vsubps %%ymm2, %%ymm10, %%ymm10 \n\t" // ymm10 -= 
ymm2 - "vsubps %%ymm3, %%ymm11, %%ymm11 \n\t" // ymm11 -= ymm3 - " \n\t" - "vmulps %%ymm10, %%ymm1, %%ymm10 \n\t" // ymm10 *= (1/alpha33) - "vmulps %%ymm11, %%ymm1, %%ymm11 \n\t" // ymm11 *= (1/alpha33) - " \n\t" - "vmovups %%ymm10, (%%rcx) \n\t" // store ( beta30..beta37 ) = ymm10 - "vmovups %%ymm11, (%%rdx) \n\t" // store ( beta38..beta3F ) = ymm11 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" // iteration 4 ------------- - " \n\t" - "vbroadcastss (4+0*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha40 - "vbroadcastss (4+1*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha41 - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm2 \n\t" // ymm2 = alpha40 * ymm4 - "vmulps %%ymm0, %%ymm5, %%ymm3 \n\t" // ymm3 = alpha40 * ymm5 - " \n\t" - "vbroadcastss (4+2*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha42 - " \n\t" - "vfmadd231ps %%ymm1, %%ymm6, %%ymm2 \n\t" // ymm2 += alpha41 * ymm6 - "vfmadd231ps %%ymm1, %%ymm7, %%ymm3 \n\t" // ymm3 += alpha41 * ymm7 - " \n\t" - "vbroadcastss (4+3*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha43 - " \n\t" - "vfmadd231ps %%ymm0, %%ymm8, %%ymm2 \n\t" // ymm2 += alpha42 * ymm8 - "vfmadd231ps %%ymm0, %%ymm9, %%ymm3 \n\t" // ymm3 += alpha42 * ymm9 - " \n\t" - "vbroadcastss (4+4*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = (1/alpha44) - " \n\t" - "vfmadd231ps %%ymm1, %%ymm10, %%ymm2 \n\t" // ymm2 += alpha43 * ymm10 - "vfmadd231ps %%ymm1, %%ymm11, %%ymm3 \n\t" // ymm3 += alpha43 * ymm11 - " \n\t" - "vsubps %%ymm2, %%ymm12, %%ymm12 \n\t" // ymm12 -= ymm2 - "vsubps %%ymm3, %%ymm13, %%ymm13 \n\t" // ymm13 -= ymm3 - " \n\t" - "vmulps %%ymm12, %%ymm0, %%ymm12 \n\t" // ymm12 *= (1/alpha44) - "vmulps %%ymm13, %%ymm0, %%ymm13 \n\t" // ymm13 *= (1/alpha44) - " \n\t" - "vmovups %%ymm12, (%%rcx) \n\t" // store ( beta40..beta47 ) = ymm12 - "vmovups %%ymm13, (%%rdx) \n\t" // store ( beta48..beta4F ) = ymm13 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" // iteration 5 ------------- - " \n\t" - "vbroadcastss (5+0*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha50 - "vbroadcastss (5+1*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha51 - " \n\t" - "vmulps %%ymm0, %%ymm4, %%ymm2 \n\t" // ymm2 = alpha50 * ymm4 - "vmulps %%ymm0, %%ymm5, %%ymm3 \n\t" // ymm3 = alpha50 * ymm5 - " \n\t" - "vbroadcastss (5+2*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha52 - " \n\t" - "vfmadd231ps %%ymm1, %%ymm6, %%ymm2 \n\t" // ymm2 += alpha51 * ymm6 - "vfmadd231ps %%ymm1, %%ymm7, %%ymm3 \n\t" // ymm3 += alpha51 * ymm7 - " \n\t" - "vbroadcastss (5+3*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha53 - " \n\t" - "vfmadd231ps %%ymm0, %%ymm8, %%ymm2 \n\t" // ymm2 += alpha52 * ymm8 - "vfmadd231ps %%ymm0, %%ymm9, %%ymm3 \n\t" // ymm3 += alpha52 * ymm9 - " \n\t" - "vbroadcastss (5+4*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha54 - " \n\t" - "vfmadd231ps %%ymm1, %%ymm10, %%ymm2 \n\t" // ymm2 += alpha53 * ymm10 - "vfmadd231ps %%ymm1, %%ymm11, %%ymm3 \n\t" // ymm3 += alpha53 * ymm11 - " \n\t" - "vbroadcastss (5+5*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = (1/alpha55) - " \n\t" - "vfmadd231ps %%ymm0, %%ymm12, %%ymm2 \n\t" // ymm2 += alpha54 * ymm12 - "vfmadd231ps %%ymm0, %%ymm13, %%ymm3 \n\t" // ymm3 += alpha54 * ymm13 - " \n\t" - "vsubps %%ymm2, %%ymm14, %%ymm14 \n\t" // ymm14 -= ymm2 - "vsubps %%ymm3, %%ymm15, %%ymm15 \n\t" // ymm15 -= ymm3 - " \n\t" - "vmulps %%ymm14, %%ymm1, %%ymm14 \n\t" // ymm14 *= (1/alpha55) - "vmulps %%ymm15, %%ymm1, %%ymm15 \n\t" // ymm15 *= (1/alpha55) - " \n\t" - "vmovups %%ymm14, (%%rcx) \n\t" // store ( beta50..beta57 ) = ymm14 - "vmovups %%ymm15, 
(%%rdx) \n\t" // store ( beta58..beta5F ) = ymm15 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %%r8, %%rcx \n\t" // load address of c11 from r8 - "movq %%r9, %%rdi \n\t" // load rs_c (in bytes) from r9 - "movq %%r10, %%rsi \n\t" // load cs_c (in bytes) from r10 - " \n\t" - "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // load address of c11 + 8*cs_c; - "leaq (%%rcx,%%rdi,4), %%r14 \n\t" // load address of c11 + 4*rs_c; - " \n\t" - " \n\t" // These are used in the macros below. - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; - "leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; - "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; - " \n\t" - " \n\t" - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. - "jz .SROWSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - "cmpq $4, %%rdi \n\t" // set ZF if (4*rs_c) == 4. - "jz .SCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - " \n\t" // if neither row- or column- - " \n\t" // stored, use general case. - ".SGENSTORED: \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm4, %%ymm0 \n\t" + + + + + // trsm computation begins here + + // Note: contents of b11 are stored as + // ymm4 ymm5 = ( beta00..07 ) ( beta08..0F ) + // ymm6 ymm7 = ( beta10..17 ) ( beta18..1F ) + // ymm8 ymm9 = ( beta20..27 ) ( beta28..2F ) + // ymm10 ymm11 = ( beta30..37 ) ( beta38..3F ) + // ymm12 ymm13 = ( beta40..47 ) ( beta48..4F ) + // ymm14 ymm15 = ( beta50..57 ) ( beta58..5F ) + + + mov(%6, rax) // load address of a11 + + mov(r11, rcx) // recall address of b11 + mov(r14, rdx) // recall address of b11+8*cs_b + // Note: rdi = rs_b + + // iteration 0 ------------- + + vbroadcastss(mem(0+0*6)*4(rax), ymm0) // ymm0 = (1/alpha00) + + vmulps(ymm0, ymm4, ymm4) // ymm4 *= (1/alpha00) + vmulps(ymm0, ymm5, ymm5) // ymm5 *= (1/alpha00) + + vmovups(ymm4, mem(rcx)) // store ( beta00..beta07 ) = ymm4 + vmovups(ymm5, mem(rdx)) // store ( beta08..beta0F ) = ymm5 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) // rdx += rs_b + + // iteration 1 ------------- + + vbroadcastss(mem(1+0*6)*4(rax), ymm0) // ymm0 = alpha10 + vbroadcastss(mem(1+1*6)*4(rax), ymm1) // ymm1 = (1/alpha11) + + vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha10 * ymm4 + vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha10 * ymm5 + + vsubps(ymm2, ymm6, ymm6) // ymm6 -= ymm2 + vsubps(ymm3, ymm7, ymm7) // ymm7 -= ymm3 + + vmulps(ymm6, ymm1, ymm6) // ymm6 *= (1/alpha11) + vmulps(ymm7, ymm1, ymm7) // ymm7 *= (1/alpha11) + + vmovups(ymm6, mem(rcx)) // store ( beta10..beta17 ) = ymm6 + vmovups(ymm7, mem(rdx)) // store ( beta18..beta1F ) = ymm7 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) // rdx += rs_b + + // iteration 2 ------------- + + vbroadcastss(mem(2+0*6)*4(rax), ymm0) // ymm0 = alpha20 + vbroadcastss(mem(2+1*6)*4(rax), ymm1) // ymm1 = alpha21 + + vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha20 * ymm4 + vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha20 * ymm5 + + vbroadcastss(mem(2+2*6)*4(rax), ymm0) // ymm0 = (1/alpha22) + + vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha21 * ymm6 + vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha21 * ymm7 + + vsubps(ymm2, ymm8, ymm8) // ymm8 -= ymm2 + vsubps(ymm3, ymm9, ymm9) // ymm9 -= ymm3 + + vmulps(ymm8, ymm0, ymm8) // ymm8 *= (1/alpha22) + vmulps(ymm9, ymm0, ymm9) // ymm9 *= (1/alpha22) + + vmovups(ymm8, mem(rcx)) // store ( beta20..beta27 ) = ymm8 + vmovups(ymm9, mem(rdx)) // store ( beta28..beta2F ) = ymm9 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) // rdx += rs_b + + 
// iteration 3 ------------- + + vbroadcastss(mem(3+0*6)*4(rax), ymm0) // ymm0 = alpha30 + vbroadcastss(mem(3+1*6)*4(rax), ymm1) // ymm1 = alpha31 + + vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha30 * ymm4 + vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha30 * ymm5 + + vbroadcastss(mem(3+2*6)*4(rax), ymm0) // ymm0 = alpha32 + + vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha31 * ymm6 + vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha31 * ymm7 + + vbroadcastss(mem(3+3*6)*4(rax), ymm1) // ymm1 = (1/alpha33) + + vfmadd231ps(ymm0, ymm8, ymm2) // ymm2 += alpha32 * ymm8 + vfmadd231ps(ymm0, ymm9, ymm3) // ymm3 += alpha32 * ymm9 + + vsubps(ymm2, ymm10, ymm10) // ymm10 -= ymm2 + vsubps(ymm3, ymm11, ymm11) // ymm11 -= ymm3 + + vmulps(ymm10, ymm1, ymm10) // ymm10 *= (1/alpha33) + vmulps(ymm11, ymm1, ymm11) // ymm11 *= (1/alpha33) + + vmovups(ymm10, mem(rcx)) // store ( beta30..beta37 ) = ymm10 + vmovups(ymm11, mem(rdx)) // store ( beta38..beta3F ) = ymm11 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) // rdx += rs_b + + // iteration 4 ------------- + + vbroadcastss(mem(4+0*6)*4(rax), ymm0) // ymm0 = alpha40 + vbroadcastss(mem(4+1*6)*4(rax), ymm1) // ymm1 = alpha41 + + vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha40 * ymm4 + vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha40 * ymm5 + + vbroadcastss(mem(4+2*6)*4(rax), ymm0) // ymm0 = alpha42 + + vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha41 * ymm6 + vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha41 * ymm7 + + vbroadcastss(mem(4+3*6)*4(rax), ymm1) // ymm1 = alpha43 + + vfmadd231ps(ymm0, ymm8, ymm2) // ymm2 += alpha42 * ymm8 + vfmadd231ps(ymm0, ymm9, ymm3) // ymm3 += alpha42 * ymm9 + + vbroadcastss(mem(4+4*6)*4(rax), ymm0) // ymm0 = (1/alpha44) + + vfmadd231ps(ymm1, ymm10, ymm2) // ymm2 += alpha43 * ymm10 + vfmadd231ps(ymm1, ymm11, ymm3) // ymm3 += alpha43 * ymm11 + + vsubps(ymm2, ymm12, ymm12) // ymm12 -= ymm2 + vsubps(ymm3, ymm13, ymm13) // ymm13 -= ymm3 + + vmulps(ymm12, ymm0, ymm12) // ymm12 *= (1/alpha44) + vmulps(ymm13, ymm0, ymm13) // ymm13 *= (1/alpha44) + + vmovups(ymm12, mem(rcx)) // store ( beta40..beta47 ) = ymm12 + vmovups(ymm13, mem(rdx)) // store ( beta48..beta4F ) = ymm13 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) // rdx += rs_b + + // iteration 5 ------------- + + vbroadcastss(mem(5+0*6)*4(rax), ymm0) // ymm0 = alpha50 + vbroadcastss(mem(5+1*6)*4(rax), ymm1) // ymm1 = alpha51 + + vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha50 * ymm4 + vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha50 * ymm5 + + vbroadcastss(mem(5+2*6)*4(rax), ymm0) // ymm0 = alpha52 + + vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha51 * ymm6 + vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha51 * ymm7 + + vbroadcastss(mem(5+3*6)*4(rax), ymm1) // ymm1 = alpha53 + + vfmadd231ps(ymm0, ymm8, ymm2) // ymm2 += alpha52 * ymm8 + vfmadd231ps(ymm0, ymm9, ymm3) // ymm3 += alpha52 * ymm9 + + vbroadcastss(mem(5+4*6)*4(rax), ymm0) // ymm0 = alpha54 + + vfmadd231ps(ymm1, ymm10, ymm2) // ymm2 += alpha53 * ymm10 + vfmadd231ps(ymm1, ymm11, ymm3) // ymm3 += alpha53 * ymm11 + + vbroadcastss(mem(5+5*6)*4(rax), ymm1) // ymm1 = (1/alpha55) + + vfmadd231ps(ymm0, ymm12, ymm2) // ymm2 += alpha54 * ymm12 + vfmadd231ps(ymm0, ymm13, ymm3) // ymm3 += alpha54 * ymm13 + + vsubps(ymm2, ymm14, ymm14) // ymm14 -= ymm2 + vsubps(ymm3, ymm15, ymm15) // ymm15 -= ymm3 + + vmulps(ymm14, ymm1, ymm14) // ymm14 *= (1/alpha55) + vmulps(ymm15, ymm1, ymm15) // ymm15 *= (1/alpha55) + + vmovups(ymm14, mem(rcx)) // store ( beta50..beta57 ) = ymm14 + vmovups(ymm15, mem(rdx)) // store ( beta58..beta5F ) = ymm15 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) 
// rdx += rs_b + + + + + + mov(r8, rcx) // load address of c11 from r8 + mov(r9, rdi) // load rs_c (in bytes) from r9 + mov(r10, rsi) // load cs_c (in bytes) from r10 + + lea(mem(rcx, rsi, 8), rdx) // load address of c11 + 8*cs_c; + lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; + + // These are used in the macros below. + lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; + lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; + lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; + + + + cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. + jz(.SROWSTORED) // jump to row storage case + + + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLSTORED) // jump to column storage case + + + + // if neither row- or column- + // stored, use general case. + label(.SGENSTORED) + + + vmovaps(ymm4, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm6, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm6, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm8, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm8, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm10, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm10, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm12, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm12, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm14, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm14, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c11 + 8*cs_c - " \n\t" - " \n\t" - "vmovaps %%ymm5, %%ymm0 \n\t" + + + mov(rdx, rcx) // rcx = c11 + 8*cs_c + + + vmovaps(ymm5, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm7, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm7, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm9, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm9, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm11, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm11, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm13, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm13, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm15, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm15, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SROWSTORED: \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm13, (%%rdx) \n\t" - 
"addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovups %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vunpcklps %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpcklps %%ymm10, %%ymm8, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovups %%xmm0, (%%rcx ) \n\t" // store ( gamma00..gamma30 ) - "vmovups %%xmm1, (%%rcx,%%rsi,1) \n\t" // store ( gamma01..gamma31 ) - "vmovups %%xmm2, (%%rcx,%%rsi,4) \n\t" // store ( gamma04..gamma34 ) - "vmovups %%xmm3, (%%rcx,%%r15 ) \n\t" // store ( gamma05..gamma35 ) - " \n\t" - " \n\t" - "vunpckhps %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpckhps %%ymm10, %%ymm8, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovups %%xmm0, (%%rcx,%%rsi,2) \n\t" // store ( gamma02..gamma32 ) - "vmovups %%xmm1, (%%rcx,%%r13 ) \n\t" // store ( gamma03..gamma33 ) - "vmovups %%xmm2, (%%rcx,%%r13,2) \n\t" // store ( gamma06..gamma36 ) - "vmovups %%xmm3, (%%rcx,%%r10 ) \n\t" // store ( gamma07..gamma37 ) - " \n\t" - "leaq (%%rcx,%%rsi,8), %%rcx \n\t" // rcx += 8*cs_c - " \n\t" - "vunpcklps %%ymm14, %%ymm12, %%ymm0 \n\t" - "vunpckhps %%ymm14, %%ymm12, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovlpd %%xmm0, (%%r14 ) \n\t" // store ( gamma40..gamma50 ) - "vmovhpd %%xmm0, (%%r14,%%rsi,1) \n\t" // store ( gamma41..gamma51 ) - "vmovlpd %%xmm1, (%%r14,%%rsi,2) \n\t" // store ( gamma42..gamma52 ) - "vmovhpd %%xmm1, (%%r14,%%r13 ) \n\t" // store ( gamma43..gamma53 ) - "vmovlpd %%xmm2, (%%r14,%%rsi,4) \n\t" // store ( gamma44..gamma54 ) - "vmovhpd %%xmm2, (%%r14,%%r15 ) \n\t" // store ( gamma45..gamma55 ) - "vmovlpd %%xmm3, (%%r14,%%r13,2) \n\t" // store ( gamma46..gamma56 ) - "vmovhpd %%xmm3, (%%r14,%%r10 ) \n\t" // store ( gamma47..gamma57 ) - " \n\t" - "leaq (%%r14,%%rsi,8), %%r14 \n\t" // r14 += 8*cs_c - " \n\t" - " \n\t" - "vunpcklps %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpcklps %%ymm11, %%ymm9, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovups %%xmm0, (%%rcx ) \n\t" // store ( gamma08..gamma38 ) - "vmovups %%xmm1, (%%rcx,%%rsi,1) \n\t" // store ( gamma09..gamma39 ) - "vmovups %%xmm2, (%%rcx,%%rsi,4) \n\t" // store ( gamma0C..gamma3C ) - "vmovups %%xmm3, (%%rcx,%%r15 ) \n\t" // store ( gamma0D..gamma3D ) - " \n\t" - "vunpckhps %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpckhps %%ymm11, %%ymm9, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovups %%xmm0, (%%rcx,%%rsi,2) \n\t" // store ( gamma0A..gamma3A ) - "vmovups %%xmm1, (%%rcx,%%r13 ) \n\t" // store ( gamma0B..gamma3B ) - "vmovups %%xmm2, 
(%%rcx,%%r13,2) \n\t" // store ( gamma0E..gamma3E ) - "vmovups %%xmm3, (%%rcx,%%r10 ) \n\t" // store ( gamma0F..gamma3F ) - " \n\t" - //"leaq (%%rcx,%%rsi,8), %%rcx \n\t" // rcx += 8*cs_c - " \n\t" - "vunpcklps %%ymm15, %%ymm13, %%ymm0 \n\t" - "vunpckhps %%ymm15, %%ymm13, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovlpd %%xmm0, (%%r14 ) \n\t" // store ( gamma48..gamma58 ) - "vmovhpd %%xmm0, (%%r14,%%rsi,1) \n\t" // store ( gamma49..gamma59 ) - "vmovlpd %%xmm1, (%%r14,%%rsi,2) \n\t" // store ( gamma4A..gamma5A ) - "vmovhpd %%xmm1, (%%r14,%%r13 ) \n\t" // store ( gamma4B..gamma5B ) - "vmovlpd %%xmm2, (%%r14,%%rsi,4) \n\t" // store ( gamma4C..gamma5C ) - "vmovhpd %%xmm2, (%%r14,%%r15 ) \n\t" // store ( gamma4D..gamma5D ) - "vmovlpd %%xmm3, (%%r14,%%r13,2) \n\t" // store ( gamma4E..gamma5E ) - "vmovhpd %%xmm3, (%%r14,%%r10 ) \n\t" // store ( gamma4F..gamma5F ) - " \n\t" - //"leaq (%%r14,%%rsi,8), %%r14 \n\t" // r14 += 8*cs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SDONE: \n\t" - " \n\t" - "vzeroupper \n\t" - " \n\t" + + + + jmp(.SDONE) + + + + label(.SROWSTORED) + + + vmovups(ymm4, mem(rcx)) + add(rdi, rcx) + vmovups(ymm5, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm6, mem(rcx)) + add(rdi, rcx) + vmovups(ymm7, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm8, mem(rcx)) + add(rdi, rcx) + vmovups(ymm9, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm10, mem(rcx)) + add(rdi, rcx) + vmovups(ymm11, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm12, mem(rcx)) + add(rdi, rcx) + vmovups(ymm13, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm14, mem(rcx)) + //add(rdi, rcx) + vmovups(ymm15, mem(rdx)) + //add(rdi, rdx) + + + jmp(.SDONE) + + + + label(.SCOLSTORED) + + + vunpcklps(ymm6, ymm4, ymm0) + vunpcklps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) + + + vunpckhps(ymm6, ymm4, ymm0) + vunpckhps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) + vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) + vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) + + lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + + vunpcklps(ymm14, ymm12, ymm0) + vunpckhps(ymm14, ymm12, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) + vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) + vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) + vmovhpd(xmm1, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) + vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) + vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) + vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) + vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) + + lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c + + + vunpcklps(ymm7, ymm5, ymm0) + 
vunpcklps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovups(xmm0, mem(rcx)) // store ( gamma08..gamma38 ) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma09..gamma39 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma0C..gamma3C ) + vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma0D..gamma3D ) + + vunpckhps(ymm7, ymm5, ymm0) + vunpckhps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma0A..gamma3A ) + vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma0B..gamma3B ) + vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma0E..gamma3E ) + vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma0F..gamma3F ) + + //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + + vunpcklps(ymm15, ymm13, ymm0) + vunpckhps(ymm15, ymm13, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovlpd(xmm0, mem(r14)) // store ( gamma48..gamma58 ) + vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma49..gamma59 ) + vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( gamma4A..gamma5A ) + vmovhpd(xmm1, mem(r14, r13, 1)) // store ( gamma4B..gamma5B ) + vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma4C..gamma5C ) + vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma4D..gamma5D ) + vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma4E..gamma5E ) + vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma4F..gamma5F ) + + //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c + + + + + label(.SDONE) + + vzeroupper() + : // output operands (none) : // input operands @@ -796,16 +799,16 @@ void bli_sgemmtrsm_l_zen_asm_6x16 #define DGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ - "vmovlpd %%xmm0, (%%rcx ) \n\t" \ - "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ - "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ - "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ + vextractf128(imm(1), ymm0, xmm1) \ + vmovlpd(xmm0, mem(rcx)) \ + vmovhpd(xmm0, mem(rcx, rsi, 1)) \ + vmovlpd(xmm1, mem(rcx, rsi, 2)) \ + vmovhpd(xmm1, mem(rcx, r13, 1)) /*\ + vextractf128(imm(1), ymm2, xmm1) \ + vmovlpd(xmm2, mem(rcx, rsi, 4)) \ + vmovhpd(xmm2, mem(rcx, r15, 1)) \ + vmovlpd(xmm1, mem(rcx, r13, 2)) \ + vmovhpd(xmm1, mem(rcx, r10, 1))*/ void bli_dgemmtrsm_l_zen_asm_6x8 ( @@ -834,655 +837,655 @@ void bli_dgemmtrsm_l_zen_asm_6x8 __asm__ volatile ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - " \n\t" - "addq $32 * 4, %%rbx \n\t" - " \n\t" // initialize loop by pre-loading - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - "movq %7, %%rcx \n\t" // load address of b11 - "movq $8, %%rdi \n\t" // set rs_b = PACKNR = 8 - "leaq (,%%rdi,8), %%rdi \n\t" // rs_b *= sizeof(double) - " \n\t" - " \n\t" // NOTE: c11, rs_c, and cs_c aren't - " \n\t" // needed for a while, but we load - " \n\t" // them now to avoid stalling later. 
- "movq %8, %%r8 \n\t" // load address of c11 - "movq %9, %%r9 \n\t" // load rs_c - "leaq (,%%r9 ,8), %%r9 \n\t" // rs_c *= sizeof(double) - "movq %10, %%r10 \n\t" // load cs_c - "leaq (,%%r10,8), %%r10 \n\t" // cs_c *= sizeof(double) - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 64 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd -2 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 0 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 76 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 2 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd 3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" - 
"vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) - "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 64 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) - "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - " \n\t" // ymm4..ymm15 = -a10 * b01 - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of alpha - "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load alpha and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq $1, %%rsi \n\t" // set cs_b = 1 - "leaq (,%%rsi,8), %%rsi \n\t" // cs_b *= sizeof(double) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of b11 + 4*cs_b - " \n\t" - "movq %%rcx, %%r11 \n\t" // save rcx = b11 for later - "movq %%rdx, %%r14 \n\t" // save rdx = b11+4*cs_b for later - " \n\t" - " \n\t" - " \n\t" // b11 := alpha * b11 - a10 * b01 - "vfmsub231pd (%%rcx), %%ymm3, %%ymm4 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm5 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231pd (%%rcx), %%ymm3, %%ymm6 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm7 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231pd (%%rcx), %%ymm3, %%ymm8 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm9 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231pd (%%rcx), %%ymm3, %%ymm10 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm11 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231pd (%%rcx), %%ymm3, %%ymm12 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm13 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231pd (%%rcx), %%ymm3, %%ymm14 \n\t" - //"addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm15 \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // prefetch c11 - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + + add(imm(32*4), rbx) + // initialize loop by pre-loading + vmovapd(mem(rbx, -4*32), ymm0) + vmovapd(mem(rbx, -3*32), ymm1) + + mov(%7, rcx) // load address of b11 + mov(imm(8), rdi) // set rs_b = PACKNR = 8 + lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double) + + // NOTE: c11, rs_c, and cs_c aren't + // needed for a while, but we load + // them now to avoid stalling later. + mov(%8, r8) // load address of c11 + mov(%9, r9) // load rs_c + lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double) + mov(%10, r10) // load cs_c + lea(mem(, r10, 8), r10) // cs_c *= sizeof(double) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 64*8)) + + vbroadcastsd(mem(rax, 0*8), ymm2) + vbroadcastsd(mem(rax, 1*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 2*8), ymm2) + vbroadcastsd(mem(rax, 3*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 4*8), ymm2) + vbroadcastsd(mem(rax, 5*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, -2*32), ymm0) + vmovapd(mem(rbx, -1*32), ymm1) + + // iteration 1 + vbroadcastsd(mem(rax, 6*8), ymm2) + vbroadcastsd(mem(rax, 7*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 8*8), ymm2) + vbroadcastsd(mem(rax, 9*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 10*8), ymm2) + vbroadcastsd(mem(rax, 11*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, 0*32), ymm0) + vmovapd(mem(rbx, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 76*8)) + + vbroadcastsd(mem(rax, 12*8), ymm2) + vbroadcastsd(mem(rax, 13*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 14*8), ymm2) + vbroadcastsd(mem(rax, 15*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 16*8), ymm2) + vbroadcastsd(mem(rax, 17*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, 2*32), ymm0) + vmovapd(mem(rbx, 3*32), ymm1) + + // iteration 3 + vbroadcastsd(mem(rax, 18*8), ymm2) + vbroadcastsd(mem(rax, 19*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 20*8), ymm2) + vbroadcastsd(mem(rax, 21*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 22*8), ymm2) + vbroadcastsd(mem(rax, 23*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + add(imm(4*6*8), rax) // a += 4*6 (unroll x mr) + add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) + + vmovapd(mem(rbx, -4*32), ymm0) + vmovapd(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
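The pairing of .DLOOPKITER and .DLOOPKLEFT is the usual unroll-by-four split of the k dimension, with the remainder handled one step at a time. A control-flow sketch, assuming k = 4*k_iter + k_left as the 4-way unroll in the comments implies:

    // test/je skips an empty loop; dec/jne counts down to zero.
    static void k_loop(size_t k_iter, size_t k_left)
    {
        for (size_t i = k_iter; i != 0; --i)
        {
            /* four unrolled rank-1 updates (iterations 0..3) */
        }
        for (size_t i = k_left; i != 0; --i)
        {
            /* one rank-1 update per trip */
        }
    }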
+ + + label(.DLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rax, 64*8)) + + vbroadcastsd(mem(rax, 0*8), ymm2) + vbroadcastsd(mem(rax, 1*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 2*8), ymm2) + vbroadcastsd(mem(rax, 3*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 4*8), ymm2) + vbroadcastsd(mem(rax, 5*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + add(imm(1*6*8), rax) // a += 1*6 (unroll x mr) + add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr) + + vmovapd(mem(rbx, -4*32), ymm0) + vmovapd(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT) // iterate again if i != 0. + + + + label(.DPOSTACCUM) + + // ymm4..ymm15 = -a10 * b01 + + + + + mov(%5, rbx) // load address of alpha + vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate + + + + + mov(imm(1), rsi) // set cs_b = 1 + lea(mem(, rsi, 8), rsi) // cs_b *= sizeof(double) + + lea(mem(rcx, rsi, 4), rdx) // load address of b11 + 4*cs_b + + mov(rcx, r11) // save rcx = b11 for later + mov(rdx, r14) // save rdx = b11+4*cs_b for later + + + // b11 := alpha * b11 - a10 * b01 + vfmsub231pd(mem(rcx), ymm3, ymm4) + add(rdi, rcx) + vfmsub231pd(mem(rdx), ymm3, ymm5) + add(rdi, rdx) + + vfmsub231pd(mem(rcx), ymm3, ymm6) + add(rdi, rcx) + vfmsub231pd(mem(rdx), ymm3, ymm7) + add(rdi, rdx) + + vfmsub231pd(mem(rcx), ymm3, ymm8) + add(rdi, rcx) + vfmsub231pd(mem(rdx), ymm3, ymm9) + add(rdi, rdx) + + vfmsub231pd(mem(rcx), ymm3, ymm10) + add(rdi, rcx) + vfmsub231pd(mem(rdx), ymm3, ymm11) + add(rdi, rdx) + + vfmsub231pd(mem(rcx), ymm3, ymm12) + add(rdi, rcx) + vfmsub231pd(mem(rdx), ymm3, ymm13) + add(rdi, rdx) + + vfmsub231pd(mem(rcx), ymm3, ymm14) + //add(rdi, rcx) + vfmsub231pd(mem(rdx), ymm3, ymm15) + //add(rdi, rdx) + + + + // prefetch c11 + #if 0 - "movq %%r8, %%rcx \n\t" // load address of c11 from r8 - " \n\t" // Note: r9 = rs_c * sizeof(double) - " \n\t" - "leaq (%%r9 ,%%r9 ,2), %%r13 \n\t" // r13 = 3*rs_c; - "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c11 + 3*rs_c; - " \n\t" - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c11 + 0*rs_c - "prefetcht0 7 * 8(%%rcx,%%r9 ) \n\t" // prefetch c11 + 1*rs_c - "prefetcht0 7 * 8(%%rcx,%%r9 ,2) \n\t" // prefetch c11 + 2*rs_c - "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c11 + 3*rs_c - "prefetcht0 7 * 8(%%rdx,%%r9 ) \n\t" // prefetch c11 + 4*rs_c - "prefetcht0 7 * 8(%%rdx,%%r9 ,2) \n\t" // prefetch c11 + 5*rs_c + mov(r8, rcx) // load address of c11 from r8 + // Note: r9 = rs_c * sizeof(double) + + lea(mem(r9 , r9 , 2), r13) // r13 = 3*rs_c; + lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c; + + prefetch(0, mem(rcx, 7*8)) // prefetch c11 + 0*rs_c + prefetch(0, mem(rcx, r9, 1, 7*8)) // prefetch c11 + 1*rs_c + prefetch(0, mem(rcx, r9 , 2, 7*8)) // prefetch c11 + 2*rs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c11 + 3*rs_c + prefetch(0, mem(rdx, r9, 1, 7*8)) // prefetch c11 + 4*rs_c + prefetch(0, mem(rdx, r9 , 2, 7*8)) // prefetch c11 + 5*rs_c #endif - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // trsm computation begins here - " \n\t" - " \n\t" // Note: contents of b11 are stored as - " \n\t" // ymm4 ymm5 = ( beta00..03 ) ( beta04..07 ) - " \n\t" // ymm6 ymm7 = ( beta10..13 ) ( beta14..17 ) - " \n\t" // ymm8 ymm9 = ( beta20..23 ) ( beta24..27 ) - " \n\t" // ymm10 ymm11 = ( 
beta30..33 ) ( beta34..37 ) - " \n\t" // ymm12 ymm13 = ( beta40..43 ) ( beta44..47 ) - " \n\t" // ymm14 ymm15 = ( beta50..53 ) ( beta54..57 ) - " \n\t" - " \n\t" - "movq %6, %%rax \n\t" // load address of a11 - " \n\t" - "movq %%r11, %%rcx \n\t" // recall address of b11 - "movq %%r14, %%rdx \n\t" // recall address of b11+4*cs_b - " \n\t" // Note: rdi = rs_b - " \n\t" - " \n\t" // iteration 0 ------------- - " \n\t" - "vbroadcastsd (0+0*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = (1/alpha00) - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // ymm4 *= (1/alpha00) - "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" // ymm5 *= (1/alpha00) - " \n\t" - "vmovupd %%ymm4, (%%rcx) \n\t" // store ( beta00..beta03 ) = ymm4 - "vmovupd %%ymm5, (%%rdx) \n\t" // store ( beta04..beta07 ) = ymm5 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" // iteration 1 ------------- - " \n\t" - "vbroadcastsd (1+0*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha10 - "vbroadcastsd (1+1*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = (1/alpha11) - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm2 \n\t" // ymm2 = alpha10 * ymm4 - "vmulpd %%ymm0, %%ymm5, %%ymm3 \n\t" // ymm3 = alpha10 * ymm5 - " \n\t" - "vsubpd %%ymm2, %%ymm6, %%ymm6 \n\t" // ymm6 -= ymm2 - "vsubpd %%ymm3, %%ymm7, %%ymm7 \n\t" // ymm7 -= ymm3 - " \n\t" - "vmulpd %%ymm6, %%ymm1, %%ymm6 \n\t" // ymm6 *= (1/alpha11) - "vmulpd %%ymm7, %%ymm1, %%ymm7 \n\t" // ymm7 *= (1/alpha11) - " \n\t" - "vmovupd %%ymm6, (%%rcx) \n\t" // store ( beta10..beta13 ) = ymm6 - "vmovupd %%ymm7, (%%rdx) \n\t" // store ( beta14..beta17 ) = ymm7 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" // iteration 2 ------------- - " \n\t" - "vbroadcastsd (2+0*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha20 - "vbroadcastsd (2+1*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha21 - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm2 \n\t" // ymm2 = alpha20 * ymm4 - "vmulpd %%ymm0, %%ymm5, %%ymm3 \n\t" // ymm3 = alpha20 * ymm5 - " \n\t" - "vbroadcastsd (2+2*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = (1/alpha22) - " \n\t" - "vfmadd231pd %%ymm1, %%ymm6, %%ymm2 \n\t" // ymm2 += alpha21 * ymm6 - "vfmadd231pd %%ymm1, %%ymm7, %%ymm3 \n\t" // ymm3 += alpha21 * ymm7 - " \n\t" - "vsubpd %%ymm2, %%ymm8, %%ymm8 \n\t" // ymm8 -= ymm2 - "vsubpd %%ymm3, %%ymm9, %%ymm9 \n\t" // ymm9 -= ymm3 - " \n\t" - "vmulpd %%ymm8, %%ymm0, %%ymm8 \n\t" // ymm8 *= (1/alpha22) - "vmulpd %%ymm9, %%ymm0, %%ymm9 \n\t" // ymm9 *= (1/alpha22) - " \n\t" - "vmovupd %%ymm8, (%%rcx) \n\t" // store ( beta20..beta23 ) = ymm8 - "vmovupd %%ymm9, (%%rdx) \n\t" // store ( beta24..beta27 ) = ymm9 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" // iteration 3 ------------- - " \n\t" - "vbroadcastsd (3+0*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha30 - "vbroadcastsd (3+1*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha31 - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm2 \n\t" // ymm2 = alpha30 * ymm4 - "vmulpd %%ymm0, %%ymm5, %%ymm3 \n\t" // ymm3 = alpha30 * ymm5 - " \n\t" - "vbroadcastsd (3+2*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha32 - " \n\t" - "vfmadd231pd %%ymm1, %%ymm6, %%ymm2 \n\t" // ymm2 += alpha31 * ymm6 - "vfmadd231pd %%ymm1, %%ymm7, %%ymm3 \n\t" // ymm3 += alpha31 * ymm7 - " \n\t" - "vbroadcastsd (3+3*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = (1/alpha33) - " \n\t" - "vfmadd231pd %%ymm0, %%ymm8, %%ymm2 \n\t" // ymm2 += alpha32 * ymm8 - "vfmadd231pd %%ymm0, %%ymm9, %%ymm3 \n\t" // ymm3 += alpha32 * ymm9 - " \n\t" - "vsubpd %%ymm2, %%ymm10, %%ymm10 \n\t" // ymm10 -= 
ymm2 - "vsubpd %%ymm3, %%ymm11, %%ymm11 \n\t" // ymm11 -= ymm3 - " \n\t" - "vmulpd %%ymm10, %%ymm1, %%ymm10 \n\t" // ymm10 *= (1/alpha33) - "vmulpd %%ymm11, %%ymm1, %%ymm11 \n\t" // ymm11 *= (1/alpha33) - " \n\t" - "vmovupd %%ymm10, (%%rcx) \n\t" // store ( beta30..beta33 ) = ymm10 - "vmovupd %%ymm11, (%%rdx) \n\t" // store ( beta34..beta37 ) = ymm11 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" // iteration 4 ------------- - " \n\t" - "vbroadcastsd (4+0*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha40 - "vbroadcastsd (4+1*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha41 - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm2 \n\t" // ymm2 = alpha40 * ymm4 - "vmulpd %%ymm0, %%ymm5, %%ymm3 \n\t" // ymm3 = alpha40 * ymm5 - " \n\t" - "vbroadcastsd (4+2*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha42 - " \n\t" - "vfmadd231pd %%ymm1, %%ymm6, %%ymm2 \n\t" // ymm2 += alpha41 * ymm6 - "vfmadd231pd %%ymm1, %%ymm7, %%ymm3 \n\t" // ymm3 += alpha41 * ymm7 - " \n\t" - "vbroadcastsd (4+3*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha43 - " \n\t" - "vfmadd231pd %%ymm0, %%ymm8, %%ymm2 \n\t" // ymm2 += alpha42 * ymm8 - "vfmadd231pd %%ymm0, %%ymm9, %%ymm3 \n\t" // ymm3 += alpha42 * ymm9 - " \n\t" - "vbroadcastsd (4+4*6)*8(%%rax), %%ymm0 \n\t" // ymm4 = (1/alpha44) - " \n\t" - "vfmadd231pd %%ymm1, %%ymm10, %%ymm2 \n\t" // ymm2 += alpha43 * ymm10 - "vfmadd231pd %%ymm1, %%ymm11, %%ymm3 \n\t" // ymm3 += alpha43 * ymm11 - " \n\t" - "vsubpd %%ymm2, %%ymm12, %%ymm12 \n\t" // ymm12 -= ymm2 - "vsubpd %%ymm3, %%ymm13, %%ymm13 \n\t" // ymm13 -= ymm3 - " \n\t" - "vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // ymm12 *= (1/alpha44) - "vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // ymm13 *= (1/alpha44) - " \n\t" - "vmovupd %%ymm12, (%%rcx) \n\t" // store ( beta40..beta43 ) = ymm12 - "vmovupd %%ymm13, (%%rdx) \n\t" // store ( beta44..beta47 ) = ymm13 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" // iteration 5 ------------- - " \n\t" - "vbroadcastsd (5+0*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha50 - "vbroadcastsd (5+1*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha51 - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm2 \n\t" // ymm2 = alpha50 * ymm4 - "vmulpd %%ymm0, %%ymm5, %%ymm3 \n\t" // ymm3 = alpha50 * ymm5 - " \n\t" - "vbroadcastsd (5+2*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha52 - " \n\t" - "vfmadd231pd %%ymm1, %%ymm6, %%ymm2 \n\t" // ymm2 += alpha51 * ymm6 - "vfmadd231pd %%ymm1, %%ymm7, %%ymm3 \n\t" // ymm3 += alpha51 * ymm7 - " \n\t" - "vbroadcastsd (5+3*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha53 - " \n\t" - "vfmadd231pd %%ymm0, %%ymm8, %%ymm2 \n\t" // ymm2 += alpha52 * ymm8 - "vfmadd231pd %%ymm0, %%ymm9, %%ymm3 \n\t" // ymm3 += alpha52 * ymm9 - " \n\t" - "vbroadcastsd (5+4*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha54 - " \n\t" - "vfmadd231pd %%ymm1, %%ymm10, %%ymm2 \n\t" // ymm2 += alpha53 * ymm10 - "vfmadd231pd %%ymm1, %%ymm11, %%ymm3 \n\t" // ymm3 += alpha53 * ymm11 - " \n\t" - "vbroadcastsd (5+5*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = (1/alpha55) - " \n\t" - "vfmadd231pd %%ymm0, %%ymm12, %%ymm2 \n\t" // ymm2 += alpha54 * ymm12 - "vfmadd231pd %%ymm0, %%ymm13, %%ymm3 \n\t" // ymm3 += alpha54 * ymm13 - " \n\t" - "vsubpd %%ymm2, %%ymm14, %%ymm14 \n\t" // ymm14 -= ymm2 - "vsubpd %%ymm3, %%ymm15, %%ymm15 \n\t" // ymm15 -= ymm3 - " \n\t" - "vmulpd %%ymm14, %%ymm1, %%ymm14 \n\t" // ymm14 *= (1/alpha55) - "vmulpd %%ymm15, %%ymm1, %%ymm15 \n\t" // ymm15 *= (1/alpha55) - " \n\t" - "vmovupd %%ymm14, (%%rcx) \n\t" // store ( beta50..beta53 ) = ymm14 - "vmovupd %%ymm15, 
(%%rdx) \n\t" // store ( beta54..beta57 ) = ymm15 - "addq %%rdi, %%rcx \n\t" // rcx += rs_b - "addq %%rdi, %%rdx \n\t" // rdx += rs_b - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %%r8, %%rcx \n\t" // load address of c11 from r8 - "movq %%r9, %%rdi \n\t" // load rs_c (in bytes) from r9 - "movq %%r10, %%rsi \n\t" // load cs_c (in bytes) from r10 - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c11 + 4*cs_c; - "leaq (%%rcx,%%rdi,4), %%r14 \n\t" // load address of c11 + 4*rs_c; - " \n\t" - " \n\t" // These are used in the macros below. - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; - //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; - //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; - " \n\t" - " \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "jz .DROWSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - "cmpq $8, %%rdi \n\t" // set ZF if (8*rs_c) == 8. - "jz .DCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - " \n\t" // if neither row- or column- - " \n\t" // stored, use general case. - ".DGENSTORED: \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm4, %%ymm0 \n\t" + + + + + // trsm computation begins here + + // Note: contents of b11 are stored as + // ymm4 ymm5 = ( beta00..03 ) ( beta04..07 ) + // ymm6 ymm7 = ( beta10..13 ) ( beta14..17 ) + // ymm8 ymm9 = ( beta20..23 ) ( beta24..27 ) + // ymm10 ymm11 = ( beta30..33 ) ( beta34..37 ) + // ymm12 ymm13 = ( beta40..43 ) ( beta44..47 ) + // ymm14 ymm15 = ( beta50..53 ) ( beta54..57 ) + + + mov(%6, rax) // load address of a11 + + mov(r11, rcx) // recall address of b11 + mov(r14, rdx) // recall address of b11+4*cs_b + // Note: rdi = rs_b + + // iteration 0 ------------- + + vbroadcastsd(mem(0+0*6)*8(rax), ymm0) // ymm0 = (1/alpha00) + + vmulpd(ymm0, ymm4, ymm4) // ymm4 *= (1/alpha00) + vmulpd(ymm0, ymm5, ymm5) // ymm5 *= (1/alpha00) + + vmovupd(ymm4, mem(rcx)) // store ( beta00..beta03 ) = ymm4 + vmovupd(ymm5, mem(rdx)) // store ( beta04..beta07 ) = ymm5 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) // rdx += rs_b + + // iteration 1 ------------- + + vbroadcastsd(mem(1+0*6)*8(rax), ymm0) // ymm0 = alpha10 + vbroadcastsd(mem(1+1*6)*8(rax), ymm1) // ymm1 = (1/alpha11) + + vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha10 * ymm4 + vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha10 * ymm5 + + vsubpd(ymm2, ymm6, ymm6) // ymm6 -= ymm2 + vsubpd(ymm3, ymm7, ymm7) // ymm7 -= ymm3 + + vmulpd(ymm6, ymm1, ymm6) // ymm6 *= (1/alpha11) + vmulpd(ymm7, ymm1, ymm7) // ymm7 *= (1/alpha11) + + vmovupd(ymm6, mem(rcx)) // store ( beta10..beta13 ) = ymm6 + vmovupd(ymm7, mem(rdx)) // store ( beta14..beta17 ) = ymm7 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) // rdx += rs_b + + // iteration 2 ------------- + + vbroadcastsd(mem(2+0*6)*8(rax), ymm0) // ymm0 = alpha20 + vbroadcastsd(mem(2+1*6)*8(rax), ymm1) // ymm1 = alpha21 + + vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha20 * ymm4 + vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha20 * ymm5 + + vbroadcastsd(mem(2+2*6)*8(rax), ymm0) // ymm0 = (1/alpha22) + + vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha21 * ymm6 + vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha21 * ymm7 + + vsubpd(ymm2, ymm8, ymm8) // ymm8 -= ymm2 + vsubpd(ymm3, ymm9, ymm9) // ymm9 -= ymm3 + + vmulpd(ymm8, ymm0, ymm8) // ymm8 *= (1/alpha22) + vmulpd(ymm9, ymm0, ymm9) // ymm9 *= (1/alpha22) + + vmovupd(ymm8, mem(rcx)) // store ( beta20..beta23 ) = ymm8 + vmovupd(ymm9, mem(rdx)) // store ( beta24..beta27 ) = ymm9 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) // rdx += rs_b + + // 
iteration 3 ------------- + + vbroadcastsd(mem(3+0*6)*8(rax), ymm0) // ymm0 = alpha30 + vbroadcastsd(mem(3+1*6)*8(rax), ymm1) // ymm1 = alpha31 + + vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha30 * ymm4 + vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha30 * ymm5 + + vbroadcastsd(mem(3+2*6)*8(rax), ymm0) // ymm0 = alpha32 + + vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha31 * ymm6 + vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha31 * ymm7 + + vbroadcastsd(mem(3+3*6)*8(rax), ymm1) // ymm1 = (1/alpha33) + + vfmadd231pd(ymm0, ymm8, ymm2) // ymm2 += alpha32 * ymm8 + vfmadd231pd(ymm0, ymm9, ymm3) // ymm3 += alpha32 * ymm9 + + vsubpd(ymm2, ymm10, ymm10) // ymm10 -= ymm2 + vsubpd(ymm3, ymm11, ymm11) // ymm11 -= ymm3 + + vmulpd(ymm10, ymm1, ymm10) // ymm10 *= (1/alpha33) + vmulpd(ymm11, ymm1, ymm11) // ymm11 *= (1/alpha33) + + vmovupd(ymm10, mem(rcx)) // store ( beta30..beta33 ) = ymm10 + vmovupd(ymm11, mem(rdx)) // store ( beta34..beta37 ) = ymm11 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) // rdx += rs_b + + // iteration 4 ------------- + + vbroadcastsd(mem(4+0*6)*8(rax), ymm0) // ymm0 = alpha40 + vbroadcastsd(mem(4+1*6)*8(rax), ymm1) // ymm1 = alpha41 + + vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha40 * ymm4 + vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha40 * ymm5 + + vbroadcastsd(mem(4+2*6)*8(rax), ymm0) // ymm0 = alpha42 + + vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha41 * ymm6 + vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha41 * ymm7 + + vbroadcastsd(mem(4+3*6)*8(rax), ymm1) // ymm1 = alpha43 + + vfmadd231pd(ymm0, ymm8, ymm2) // ymm2 += alpha42 * ymm8 + vfmadd231pd(ymm0, ymm9, ymm3) // ymm3 += alpha42 * ymm9 + + vbroadcastsd(mem(4+4*6)*8(rax), ymm0) // ymm0 = (1/alpha44) + + vfmadd231pd(ymm1, ymm10, ymm2) // ymm2 += alpha43 * ymm10 + vfmadd231pd(ymm1, ymm11, ymm3) // ymm3 += alpha43 * ymm11 + + vsubpd(ymm2, ymm12, ymm12) // ymm12 -= ymm2 + vsubpd(ymm3, ymm13, ymm13) // ymm13 -= ymm3 + + vmulpd(ymm12, ymm0, ymm12) // ymm12 *= (1/alpha44) + vmulpd(ymm13, ymm0, ymm13) // ymm13 *= (1/alpha44) + + vmovupd(ymm12, mem(rcx)) // store ( beta40..beta43 ) = ymm12 + vmovupd(ymm13, mem(rdx)) // store ( beta44..beta47 ) = ymm13 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) // rdx += rs_b + + // iteration 5 ------------- + + vbroadcastsd(mem(5+0*6)*8(rax), ymm0) // ymm0 = alpha50 + vbroadcastsd(mem(5+1*6)*8(rax), ymm1) // ymm1 = alpha51 + + vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha50 * ymm4 + vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha50 * ymm5 + + vbroadcastsd(mem(5+2*6)*8(rax), ymm0) // ymm0 = alpha52 + + vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha51 * ymm6 + vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha51 * ymm7 + + vbroadcastsd(mem(5+3*6)*8(rax), ymm1) // ymm1 = alpha53 + + vfmadd231pd(ymm0, ymm8, ymm2) // ymm2 += alpha52 * ymm8 + vfmadd231pd(ymm0, ymm9, ymm3) // ymm3 += alpha52 * ymm9 + + vbroadcastsd(mem(5+4*6)*8(rax), ymm0) // ymm0 = alpha54 + + vfmadd231pd(ymm1, ymm10, ymm2) // ymm2 += alpha53 * ymm10 + vfmadd231pd(ymm1, ymm11, ymm3) // ymm3 += alpha53 * ymm11 + + vbroadcastsd(mem(5+5*6)*8(rax), ymm1) // ymm1 = (1/alpha55) + + vfmadd231pd(ymm0, ymm12, ymm2) // ymm2 += alpha54 * ymm12 + vfmadd231pd(ymm0, ymm13, ymm3) // ymm3 += alpha54 * ymm13 + + vsubpd(ymm2, ymm14, ymm14) // ymm14 -= ymm2 + vsubpd(ymm3, ymm15, ymm15) // ymm15 -= ymm3 + + vmulpd(ymm14, ymm1, ymm14) // ymm14 *= (1/alpha55) + vmulpd(ymm15, ymm1, ymm15) // ymm15 *= (1/alpha55) + + vmovupd(ymm14, mem(rcx)) // store ( beta50..beta53 ) = ymm14 + vmovupd(ymm15, mem(rdx)) // store ( beta54..beta57 ) = ymm15 + add(rdi, rcx) // rcx += rs_b + add(rdi, rdx) // 
rdx += rs_b + + + + + mov(r8, rcx) // load address of c11 from r8 + mov(r9, rdi) // load rs_c (in bytes) from r9 + mov(r10, rsi) // load cs_c (in bytes) from r10 + + lea(mem(rcx, rsi, 4), rdx) // load address of c11 + 4*cs_c; + lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; + + // These are used in the macros below. + lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; + //lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; + //lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; + + + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. + jz(.DROWSTORED) // jump to row storage case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + // if neither row- or column- + // stored, use general case. + label(.DGENSTORED) + + + vmovapd(ymm4, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm6, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm6, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm8, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm8, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm10, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm12, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm14, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c11 + 4*cs_c - " \n\t" - " \n\t" - "vmovapd %%ymm5, %%ymm0 \n\t" + + + mov(rdx, rcx) // rcx = c11 + 4*cs_c + + + vmovapd(ymm5, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm7, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm7, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm9, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm9, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm11, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm13, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm13, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm15, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm15, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "jmp .DDONE \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DROWSTORED: \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm13, (%%rdx) \n\t" - "addq %%rdi, 
%%rdx \n\t" - " \n\t" - "vmovupd %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vunpcklpd %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpckhpd %%ymm6, %%ymm4, %%ymm1 \n\t" - "vunpcklpd %%ymm10, %%ymm8, %%ymm2 \n\t" - "vunpckhpd %%ymm10, %%ymm8, %%ymm3 \n\t" - "vinsertf128 $0x1, %%xmm2, %%ymm0, %%ymm4 \n\t" - "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm6 \n\t" - "vperm2f128 $0x31, %%ymm2, %%ymm0, %%ymm8 \n\t" - "vperm2f128 $0x31, %%ymm3, %%ymm1, %%ymm10 \n\t" - " \n\t" - "vmovupd %%ymm4, (%%rcx ) \n\t" - "vmovupd %%ymm6, (%%rcx,%%rsi ) \n\t" - "vmovupd %%ymm8, (%%rcx,%%rsi,2) \n\t" - "vmovupd %%ymm10, (%%rcx,%%r13 ) \n\t" - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" - " \n\t" - "vunpcklpd %%ymm14, %%ymm12, %%ymm0 \n\t" - "vunpckhpd %%ymm14, %%ymm12, %%ymm1 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovupd %%xmm0, (%%r14 ) \n\t" - "vmovupd %%xmm1, (%%r14,%%rsi ) \n\t" - "vmovupd %%xmm2, (%%r14,%%rsi,2) \n\t" - "vmovupd %%xmm3, (%%r14,%%r13 ) \n\t" - " \n\t" - "leaq (%%r14,%%rsi,4), %%r14 \n\t" - " \n\t" - " \n\t" - "vunpcklpd %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpckhpd %%ymm7, %%ymm5, %%ymm1 \n\t" - "vunpcklpd %%ymm11, %%ymm9, %%ymm2 \n\t" - "vunpckhpd %%ymm11, %%ymm9, %%ymm3 \n\t" - "vinsertf128 $0x1, %%xmm2, %%ymm0, %%ymm5 \n\t" - "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm7 \n\t" - "vperm2f128 $0x31, %%ymm2, %%ymm0, %%ymm9 \n\t" - "vperm2f128 $0x31, %%ymm3, %%ymm1, %%ymm11 \n\t" - " \n\t" - "vmovupd %%ymm5, (%%rcx ) \n\t" - "vmovupd %%ymm7, (%%rcx,%%rsi ) \n\t" - "vmovupd %%ymm9, (%%rcx,%%rsi,2) \n\t" - "vmovupd %%ymm11, (%%rcx,%%r13 ) \n\t" - " \n\t" - //"leaq (%%rcx,%%rsi,4), %%rcx \n\t" - " \n\t" - "vunpcklpd %%ymm15, %%ymm13, %%ymm0 \n\t" - "vunpckhpd %%ymm15, %%ymm13, %%ymm1 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovupd %%xmm0, (%%r14 ) \n\t" - "vmovupd %%xmm1, (%%r14,%%rsi ) \n\t" - "vmovupd %%xmm2, (%%r14,%%rsi,2) \n\t" - "vmovupd %%xmm3, (%%r14,%%r13 ) \n\t" - " \n\t" - //"leaq (%%r14,%%rsi,4), %%r14 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DDONE: \n\t" - " \n\t" - "vzeroupper \n\t" - " \n\t" + + + jmp(.DDONE) + + + + label(.DROWSTORED) + + + vmovupd(ymm4, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm5, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm6, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm7, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm8, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm9, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm10, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm11, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm12, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm13, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm14, mem(rcx)) + //add(rdi, rcx) + vmovupd(ymm15, mem(rdx)) + //add(rdi, rdx) + + + jmp(.DDONE) + + + + label(.DCOLSTORED) + + + vunpcklpd(ymm6, ymm4, ymm0) + vunpckhpd(ymm6, ymm4, ymm1) + vunpcklpd(ymm10, ymm8, ymm2) + vunpckhpd(ymm10, ymm8, ymm3) + vinsertf128(imm(0x1), xmm2, ymm0, ymm4) + vinsertf128(imm(0x1), xmm3, ymm1, ymm6) + vperm2f128(imm(0x31), ymm2, ymm0, ymm8) + vperm2f128(imm(0x31), ymm3, ymm1, ymm10) + + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm6, mem(rcx, rsi, 1)) + vmovupd(ymm8, mem(rcx, rsi, 2)) + vmovupd(ymm10, mem(rcx, r13, 1)) + + lea(mem(rcx, rsi, 4), rcx) + + vunpcklpd(ymm14, ymm12, ymm0) + vunpckhpd(ymm14, ymm12, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), 
ymm1, xmm3) + + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm3, mem(r14, r13, 1)) + + lea(mem(r14, rsi, 4), r14) + + + vunpcklpd(ymm7, ymm5, ymm0) + vunpckhpd(ymm7, ymm5, ymm1) + vunpcklpd(ymm11, ymm9, ymm2) + vunpckhpd(ymm11, ymm9, ymm3) + vinsertf128(imm(0x1), xmm2, ymm0, ymm5) + vinsertf128(imm(0x1), xmm3, ymm1, ymm7) + vperm2f128(imm(0x31), ymm2, ymm0, ymm9) + vperm2f128(imm(0x31), ymm3, ymm1, ymm11) + + vmovupd(ymm5, mem(rcx)) + vmovupd(ymm7, mem(rcx, rsi, 1)) + vmovupd(ymm9, mem(rcx, rsi, 2)) + vmovupd(ymm11, mem(rcx, r13, 1)) + + //lea(mem(rcx, rsi, 4), rcx) + + vunpcklpd(ymm15, ymm13, ymm0) + vunpckhpd(ymm15, ymm13, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm3, mem(r14, r13, 1)) + + //lea(mem(r14, rsi, 4), r14) + + + + + + label(.DDONE) + + vzeroupper() + : // output operands (none) @@ -1510,3 +1513,4 @@ void bli_dgemmtrsm_l_zen_asm_6x8 } + diff --git a/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c b/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c index d5040ec52..2cc742214 100644 --- a/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c +++ b/kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c @@ -35,22 +35,25 @@ #include "blis.h" +#define BLIS_ASM_SYNTAX_ATT +#include "bli_x86_asm_macros.h" + #define SGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm2 \n\t" \ - "vmovss %%xmm0, (%%rcx ) \n\t" \ - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%rsi,1) \n\t" \ - "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" \ - "vmovss %%xmm0, (%%rcx,%%rsi,2) \n\t" \ - "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r13 ) \n\t" \ - "vmovss %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r15 ) \n\t" \ - "vpermilps $0x39, %%xmm1, %%xmm2 \n\t" \ - "vmovss %%xmm2, (%%rcx,%%r13,2) \n\t" \ - "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ - "vmovss %%xmm1, (%%rcx,%%r10 ) \n\t" + vextractf128(imm(1), ymm0, xmm2) \ + vmovss(xmm0, mem(rcx)) \ + vpermilps(imm(0x39), xmm0, xmm1) \ + vmovss(xmm1, mem(rcx, rsi, 1)) \ + vpermilps(imm(0x39), xmm1, xmm0) \ + vmovss(xmm0, mem(rcx, rsi, 2)) \ + vpermilps(imm(0x39), xmm0, xmm1) \ + vmovss(xmm1, mem(rcx, r13, 1)) \ + vmovss(xmm2, mem(rcx, rsi, 4)) \ + vpermilps(imm(0x39), xmm2, xmm1) \ + vmovss(xmm1, mem(rcx, r15, 1)) \ + vpermilps(imm(0x39), xmm1, xmm2) \ + vmovss(xmm2, mem(rcx, r13, 2)) \ + vpermilps(imm(0x39), xmm2, xmm1) \ + vmovss(xmm1, mem(rcx, r10, 1)) void bli_sgemmtrsm_u_zen_asm_6x16 @@ -80,699 +83,699 @@ void bli_sgemmtrsm_u_zen_asm_6x16 __asm__ volatile ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - " \n\t" - "addq $32 * 4, %%rbx \n\t" - " \n\t" // initialize loop by pre-loading - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - "movq %7, %%rcx \n\t" // load address of b11 - "movq $16, %%rdi \n\t" // set rs_b = PACKNR = 16 - "leaq (,%%rdi,4), %%rdi \n\t" // rs_b *= sizeof(float) - " \n\t" - " \n\t" // NOTE: c11, rs_c, and cs_c aren't - " \n\t" // needed for a while, but we load - " \n\t" // them now to avoid stalling later. 
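// The .DCOLSTORED path above transposes the 4x4 double sub-blocks of the
// accumulator tile entirely in registers before storing columns of c11.
// A minimal intrinsics sketch of that same transpose (an illustration only;
// the helper name and shape are assumptions, not code from this patch):

#include <immintrin.h>

static void transpose4x4_pd(__m256d r0, __m256d r1, __m256d r2, __m256d r3,
                            __m256d c[4])
{
    __m256d t0 = _mm256_unpacklo_pd(r0, r1);        // ( r00 r10 r02 r12 )
    __m256d t1 = _mm256_unpackhi_pd(r0, r1);        // ( r01 r11 r03 r13 )
    __m256d t2 = _mm256_unpacklo_pd(r2, r3);        // ( r20 r30 r22 r32 )
    __m256d t3 = _mm256_unpackhi_pd(r2, r3);        // ( r21 r31 r23 r33 )
    c[0] = _mm256_insertf128_pd(t0, _mm256_castpd256_pd128(t2), 1);
    c[1] = _mm256_insertf128_pd(t1, _mm256_castpd256_pd128(t3), 1);
    c[2] = _mm256_permute2f128_pd(t0, t2, 0x31);    // high lanes of t0, t2
    c[3] = _mm256_permute2f128_pd(t1, t3, 0x31);    // high lanes of t1, t3
}

// This mirrors the vunpcklpd/vunpckhpd/vinsertf128/vperm2f128 sequence in
// the kernel: two shuffle layers instead of a scalar gather/scatter.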
- "movq %8, %%r8 \n\t" // load address of c11 - "movq %9, %%r9 \n\t" // load rs_c - "leaq (,%%r9 ,4), %%r9 \n\t" // rs_c *= sizeof(float) - "movq %10, %%r10 \n\t" // load cs_c - "leaq (,%%r10,4), %%r10 \n\t" // cs_c *= sizeof(float) - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".SLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 64 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastss 6 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 7 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 8 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 10 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 76 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 12 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 14 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 16 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 17 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastss 18 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 19 * 4(%%rax), %%ymm3 \n\t" - 
"vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 20 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 21 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 22 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 23 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 6 * 4, %%rax \n\t" // a += 4*6 (unroll x mr) - "addq $4 * 16 * 4, %%rbx \n\t" // b += 4*16 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".SLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 64 * 4(%%rax) \n\t" - " \n\t" - "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" - "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" - "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 6 * 4, %%rax \n\t" // a += 1*6 (unroll x mr) - "addq $1 * 16 * 4, %%rbx \n\t" // b += 1*16 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .SLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".SPOSTACCUM: \n\t" - " \n\t" - " \n\t" // ymm4..ymm15 = -a10 * b01 - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of alpha - "vbroadcastss (%%rbx), %%ymm3 \n\t" // load alpha and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq $1, %%rsi \n\t" // load cs_b = 1 - "leaq (,%%rsi,4), %%rsi \n\t" // cs_b *= sizeof(float) - " \n\t" - "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // load address of b11 + 8*cs_b - " \n\t" - "movq %%rcx, %%r11 \n\t" // save rcx = b11 for later - "movq %%rdx, %%r14 \n\t" // save rdx = b11+8*cs_b for later - " \n\t" - " \n\t" - " \n\t" // b11 := alpha * b11 - a10 * b01 - "vfmsub231ps (%%rcx), %%ymm3, %%ymm4 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm5 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231ps (%%rcx), %%ymm3, %%ymm6 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm7 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231ps (%%rcx), %%ymm3, %%ymm8 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm9 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231ps (%%rcx), %%ymm3, %%ymm10 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm11 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231ps (%%rcx), %%ymm3, %%ymm12 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm13 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231ps (%%rcx), %%ymm3, %%ymm14 \n\t" - //"addq %%rdi, %%rcx \n\t" - "vfmsub231ps (%%rdx), %%ymm3, %%ymm15 \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // prefetch c11 - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + + add(imm(32*4), rbx) + // initialize loop by pre-loading + vmovaps(mem(rbx, -4*32), ymm0) + vmovaps(mem(rbx, -3*32), ymm1) + + mov(%7, rcx) // load address of b11 + mov(imm(16), rdi) // set rs_b = PACKNR = 16 + lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float) + + // NOTE: c11, rs_c, and cs_c aren't + // needed for a while, but we load + // them now to avoid stalling later. + mov(%8, r8) // load address of c11 + mov(%9, r9) // load rs_c + lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float) + mov(%10, r10) // load cs_c + lea(mem(, r10, 4), r10) // cs_c *= sizeof(float) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.SCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.SLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 64*4)) + + vbroadcastss(mem(rax, 0*4), ymm2) + vbroadcastss(mem(rax, 1*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 2*4), ymm2) + vbroadcastss(mem(rax, 3*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 4*4), ymm2) + vbroadcastss(mem(rax, 5*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, -2*32), ymm0) + vmovaps(mem(rbx, -1*32), ymm1) + + // iteration 1 + vbroadcastss(mem(rax, 6*4), ymm2) + vbroadcastss(mem(rax, 7*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 8*4), ymm2) + vbroadcastss(mem(rax, 9*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 10*4), ymm2) + vbroadcastss(mem(rax, 11*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, 0*32), ymm0) + vmovaps(mem(rbx, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 76*4)) + + vbroadcastss(mem(rax, 12*4), ymm2) + vbroadcastss(mem(rax, 13*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 14*4), ymm2) + vbroadcastss(mem(rax, 15*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 16*4), ymm2) + vbroadcastss(mem(rax, 17*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + vmovaps(mem(rbx, 2*32), ymm0) + vmovaps(mem(rbx, 3*32), ymm1) + + // iteration 3 + vbroadcastss(mem(rax, 18*4), ymm2) + vbroadcastss(mem(rax, 19*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 20*4), ymm2) + vbroadcastss(mem(rax, 21*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 22*4), ymm2) + vbroadcastss(mem(rax, 23*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + add(imm(4*6*4), rax) // a += 4*6 (unroll x mr) + add(imm(4*16*4), rbx) // b += 4*16 (unroll x nr) + + vmovaps(mem(rbx, -4*32), ymm0) + vmovaps(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.SLOOPKITER) // iterate again if i != 0. + + + + + + + label(.SCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.SPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
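// A note on the prefetch distance above (an inference, not stated in the
// patch): prefetch(0, mem(rax, 64*4)) issues prefetcht0 for the A stream
// 64 floats (256 bytes) ahead, roughly ten k-iterations at mr = 6. A
// C-level equivalent of the same hint:
#include <xmmintrin.h>

static void prefetch_a_ahead(const float* a)
{
    _mm_prefetch((const char*)(a + 64), _MM_HINT_T0);  // prefetcht0 64*4(rax)
}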
+ + + label(.SLOOPKLEFT) // EDGE LOOP + + prefetch(0, mem(rax, 64*4)) + + vbroadcastss(mem(rax, 0*4), ymm2) + vbroadcastss(mem(rax, 1*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm4) + vfmadd231ps(ymm1, ymm2, ymm5) + vfmadd231ps(ymm0, ymm3, ymm6) + vfmadd231ps(ymm1, ymm3, ymm7) + + vbroadcastss(mem(rax, 2*4), ymm2) + vbroadcastss(mem(rax, 3*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm8) + vfmadd231ps(ymm1, ymm2, ymm9) + vfmadd231ps(ymm0, ymm3, ymm10) + vfmadd231ps(ymm1, ymm3, ymm11) + + vbroadcastss(mem(rax, 4*4), ymm2) + vbroadcastss(mem(rax, 5*4), ymm3) + vfmadd231ps(ymm0, ymm2, ymm12) + vfmadd231ps(ymm1, ymm2, ymm13) + vfmadd231ps(ymm0, ymm3, ymm14) + vfmadd231ps(ymm1, ymm3, ymm15) + + add(imm(1*6*4), rax) // a += 1*6 (unroll x mr) + add(imm(1*16*4), rbx) // b += 1*16 (unroll x nr) + + vmovaps(mem(rbx, -4*32), ymm0) + vmovaps(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.SLOOPKLEFT) // iterate again if i != 0. + + + + label(.SPOSTACCUM) + + // ymm4..ymm15 = -a10 * b01 + + + + mov(%5, rbx) // load address of alpha + vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate + + + + + mov(imm(1), rsi) // load cs_b = 1 + lea(mem(, rsi, 4), rsi) // cs_b *= sizeof(float) + + lea(mem(rcx, rsi, 8), rdx) // load address of b11 + 8*cs_b + + mov(rcx, r11) // save rcx = b11 for later + mov(rdx, r14) // save rdx = b11+8*cs_b for later + + + // b11 := alpha * b11 - a10 * b01 + vfmsub231ps(mem(rcx), ymm3, ymm4) + add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm5) + add(rdi, rdx) + + vfmsub231ps(mem(rcx), ymm3, ymm6) + add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm7) + add(rdi, rdx) + + vfmsub231ps(mem(rcx), ymm3, ymm8) + add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm9) + add(rdi, rdx) + + vfmsub231ps(mem(rcx), ymm3, ymm10) + add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm11) + add(rdi, rdx) + + vfmsub231ps(mem(rcx), ymm3, ymm12) + add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm13) + add(rdi, rdx) + + vfmsub231ps(mem(rcx), ymm3, ymm14) + //add(rdi, rcx) + vfmsub231ps(mem(rdx), ymm3, ymm15) + //add(rdi, rdx) + + + + // prefetch c11 + #if 0 - "movq %%r8, %%rcx \n\t" // load address of c11 from r8 - " \n\t" // Note: r9 = rs_c * sizeof(float) - " \n\t" - "leaq (%%r9 ,%%r9 ,2), %%r13 \n\t" // r13 = 3*rs_c; - "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c11 + 3*rs_c; - " \n\t" - "prefetcht0 0 * 8(%%rcx) \n\t" // prefetch c11 + 0*rs_c - "prefetcht0 0 * 8(%%rcx,%%r9 ) \n\t" // prefetch c11 + 1*rs_c - "prefetcht0 0 * 8(%%rcx,%%r9 ,2) \n\t" // prefetch c11 + 2*rs_c - "prefetcht0 0 * 8(%%rdx) \n\t" // prefetch c11 + 3*rs_c - "prefetcht0 0 * 8(%%rdx,%%r9 ) \n\t" // prefetch c11 + 4*rs_c - "prefetcht0 0 * 8(%%rdx,%%r9 ,2) \n\t" // prefetch c11 + 5*rs_c + mov(r8, rcx) // load address of c11 from r8 + // Note: r9 = rs_c * sizeof(float) + + lea(mem(r9 , r9 , 2), r13) // r13 = 3*rs_c; + lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c; + + prefetch(0, mem(rcx, 0*8)) // prefetch c11 + 0*rs_c + prefetch(0, mem(rcx, r9, 1, 0*8)) // prefetch c11 + 1*rs_c + prefetch(0, mem(rcx, r9 , 2, 0*8)) // prefetch c11 + 2*rs_c + prefetch(0, mem(rdx, 0*8)) // prefetch c11 + 3*rs_c + prefetch(0, mem(rdx, r9, 1, 0*8)) // prefetch c11 + 4*rs_c + prefetch(0, mem(rdx, r9 , 2, 0*8)) // prefetch c11 + 5*rs_c #endif - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // trsm computation begins here - " \n\t" - " \n\t" // Note: contents of b11 are stored as - " \n\t" // ymm4 ymm5 = ( beta00..07 ) ( beta08..0F ) - " \n\t" // ymm6 ymm7 = ( beta10..17 ) ( beta18..1F ) - " \n\t" // ymm8 ymm9 = ( beta20..27 ) ( beta28..2F ) - " \n\t" // ymm10 ymm11 = ( 
beta30..37 ) ( beta38..3F ) - " \n\t" // ymm12 ymm13 = ( beta40..47 ) ( beta48..4F ) - " \n\t" // ymm14 ymm15 = ( beta50..57 ) ( beta58..5F ) - " \n\t" - " \n\t" - "movq %6, %%rax \n\t" // load address of a11 - " \n\t" - "movq %%r11, %%rcx \n\t" // recall address of b11 - "movq %%r14, %%rdx \n\t" // recall address of b11+8*cs_b - " \n\t" - "leaq (%%rcx,%%rdi,4), %%rcx \n\t" // rcx = b11 + (6-1)*rs_b - "leaq (%%rcx,%%rdi,1), %%rcx \n\t" - "leaq (%%rdx,%%rdi,4), %%rdx \n\t" // rdx = b11 + (6-1)*rs_b + 8*cs_b - "leaq (%%rdx,%%rdi,1), %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 0 ------------- - " \n\t" - "vbroadcastss (5+5*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = (1/alpha55) - " \n\t" - "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t" // ymm14 *= (1/alpha55) - "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t" // ymm15 *= (1/alpha55) - " \n\t" - "vmovups %%ymm14, (%%rcx) \n\t" // store ( beta50..beta57 ) = ymm14 - "vmovups %%ymm15, (%%rdx) \n\t" // store ( beta58..beta5F ) = ymm15 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" // iteration 1 ------------- - " \n\t" - "vbroadcastss (4+5*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha45 - "vbroadcastss (4+4*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = (1/alpha44) - " \n\t" - "vmulps %%ymm0, %%ymm14, %%ymm2 \n\t" // ymm2 = alpha45 * ymm14 - "vmulps %%ymm0, %%ymm15, %%ymm3 \n\t" // ymm3 = alpha45 * ymm15 - " \n\t" - "vsubps %%ymm2, %%ymm12, %%ymm12 \n\t" // ymm12 -= ymm2 - "vsubps %%ymm3, %%ymm13, %%ymm13 \n\t" // ymm13 -= ymm3 - " \n\t" - "vmulps %%ymm12, %%ymm1, %%ymm12 \n\t" // ymm12 *= (1/alpha44) - "vmulps %%ymm13, %%ymm1, %%ymm13 \n\t" // ymm13 *= (1/alpha44) - " \n\t" - "vmovups %%ymm12, (%%rcx) \n\t" // store ( beta40..beta47 ) = ymm12 - "vmovups %%ymm13, (%%rdx) \n\t" // store ( beta48..beta4F ) = ymm13 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" // iteration 2 ------------- - " \n\t" - "vbroadcastss (3+5*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha35 - "vbroadcastss (3+4*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha34 - " \n\t" - "vmulps %%ymm0, %%ymm14, %%ymm2 \n\t" // ymm2 = alpha35 * ymm14 - "vmulps %%ymm0, %%ymm15, %%ymm3 \n\t" // ymm3 = alpha35 * ymm15 - " \n\t" - "vbroadcastss (3+3*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = (1/alpha33) - " \n\t" - "vfmadd231ps %%ymm1, %%ymm12, %%ymm2 \n\t" // ymm2 += alpha34 * ymm12 - "vfmadd231ps %%ymm1, %%ymm13, %%ymm3 \n\t" // ymm3 += alpha34 * ymm13 - " \n\t" - "vsubps %%ymm2, %%ymm10, %%ymm10 \n\t" // ymm10 -= ymm2 - "vsubps %%ymm3, %%ymm11, %%ymm11 \n\t" // ymm11 -= ymm3 - " \n\t" - "vmulps %%ymm10, %%ymm0, %%ymm10 \n\t" // ymm10 *= (1/alpha33) - "vmulps %%ymm11, %%ymm0, %%ymm11 \n\t" // ymm11 *= (1/alpha33) - " \n\t" - "vmovups %%ymm10, (%%rcx) \n\t" // store ( beta30..beta37 ) = ymm10 - "vmovups %%ymm11, (%%rdx) \n\t" // store ( beta38..beta3F ) = ymm11 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" // iteration 3 ------------- - " \n\t" - "vbroadcastss (2+5*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha25 - "vbroadcastss (2+4*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha24 - " \n\t" - "vmulps %%ymm0, %%ymm14, %%ymm2 \n\t" // ymm2 = alpha25 * ymm14 - "vmulps %%ymm0, %%ymm15, %%ymm3 \n\t" // ymm3 = alpha25 * ymm15 - " \n\t" - "vbroadcastss (2+3*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha23 - " \n\t" - "vfmadd231ps %%ymm1, %%ymm12, %%ymm2 \n\t" // ymm2 += alpha24 * ymm12 - "vfmadd231ps %%ymm1, %%ymm13, %%ymm3 \n\t" // ymm3 += alpha24 * ymm13 - " \n\t" - "vbroadcastss 
(2+2*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = (1/alpha22) - " \n\t" - "vfmadd231ps %%ymm0, %%ymm10, %%ymm2 \n\t" // ymm2 += alpha23 * ymm10 - "vfmadd231ps %%ymm0, %%ymm11, %%ymm3 \n\t" // ymm3 += alpha23 * ymm11 - " \n\t" - "vsubps %%ymm2, %%ymm8, %%ymm8 \n\t" // ymm8 -= ymm2 - "vsubps %%ymm3, %%ymm9, %%ymm9 \n\t" // ymm9 -= ymm3 - " \n\t" - "vmulps %%ymm8, %%ymm1, %%ymm8 \n\t" // ymm8 *= (1/alpha33) - "vmulps %%ymm9, %%ymm1, %%ymm9 \n\t" // ymm9 *= (1/alpha33) - " \n\t" - "vmovups %%ymm8, (%%rcx) \n\t" // store ( beta20..beta27 ) = ymm8 - "vmovups %%ymm9, (%%rdx) \n\t" // store ( beta28..beta2F ) = ymm9 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" // iteration 4 ------------- - " \n\t" - "vbroadcastss (1+5*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha15 - "vbroadcastss (1+4*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha14 - " \n\t" - "vmulps %%ymm0, %%ymm14, %%ymm2 \n\t" // ymm2 = alpha15 * ymm14 - "vmulps %%ymm0, %%ymm15, %%ymm3 \n\t" // ymm3 = alpha15 * ymm15 - " \n\t" - "vbroadcastss (1+3*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha13 - " \n\t" - "vfmadd231ps %%ymm1, %%ymm12, %%ymm2 \n\t" // ymm2 += alpha14 * ymm12 - "vfmadd231ps %%ymm1, %%ymm13, %%ymm3 \n\t" // ymm3 += alpha14 * ymm13 - " \n\t" - "vbroadcastss (1+2*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha12 - " \n\t" - "vfmadd231ps %%ymm0, %%ymm10, %%ymm2 \n\t" // ymm2 += alpha13 * ymm10 - "vfmadd231ps %%ymm0, %%ymm11, %%ymm3 \n\t" // ymm3 += alpha13 * ymm11 - " \n\t" - "vbroadcastss (1+1*6)*4(%%rax), %%ymm0 \n\t" // ymm4 = (1/alpha11) - " \n\t" - "vfmadd231ps %%ymm1, %%ymm8, %%ymm2 \n\t" // ymm2 += alpha12 * ymm8 - "vfmadd231ps %%ymm1, %%ymm9, %%ymm3 \n\t" // ymm3 += alpha12 * ymm9 - " \n\t" - "vsubps %%ymm2, %%ymm6, %%ymm6 \n\t" // ymm6 -= ymm2 - "vsubps %%ymm3, %%ymm7, %%ymm7 \n\t" // ymm7 -= ymm3 - " \n\t" - "vmulps %%ymm6, %%ymm0, %%ymm6 \n\t" // ymm6 *= (1/alpha44) - "vmulps %%ymm7, %%ymm0, %%ymm7 \n\t" // ymm7 *= (1/alpha44) - " \n\t" - "vmovups %%ymm6, (%%rcx) \n\t" // store ( beta10..beta17 ) = ymm6 - "vmovups %%ymm7, (%%rdx) \n\t" // store ( beta18..beta1F ) = ymm7 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" // iteration 5 ------------- - " \n\t" - "vbroadcastss (0+5*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha05 - "vbroadcastss (0+4*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha04 - " \n\t" - "vmulps %%ymm0, %%ymm14, %%ymm2 \n\t" // ymm2 = alpha05 * ymm14 - "vmulps %%ymm0, %%ymm15, %%ymm3 \n\t" // ymm3 = alpha05 * ymm15 - " \n\t" - "vbroadcastss (0+3*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha03 - " \n\t" - "vfmadd231ps %%ymm1, %%ymm12, %%ymm2 \n\t" // ymm2 += alpha04 * ymm12 - "vfmadd231ps %%ymm1, %%ymm13, %%ymm3 \n\t" // ymm3 += alpha04 * ymm13 - " \n\t" - "vbroadcastss (0+2*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = alpha02 - " \n\t" - "vfmadd231ps %%ymm0, %%ymm10, %%ymm2 \n\t" // ymm2 += alpha03 * ymm10 - "vfmadd231ps %%ymm0, %%ymm11, %%ymm3 \n\t" // ymm3 += alpha03 * ymm11 - " \n\t" - "vbroadcastss (0+1*6)*4(%%rax), %%ymm0 \n\t" // ymm0 = alpha01 - " \n\t" - "vfmadd231ps %%ymm1, %%ymm8, %%ymm2 \n\t" // ymm2 += alpha02 * ymm8 - "vfmadd231ps %%ymm1, %%ymm9, %%ymm3 \n\t" // ymm3 += alpha02 * ymm9 - " \n\t" - "vbroadcastss (0+0*6)*4(%%rax), %%ymm1 \n\t" // ymm1 = (1/alpha00) - " \n\t" - "vfmadd231ps %%ymm0, %%ymm6, %%ymm2 \n\t" // ymm2 += alpha01 * ymm6 - "vfmadd231ps %%ymm0, %%ymm7, %%ymm3 \n\t" // ymm3 += alpha01 * ymm7 - " \n\t" - "vsubps %%ymm2, %%ymm4, %%ymm4 \n\t" // ymm4 -= ymm2 - "vsubps %%ymm3, %%ymm5, %%ymm5 \n\t" // ymm5 -= 
ymm3 - " \n\t" - "vmulps %%ymm4, %%ymm1, %%ymm4 \n\t" // ymm4 *= (1/alpha00) - "vmulps %%ymm5, %%ymm1, %%ymm5 \n\t" // ymm5 *= (1/alpha00) - " \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" // store ( beta00..beta07 ) = ymm4 - "vmovups %%ymm5, (%%rdx) \n\t" // store ( beta08..beta0F ) = ymm5 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %%r8, %%rcx \n\t" // load address of c11 from r8 - "movq %%r9, %%rdi \n\t" // load rs_c (in bytes) from r9 - "movq %%r10, %%rsi \n\t" // load cs_c (in bytes) from r10 - " \n\t" - "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // load address of c11 + 8*cs_c; - "leaq (%%rcx,%%rdi,4), %%r14 \n\t" // load address of c11 + 4*rs_c; - " \n\t" - " \n\t" // These are used in the macros below. - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; - "leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; - "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; - " \n\t" - " \n\t" - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. - "jz .SROWSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - "cmpq $4, %%rdi \n\t" // set ZF if (4*rs_c) == 4. - "jz .SCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - " \n\t" // if neither row- or column- - " \n\t" // stored, use general case. - ".SGENSTORED: \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm4, %%ymm0 \n\t" + + + + + // trsm computation begins here + + // Note: contents of b11 are stored as + // ymm4 ymm5 = ( beta00..07 ) ( beta08..0F ) + // ymm6 ymm7 = ( beta10..17 ) ( beta18..1F ) + // ymm8 ymm9 = ( beta20..27 ) ( beta28..2F ) + // ymm10 ymm11 = ( beta30..37 ) ( beta38..3F ) + // ymm12 ymm13 = ( beta40..47 ) ( beta48..4F ) + // ymm14 ymm15 = ( beta50..57 ) ( beta58..5F ) + + + mov(%6, rax) // load address of a11 + + mov(r11, rcx) // recall address of b11 + mov(r14, rdx) // recall address of b11+8*cs_b + + lea(mem(rcx, rdi, 4), rcx) // rcx = b11 + (6-1)*rs_b + lea(mem(rcx, rdi, 1), rcx) + lea(mem(rdx, rdi, 4), rdx) // rdx = b11 + (6-1)*rs_b + 8*cs_b + lea(mem(rdx, rdi, 1), rdx) + + + // iteration 0 ------------- + + vbroadcastss(mem(5+5*6)*4(rax), ymm0) // ymm0 = (1/alpha55) + + vmulps(ymm0, ymm14, ymm14) // ymm14 *= (1/alpha55) + vmulps(ymm0, ymm15, ymm15) // ymm15 *= (1/alpha55) + + vmovups(ymm14, mem(rcx)) // store ( beta50..beta57 ) = ymm14 + vmovups(ymm15, mem(rdx)) // store ( beta58..beta5F ) = ymm15 + sub(rdi, rcx) // rcx -= rs_b + sub(rdi, rdx) // rdx -= rs_b + + // iteration 1 ------------- + + vbroadcastss(mem(4+5*6)*4(rax), ymm0) // ymm0 = alpha45 + vbroadcastss(mem(4+4*6)*4(rax), ymm1) // ymm1 = (1/alpha44) + + vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha45 * ymm14 + vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha45 * ymm15 + + vsubps(ymm2, ymm12, ymm12) // ymm12 -= ymm2 + vsubps(ymm3, ymm13, ymm13) // ymm13 -= ymm3 + + vmulps(ymm12, ymm1, ymm12) // ymm12 *= (1/alpha44) + vmulps(ymm13, ymm1, ymm13) // ymm13 *= (1/alpha44) + + vmovups(ymm12, mem(rcx)) // store ( beta40..beta47 ) = ymm12 + vmovups(ymm13, mem(rdx)) // store ( beta48..beta4F ) = ymm13 + sub(rdi, rcx) // rcx -= rs_b + sub(rdi, rdx) // rdx -= rs_b + + // iteration 2 ------------- + + vbroadcastss(mem(3+5*6)*4(rax), ymm0) // ymm0 = alpha35 + vbroadcastss(mem(3+4*6)*4(rax), ymm1) // ymm1 = alpha34 + + vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha35 * ymm14 + vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha35 * ymm15 + + vbroadcastss(mem(3+3*6)*4(rax), ymm0) // ymm0 = (1/alpha33) + + vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha34 * ymm12 + 
vfmadd231ps(ymm1, ymm13, ymm3) // ymm3 += alpha34 * ymm13 + + vsubps(ymm2, ymm10, ymm10) // ymm10 -= ymm2 + vsubps(ymm3, ymm11, ymm11) // ymm11 -= ymm3 + + vmulps(ymm10, ymm0, ymm10) // ymm10 *= (1/alpha33) + vmulps(ymm11, ymm0, ymm11) // ymm11 *= (1/alpha33) + + vmovups(ymm10, mem(rcx)) // store ( beta30..beta37 ) = ymm10 + vmovups(ymm11, mem(rdx)) // store ( beta38..beta3F ) = ymm11 + sub(rdi, rcx) // rcx -= rs_b + sub(rdi, rdx) // rdx -= rs_b + + // iteration 3 ------------- + + vbroadcastss(mem(2+5*6)*4(rax), ymm0) // ymm0 = alpha25 + vbroadcastss(mem(2+4*6)*4(rax), ymm1) // ymm1 = alpha24 + + vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha25 * ymm14 + vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha25 * ymm15 + + vbroadcastss(mem(2+3*6)*4(rax), ymm0) // ymm0 = alpha23 + + vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha24 * ymm12 + vfmadd231ps(ymm1, ymm13, ymm3) // ymm3 += alpha24 * ymm13 + + vbroadcastss(mem(2+2*6)*4(rax), ymm1) // ymm1 = (1/alpha22) + + vfmadd231ps(ymm0, ymm10, ymm2) // ymm2 += alpha23 * ymm10 + vfmadd231ps(ymm0, ymm11, ymm3) // ymm3 += alpha23 * ymm11 + + vsubps(ymm2, ymm8, ymm8) // ymm8 -= ymm2 + vsubps(ymm3, ymm9, ymm9) // ymm9 -= ymm3 + + vmulps(ymm8, ymm1, ymm8) // ymm8 *= (1/alpha22) + vmulps(ymm9, ymm1, ymm9) // ymm9 *= (1/alpha22) + + vmovups(ymm8, mem(rcx)) // store ( beta20..beta27 ) = ymm8 + vmovups(ymm9, mem(rdx)) // store ( beta28..beta2F ) = ymm9 + sub(rdi, rcx) // rcx -= rs_b + sub(rdi, rdx) // rdx -= rs_b + + // iteration 4 ------------- + + vbroadcastss(mem(1+5*6)*4(rax), ymm0) // ymm0 = alpha15 + vbroadcastss(mem(1+4*6)*4(rax), ymm1) // ymm1 = alpha14 + + vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha15 * ymm14 + vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha15 * ymm15 + + vbroadcastss(mem(1+3*6)*4(rax), ymm0) // ymm0 = alpha13 + + vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha14 * ymm12 + vfmadd231ps(ymm1, ymm13, ymm3) // ymm3 += alpha14 * ymm13 + + vbroadcastss(mem(1+2*6)*4(rax), ymm1) // ymm1 = alpha12 + + vfmadd231ps(ymm0, ymm10, ymm2) // ymm2 += alpha13 * ymm10 + vfmadd231ps(ymm0, ymm11, ymm3) // ymm3 += alpha13 * ymm11 + + vbroadcastss(mem(1+1*6)*4(rax), ymm0) // ymm0 = (1/alpha11) + + vfmadd231ps(ymm1, ymm8, ymm2) // ymm2 += alpha12 * ymm8 + vfmadd231ps(ymm1, ymm9, ymm3) // ymm3 += alpha12 * ymm9 + + vsubps(ymm2, ymm6, ymm6) // ymm6 -= ymm2 + vsubps(ymm3, ymm7, ymm7) // ymm7 -= ymm3 + + vmulps(ymm6, ymm0, ymm6) // ymm6 *= (1/alpha11) + vmulps(ymm7, ymm0, ymm7) // ymm7 *= (1/alpha11) + + vmovups(ymm6, mem(rcx)) // store ( beta10..beta17 ) = ymm6 + vmovups(ymm7, mem(rdx)) // store ( beta18..beta1F ) = ymm7 + sub(rdi, rcx) // rcx -= rs_b + sub(rdi, rdx) // rdx -= rs_b + + // iteration 5 ------------- + + vbroadcastss(mem(0+5*6)*4(rax), ymm0) // ymm0 = alpha05 + vbroadcastss(mem(0+4*6)*4(rax), ymm1) // ymm1 = alpha04 + + vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha05 * ymm14 + vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha05 * ymm15 + + vbroadcastss(mem(0+3*6)*4(rax), ymm0) // ymm0 = alpha03 + + vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha04 * ymm12 + vfmadd231ps(ymm1, ymm13, ymm3) // ymm3 += alpha04 * ymm13 + + vbroadcastss(mem(0+2*6)*4(rax), ymm1) // ymm1 = alpha02 + + vfmadd231ps(ymm0, ymm10, ymm2) // ymm2 += alpha03 * ymm10 + vfmadd231ps(ymm0, ymm11, ymm3) // ymm3 += alpha03 * ymm11 + + vbroadcastss(mem(0+1*6)*4(rax), ymm0) // ymm0 = alpha01 + + vfmadd231ps(ymm1, ymm8, ymm2) // ymm2 += alpha02 * ymm8 + vfmadd231ps(ymm1, ymm9, ymm3) // ymm3 += alpha02 * ymm9 + + vbroadcastss(mem(0+0*6)*4(rax), ymm1) // ymm1 = (1/alpha00) + + vfmadd231ps(ymm0, ymm6, ymm2) // ymm2 +=
alpha01 * ymm6 + vfmadd231ps(ymm0, ymm7, ymm3) // ymm3 += alpha01 * ymm7 + + vsubps(ymm2, ymm4, ymm4) // ymm4 -= ymm2 + vsubps(ymm3, ymm5, ymm5) // ymm5 -= ymm3 + + vmulps(ymm4, ymm1, ymm4) // ymm4 *= (1/alpha00) + vmulps(ymm5, ymm1, ymm5) // ymm5 *= (1/alpha00) + + vmovups(ymm4, mem(rcx)) // store ( beta00..beta07 ) = ymm4 + vmovups(ymm5, mem(rdx)) // store ( beta08..beta0F ) = ymm5 + sub(rdi, rcx) // rcx -= rs_b + sub(rdi, rdx) // rdx -= rs_b + + + + + + mov(r8, rcx) // load address of c11 from r8 + mov(r9, rdi) // load rs_c (in bytes) from r9 + mov(r10, rsi) // load cs_c (in bytes) from r10 + + lea(mem(rcx, rsi, 8), rdx) // load address of c11 + 8*cs_c; + lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; + + // These are used in the macros below. + lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; + lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; + lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; + + + + cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. + jz(.SROWSTORED) // jump to row storage case + + + + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. + jz(.SCOLSTORED) // jump to column storage case + + + + // if neither row- or column- + // stored, use general case. + label(.SGENSTORED) + + + vmovaps(ymm4, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm6, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm6, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm8, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm8, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm10, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm10, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm12, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm12, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm14, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm14, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c11 + 8*cs_c - " \n\t" - " \n\t" - "vmovaps %%ymm5, %%ymm0 \n\t" + + + mov(rdx, rcx) // rcx = c11 + 8*cs_c + + + vmovaps(ymm5, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm7, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm7, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm9, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm9, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm11, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm11, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm13, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm13, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm15, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovaps(ymm15, ymm0) SGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SROWSTORED: \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx 
\n\t" - "vmovups %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovups %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "jmp .SDONE \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vunpcklps %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpcklps %%ymm10, %%ymm8, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovups %%xmm0, (%%rcx ) \n\t" // store ( gamma00..gamma30 ) - "vmovups %%xmm1, (%%rcx,%%rsi,1) \n\t" // store ( gamma01..gamma31 ) - "vmovups %%xmm2, (%%rcx,%%rsi,4) \n\t" // store ( gamma04..gamma34 ) - "vmovups %%xmm3, (%%rcx,%%r15 ) \n\t" // store ( gamma05..gamma35 ) - " \n\t" - " \n\t" - "vunpckhps %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpckhps %%ymm10, %%ymm8, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovups %%xmm0, (%%rcx,%%rsi,2) \n\t" // store ( gamma02..gamma32 ) - "vmovups %%xmm1, (%%rcx,%%r13 ) \n\t" // store ( gamma03..gamma33 ) - "vmovups %%xmm2, (%%rcx,%%r13,2) \n\t" // store ( gamma06..gamma36 ) - "vmovups %%xmm3, (%%rcx,%%r10 ) \n\t" // store ( gamma07..gamma37 ) - " \n\t" - "leaq (%%rcx,%%rsi,8), %%rcx \n\t" // rcx += 8*cs_c - " \n\t" - "vunpcklps %%ymm14, %%ymm12, %%ymm0 \n\t" - "vunpckhps %%ymm14, %%ymm12, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovlpd %%xmm0, (%%r14 ) \n\t" // store ( gamma40..gamma50 ) - "vmovhpd %%xmm0, (%%r14,%%rsi,1) \n\t" // store ( gamma41..gamma51 ) - "vmovlpd %%xmm1, (%%r14,%%rsi,2) \n\t" // store ( gamma42..gamma52 ) - "vmovhpd %%xmm1, (%%r14,%%r13 ) \n\t" // store ( gamma43..gamma53 ) - "vmovlpd %%xmm2, (%%r14,%%rsi,4) \n\t" // store ( gamma44..gamma54 ) - "vmovhpd %%xmm2, (%%r14,%%r15 ) \n\t" // store ( gamma45..gamma55 ) - "vmovlpd %%xmm3, (%%r14,%%r13,2) \n\t" // store ( gamma46..gamma56 ) - "vmovhpd %%xmm3, (%%r14,%%r10 ) \n\t" // store ( gamma47..gamma57 ) - " \n\t" - "leaq (%%r14,%%rsi,8), %%r14 \n\t" // r14 += 8*cs_c - " \n\t" - " \n\t" - "vunpcklps %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpcklps %%ymm11, %%ymm9, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovups %%xmm0, (%%rcx ) \n\t" // store ( gamma08..gamma38 ) - "vmovups %%xmm1, (%%rcx,%%rsi,1) \n\t" // store ( gamma09..gamma39 ) - "vmovups %%xmm2, (%%rcx,%%rsi,4) \n\t" // store ( gamma0C..gamma3C ) - "vmovups %%xmm3, (%%rcx,%%r15 ) \n\t" // store ( gamma0D..gamma3D ) - " \n\t" - "vunpckhps %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpckhps %%ymm11, 
%%ymm9, %%ymm1 \n\t" - "vshufps $0x4e, %%ymm1, %%ymm0, %%ymm2 \n\t" - "vblendps $0xcc, %%ymm2, %%ymm0, %%ymm0 \n\t" - "vblendps $0x33, %%ymm2, %%ymm1, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovups %%xmm0, (%%rcx,%%rsi,2) \n\t" // store ( gamma0A..gamma3A ) - "vmovups %%xmm1, (%%rcx,%%r13 ) \n\t" // store ( gamma0B..gamma3B ) - "vmovups %%xmm2, (%%rcx,%%r13,2) \n\t" // store ( gamma0E..gamma3E ) - "vmovups %%xmm3, (%%rcx,%%r10 ) \n\t" // store ( gamma0F..gamma3F ) - " \n\t" - //"leaq (%%rcx,%%rsi,8), %%rcx \n\t" // rcx += 8*cs_c - " \n\t" - "vunpcklps %%ymm15, %%ymm13, %%ymm0 \n\t" - "vunpckhps %%ymm15, %%ymm13, %%ymm1 \n\t" - " \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovlpd %%xmm0, (%%r14 ) \n\t" // store ( gamma48..gamma58 ) - "vmovhpd %%xmm0, (%%r14,%%rsi,1) \n\t" // store ( gamma49..gamma59 ) - "vmovlpd %%xmm1, (%%r14,%%rsi,2) \n\t" // store ( gamma4A..gamma5A ) - "vmovhpd %%xmm1, (%%r14,%%r13 ) \n\t" // store ( gamma4B..gamma5B ) - "vmovlpd %%xmm2, (%%r14,%%rsi,4) \n\t" // store ( gamma4C..gamma5C ) - "vmovhpd %%xmm2, (%%r14,%%r15 ) \n\t" // store ( gamma4D..gamma5D ) - "vmovlpd %%xmm3, (%%r14,%%r13,2) \n\t" // store ( gamma4E..gamma5E ) - "vmovhpd %%xmm3, (%%r14,%%r10 ) \n\t" // store ( gamma4F..gamma5F ) - " \n\t" - //"leaq (%%r14,%%rsi,8), %%r14 \n\t" // r14 += 8*cs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".SDONE: \n\t" - " \n\t" - "vzeroupper \n\t" - " \n\t" + + + + jmp(.SDONE) + + + + label(.SROWSTORED) + + + vmovups(ymm4, mem(rcx)) + add(rdi, rcx) + vmovups(ymm5, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm6, mem(rcx)) + add(rdi, rcx) + vmovups(ymm7, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm8, mem(rcx)) + add(rdi, rcx) + vmovups(ymm9, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm10, mem(rcx)) + add(rdi, rcx) + vmovups(ymm11, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm12, mem(rcx)) + add(rdi, rcx) + vmovups(ymm13, mem(rdx)) + add(rdi, rdx) + + vmovups(ymm14, mem(rcx)) + //add(rdi, rcx) + vmovups(ymm15, mem(rdx)) + //add(rdi, rdx) + + + jmp(.SDONE) + + + + label(.SCOLSTORED) + + + vunpcklps(ymm6, ymm4, ymm0) + vunpcklps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) + + + vunpckhps(ymm6, ymm4, ymm0) + vunpckhps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) + vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) + vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) + + lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + + vunpcklps(ymm14, ymm12, ymm0) + vunpckhps(ymm14, ymm12, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) + vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) + vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( 
gamma42..gamma52 ) + vmovhpd(xmm1, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) + vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) + vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) + vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) + vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) + + lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c + + + vunpcklps(ymm7, ymm5, ymm0) + vunpcklps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovups(xmm0, mem(rcx)) // store ( gamma08..gamma38 ) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma09..gamma39 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma0C..gamma3C ) + vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma0D..gamma3D ) + + vunpckhps(ymm7, ymm5, ymm0) + vunpckhps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma0A..gamma3A ) + vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma0B..gamma3B ) + vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma0E..gamma3E ) + vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma0F..gamma3F ) + + //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + + vunpcklps(ymm15, ymm13, ymm0) + vunpckhps(ymm15, ymm13, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovlpd(xmm0, mem(r14)) // store ( gamma48..gamma58 ) + vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma49..gamma59 ) + vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( gamma4A..gamma5A ) + vmovhpd(xmm1, mem(r14, r13, 1)) // store ( gamma4B..gamma5B ) + vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma4C..gamma5C ) + vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma4D..gamma5D ) + vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma4E..gamma5E ) + vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma4F..gamma5F ) + + //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c + + + + + label(.SDONE) + + vzeroupper() + : // output operands (none) : // input operands @@ -801,16 +804,16 @@ void bli_sgemmtrsm_u_zen_asm_6x16 #define DGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ - "vmovlpd %%xmm0, (%%rcx ) \n\t" \ - "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ - "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ - "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ + vextractf128(imm(1), ymm0, xmm1) \ + vmovlpd(xmm0, mem(rcx)) \ + vmovhpd(xmm0, mem(rcx, rsi, 1)) \ + vmovlpd(xmm1, mem(rcx, rsi, 2)) \ + vmovhpd(xmm1, mem(rcx, r13, 1)) /*\ + vextractf128(imm(1), ymm2, xmm1) \ + vmovlpd(xmm2, mem(rcx, rsi, 4)) \ + vmovhpd(xmm2, mem(rcx, r15, 1)) \ + vmovlpd(xmm1, mem(rcx, r13, 2)) \ + vmovhpd(xmm1, mem(rcx, r10, 1))*/ void bli_dgemmtrsm_u_zen_asm_6x8 ( @@ -839,660 +842,660 @@ void bli_dgemmtrsm_u_zen_asm_6x8 __asm__ volatile ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. 
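// Hedged scalar reference (illustration only; per the "(1/alphaII)" comments
// above, the packing step stores the inverted diagonal of a11) for the
// back-substitution that iterations 0..5 of these _u kernels implement:
// rows of b11 are solved bottom-up against the upper triangle of a11.
static void trsm_u_ref(int m, int n, const double* a11 /* m x m, col-major,
                       diagonal holds 1/alpha(i,i) */, double* b11, long rs_b)
{
    for (int i = m - 1; i >= 0; i--)
        for (int j = 0; j < n; j++)
        {
            double acc = 0.0;                          // ymm2/ymm3 above
            for (int l = i + 1; l < m; l++)            // alpha(i,l), l > i
                acc += a11[i + l*m] * b11[l*rs_b + j]; // vfmadd231
            b11[i*rs_b + j] = (b11[i*rs_b + j] - acc)  // vsub
                            * a11[i + i*m];            // * (1/alpha(i,i))
        }
}

// The (i + l*6)*4 displacements in the kernels above are exactly the
// a11[i + l*m] column-major indexing here, scaled by the element size.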
- " \n\t" - "addq $32 * 4, %%rbx \n\t" - " \n\t" // initialize loop by pre-loading - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - "movq %7, %%rcx \n\t" // load address of b11 - "movq $8, %%rdi \n\t" // set rs_b = PACKNR = 8 - "leaq (,%%rdi,8), %%rdi \n\t" // rs_b *= sizeof(double) - " \n\t" - " \n\t" // NOTE: c11, rs_c, and cs_c aren't - " \n\t" // needed for a while, but we load - " \n\t" // them now to avoid stalling later. - "movq %8, %%r8 \n\t" // load address of c11 - "movq %9, %%r9 \n\t" // load rs_c - "leaq (,%%r9 ,8), %%r9 \n\t" // rs_c *= sizeof(double) - "movq %10, %%r10 \n\t" // load cs_c - "leaq (,%%r10,8), %%r10 \n\t" // cs_c *= sizeof(double) - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 64 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd -2 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 0 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 76 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - 
"vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovapd 2 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd 3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) - "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 64 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) - "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) - " \n\t" - "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - " \n\t" // ymm4..ymm15 = -a10 * b01 - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %5, %%rbx \n\t" // load address of alpha - "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load alpha and duplicate - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq $1, %%rsi \n\t" // set cs_b = 1 - "leaq (,%%rsi,8), %%rsi \n\t" // cs_b *= sizeof(double) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of b11 + 4*cs_b - " \n\t" - "movq %%rcx, %%r11 \n\t" // save rcx = b11 for later - "movq %%rdx, %%r14 \n\t" // save rdx = b11+4*cs_b for later - " \n\t" - " \n\t" - " \n\t" // b11 := alpha * b11 - a10 * b01 - "vfmsub231pd (%%rcx), %%ymm3, %%ymm4 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm5 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231pd (%%rcx), %%ymm3, %%ymm6 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm7 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231pd (%%rcx), %%ymm3, %%ymm8 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm9 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231pd (%%rcx), %%ymm3, %%ymm10 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm11 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231pd (%%rcx), %%ymm3, %%ymm12 \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm13 \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vfmsub231pd (%%rcx), %%ymm3, %%ymm14 \n\t" - //"addq %%rdi, %%rcx \n\t" - "vfmsub231pd (%%rdx), %%ymm3, %%ymm15 \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // prefetch c11 - " \n\t" + + vzeroall() // zero all xmm/ymm registers. + + + mov(%2, rax) // load address of a. + mov(%3, rbx) // load address of b. + + add(imm(32*4), rbx) + // initialize loop by pre-loading + vmovapd(mem(rbx, -4*32), ymm0) + vmovapd(mem(rbx, -3*32), ymm1) + + mov(%7, rcx) // load address of b11 + mov(imm(8), rdi) // set rs_b = PACKNR = 8 + lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double) + + // NOTE: c11, rs_c, and cs_c aren't + // needed for a while, but we load + // them now to avoid stalling later. + mov(%8, r8) // load address of c11 + mov(%9, r9) // load rs_c + lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double) + mov(%10, r10) // load cs_c + lea(mem(, r10, 8), r10) // cs_c *= sizeof(double) + + + + mov(%0, rsi) // i = k_iter; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT) // if i == 0, jump to code that + // contains the k_left loop. 
+ + + label(.DLOOPKITER) // MAIN LOOP + + + // iteration 0 + prefetch(0, mem(rax, 64*8)) + + vbroadcastsd(mem(rax, 0*8), ymm2) + vbroadcastsd(mem(rax, 1*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 2*8), ymm2) + vbroadcastsd(mem(rax, 3*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 4*8), ymm2) + vbroadcastsd(mem(rax, 5*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, -2*32), ymm0) + vmovapd(mem(rbx, -1*32), ymm1) + + // iteration 1 + vbroadcastsd(mem(rax, 6*8), ymm2) + vbroadcastsd(mem(rax, 7*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 8*8), ymm2) + vbroadcastsd(mem(rax, 9*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 10*8), ymm2) + vbroadcastsd(mem(rax, 11*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, 0*32), ymm0) + vmovapd(mem(rbx, 1*32), ymm1) + + // iteration 2 + prefetch(0, mem(rax, 76*8)) + + vbroadcastsd(mem(rax, 12*8), ymm2) + vbroadcastsd(mem(rax, 13*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 14*8), ymm2) + vbroadcastsd(mem(rax, 15*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 16*8), ymm2) + vbroadcastsd(mem(rax, 17*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + vmovapd(mem(rbx, 2*32), ymm0) + vmovapd(mem(rbx, 3*32), ymm1) + + // iteration 3 + vbroadcastsd(mem(rax, 18*8), ymm2) + vbroadcastsd(mem(rax, 19*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm4) + vfmadd231pd(ymm1, ymm2, ymm5) + vfmadd231pd(ymm0, ymm3, ymm6) + vfmadd231pd(ymm1, ymm3, ymm7) + + vbroadcastsd(mem(rax, 20*8), ymm2) + vbroadcastsd(mem(rax, 21*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm8) + vfmadd231pd(ymm1, ymm2, ymm9) + vfmadd231pd(ymm0, ymm3, ymm10) + vfmadd231pd(ymm1, ymm3, ymm11) + + vbroadcastsd(mem(rax, 22*8), ymm2) + vbroadcastsd(mem(rax, 23*8), ymm3) + vfmadd231pd(ymm0, ymm2, ymm12) + vfmadd231pd(ymm1, ymm2, ymm13) + vfmadd231pd(ymm0, ymm3, ymm14) + vfmadd231pd(ymm1, ymm3, ymm15) + + add(imm(4*6*8), rax) // a += 4*6 (unroll x mr) + add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) + + vmovapd(mem(rbx, -4*32), ymm0) + vmovapd(mem(rbx, -3*32), ymm1) + + + dec(rsi) // i -= 1; + jne(.DLOOPKITER) // iterate again if i != 0. + + + + + + + label(.DCONSIDKLEFT) + + mov(%1, rsi) // i = k_left; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left loop. 
+
+
+ label(.DLOOPKLEFT) // EDGE LOOP
+
+ prefetch(0, mem(rax, 64*8))
+
+ vbroadcastsd(mem(rax, 0*8), ymm2)
+ vbroadcastsd(mem(rax, 1*8), ymm3)
+ vfmadd231pd(ymm0, ymm2, ymm4)
+ vfmadd231pd(ymm1, ymm2, ymm5)
+ vfmadd231pd(ymm0, ymm3, ymm6)
+ vfmadd231pd(ymm1, ymm3, ymm7)
+
+ vbroadcastsd(mem(rax, 2*8), ymm2)
+ vbroadcastsd(mem(rax, 3*8), ymm3)
+ vfmadd231pd(ymm0, ymm2, ymm8)
+ vfmadd231pd(ymm1, ymm2, ymm9)
+ vfmadd231pd(ymm0, ymm3, ymm10)
+ vfmadd231pd(ymm1, ymm3, ymm11)
+
+ vbroadcastsd(mem(rax, 4*8), ymm2)
+ vbroadcastsd(mem(rax, 5*8), ymm3)
+ vfmadd231pd(ymm0, ymm2, ymm12)
+ vfmadd231pd(ymm1, ymm2, ymm13)
+ vfmadd231pd(ymm0, ymm3, ymm14)
+ vfmadd231pd(ymm1, ymm3, ymm15)
+
+ add(imm(1*6*8), rax) // a += 1*6 (unroll x mr)
+ add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr)
+
+ vmovapd(mem(rbx, -4*32), ymm0)
+ vmovapd(mem(rbx, -3*32), ymm1)
+
+
+ dec(rsi) // i -= 1;
+ jne(.DLOOPKLEFT) // iterate again if i != 0.
+
+
+
+
+ label(.DPOSTACCUM)
+
+ // ymm4..ymm15 = -a12 * b21
+
+
+
+
+ mov(%5, rbx) // load address of alpha
+ vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate
+
+
+
+
+ mov(imm(1), rsi) // set cs_b = 1
+ lea(mem(, rsi, 8), rsi) // cs_b *= sizeof(double)
+
+ lea(mem(rcx, rsi, 4), rdx) // load address of b11 + 4*cs_b
+
+ mov(rcx, r11) // save rcx = b11 for later
+ mov(rdx, r14) // save rdx = b11+4*cs_b for later
+
+
+ // b11 := alpha * b11 - a12 * b21
+ vfmsub231pd(mem(rcx), ymm3, ymm4)
+ add(rdi, rcx)
+ vfmsub231pd(mem(rdx), ymm3, ymm5)
+ add(rdi, rdx)
+
+ vfmsub231pd(mem(rcx), ymm3, ymm6)
+ add(rdi, rcx)
+ vfmsub231pd(mem(rdx), ymm3, ymm7)
+ add(rdi, rdx)
+
+ vfmsub231pd(mem(rcx), ymm3, ymm8)
+ add(rdi, rcx)
+ vfmsub231pd(mem(rdx), ymm3, ymm9)
+ add(rdi, rdx)
+
+ vfmsub231pd(mem(rcx), ymm3, ymm10)
+ add(rdi, rcx)
+ vfmsub231pd(mem(rdx), ymm3, ymm11)
+ add(rdi, rdx)
+
+ vfmsub231pd(mem(rcx), ymm3, ymm12)
+ add(rdi, rcx)
+ vfmsub231pd(mem(rdx), ymm3, ymm13)
+ add(rdi, rdx)
+
+ vfmsub231pd(mem(rcx), ymm3, ymm14)
+ //add(rdi, rcx)
+ vfmsub231pd(mem(rdx), ymm3, ymm15)
+ //add(rdi, rdx)
+
+
+
+ // prefetch c11
+
 #if 0
- "movq %%r8, %%rcx \n\t" // load address of c11 from r8
- " \n\t" // Note: r9 = rs_c * sizeof(double)
- " \n\t"
- "leaq (%%r9 ,%%r9 ,2), %%r13 \n\t" // r13 = 3*rs_c;
- "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c11 + 3*rs_c;
- " \n\t"
- "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c11 + 0*rs_c
- "prefetcht0 7 * 8(%%rcx,%%r9 ) \n\t" // prefetch c11 + 1*rs_c
- "prefetcht0 7 * 8(%%rcx,%%r9 ,2) \n\t" // prefetch c11 + 2*rs_c
- "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c11 + 3*rs_c
- "prefetcht0 7 * 8(%%rdx,%%r9 ) \n\t" // prefetch c11 + 4*rs_c
- "prefetcht0 7 * 8(%%rdx,%%r9 ,2) \n\t" // prefetch c11 + 5*rs_c
+ mov(r8, rcx) // load address of c11 from r8
+ // Note: r9 = rs_c * sizeof(double)
+
+ lea(mem(r9, r9, 2), r13) // r13 = 3*rs_c;
+ lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c;
+
+ prefetch(0, mem(rcx, 7*8)) // prefetch c11 + 0*rs_c
+ prefetch(0, mem(rcx, r9, 1, 7*8)) // prefetch c11 + 1*rs_c
+ prefetch(0, mem(rcx, r9, 2, 7*8)) // prefetch c11 + 2*rs_c
+ prefetch(0, mem(rdx, 7*8)) // prefetch c11 + 3*rs_c
+ prefetch(0, mem(rdx, r9, 1, 7*8)) // prefetch c11 + 4*rs_c
+ prefetch(0, mem(rdx, r9, 2, 7*8)) // prefetch c11 + 5*rs_c
 #endif
- " \n\t"
- " \n\t"
- " \n\t"
- " \n\t"
- " \n\t" // trsm computation begins here
- " \n\t"
- " \n\t" // Note: contents of b11 are stored as
- " \n\t" // ymm4 ymm5 = ( beta00..03 ) ( beta04..07 )
- " \n\t" // ymm6 ymm7 = ( beta10..13 ) ( beta14..17 )
- " \n\t" // ymm8 ymm9 = ( beta20..23 ) ( beta24..27 )
- " \n\t" // ymm10 ymm11 = (
beta30..33 ) ( beta34..37 ) - " \n\t" // ymm12 ymm13 = ( beta40..43 ) ( beta44..47 ) - " \n\t" // ymm14 ymm15 = ( beta50..53 ) ( beta54..57 ) - " \n\t" - " \n\t" - "movq %6, %%rax \n\t" // load address of a11 - " \n\t" - "movq %%r11, %%rcx \n\t" // recall address of b11 - "movq %%r14, %%rdx \n\t" // recall address of b11+4*cs_b - " \n\t" - "leaq (%%rcx,%%rdi,4), %%rcx \n\t" // rcx = b11 + (6-1)*rs_b - "leaq (%%rcx,%%rdi,1), %%rcx \n\t" - "leaq (%%rdx,%%rdi,4), %%rdx \n\t" // rdx = b11 + (6-1)*rs_b + 4*cs_b - "leaq (%%rdx,%%rdi,1), %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 0 ------------- - " \n\t" - "vbroadcastsd (5+5*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = (1/alpha55) - " \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" // ymm14 *= (1/alpha55) - "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" // ymm15 *= (1/alpha55) - " \n\t" - "vmovupd %%ymm14, (%%rcx) \n\t" // store ( beta50..beta53 ) = ymm14 - "vmovupd %%ymm15, (%%rdx) \n\t" // store ( beta54..beta57 ) = ymm15 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" // iteration 1 ------------- - " \n\t" - "vbroadcastsd (4+5*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha45 - "vbroadcastsd (4+4*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = (1/alpha44) - " \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm2 \n\t" // ymm2 = alpha45 * ymm14 - "vmulpd %%ymm0, %%ymm15, %%ymm3 \n\t" // ymm3 = alpha45 * ymm15 - " \n\t" - "vsubpd %%ymm2, %%ymm12, %%ymm12 \n\t" // ymm12 -= ymm2 - "vsubpd %%ymm3, %%ymm13, %%ymm13 \n\t" // ymm13 -= ymm3 - " \n\t" - "vmulpd %%ymm12, %%ymm1, %%ymm12 \n\t" // ymm12 *= (1/alpha44) - "vmulpd %%ymm13, %%ymm1, %%ymm13 \n\t" // ymm13 *= (1/alpha44) - " \n\t" - "vmovupd %%ymm12, (%%rcx) \n\t" // store ( beta40..beta43 ) = ymm12 - "vmovupd %%ymm13, (%%rdx) \n\t" // store ( beta44..beta47 ) = ymm13 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" // iteration 2 ------------- - " \n\t" - "vbroadcastsd (3+5*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha35 - "vbroadcastsd (3+4*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha34 - " \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm2 \n\t" // ymm2 = alpha35 * ymm14 - "vmulpd %%ymm0, %%ymm15, %%ymm3 \n\t" // ymm3 = alpha35 * ymm15 - " \n\t" - "vbroadcastsd (3+3*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = (1/alpha33) - " \n\t" - "vfmadd231pd %%ymm1, %%ymm12, %%ymm2 \n\t" // ymm2 += alpha34 * ymm12 - "vfmadd231pd %%ymm1, %%ymm13, %%ymm3 \n\t" // ymm3 += alpha34 * ymm13 - " \n\t" - "vsubpd %%ymm2, %%ymm10, %%ymm10 \n\t" // ymm10 -= ymm2 - "vsubpd %%ymm3, %%ymm11, %%ymm11 \n\t" // ymm11 -= ymm3 - " \n\t" - "vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // ymm10 *= (1/alpha33) - "vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // ymm11 *= (1/alpha33) - " \n\t" - "vmovupd %%ymm10, (%%rcx) \n\t" // store ( beta30..beta33 ) = ymm10 - "vmovupd %%ymm11, (%%rdx) \n\t" // store ( beta34..beta37 ) = ymm11 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" // iteration 3 ------------- - " \n\t" - "vbroadcastsd (2+5*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha25 - "vbroadcastsd (2+4*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha24 - " \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm2 \n\t" // ymm2 = alpha25 * ymm14 - "vmulpd %%ymm0, %%ymm15, %%ymm3 \n\t" // ymm3 = alpha25 * ymm15 - " \n\t" - "vbroadcastsd (2+3*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha23 - " \n\t" - "vfmadd231pd %%ymm1, %%ymm12, %%ymm2 \n\t" // ymm2 += alpha24 * ymm12 - "vfmadd231pd %%ymm1, %%ymm13, %%ymm3 \n\t" // ymm3 += alpha24 * ymm13 - " \n\t" - "vbroadcastsd 
(2+2*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = (1/alpha22) - " \n\t" - "vfmadd231pd %%ymm0, %%ymm10, %%ymm2 \n\t" // ymm2 += alpha23 * ymm10 - "vfmadd231pd %%ymm0, %%ymm11, %%ymm3 \n\t" // ymm3 += alpha23 * ymm11 - " \n\t" - "vsubpd %%ymm2, %%ymm8, %%ymm8 \n\t" // ymm8 -= ymm2 - "vsubpd %%ymm3, %%ymm9, %%ymm9 \n\t" // ymm9 -= ymm3 - " \n\t" - "vmulpd %%ymm8, %%ymm1, %%ymm8 \n\t" // ymm8 *= (1/alpha33) - "vmulpd %%ymm9, %%ymm1, %%ymm9 \n\t" // ymm9 *= (1/alpha33) - " \n\t" - "vmovupd %%ymm8, (%%rcx) \n\t" // store ( beta20..beta23 ) = ymm8 - "vmovupd %%ymm9, (%%rdx) \n\t" // store ( beta24..beta27 ) = ymm9 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" // iteration 4 ------------- - " \n\t" - "vbroadcastsd (1+5*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha15 - "vbroadcastsd (1+4*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha14 - " \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm2 \n\t" // ymm2 = alpha15 * ymm14 - "vmulpd %%ymm0, %%ymm15, %%ymm3 \n\t" // ymm3 = alpha15 * ymm15 - " \n\t" - "vbroadcastsd (1+3*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha13 - " \n\t" - "vfmadd231pd %%ymm1, %%ymm12, %%ymm2 \n\t" // ymm2 += alpha14 * ymm12 - "vfmadd231pd %%ymm1, %%ymm13, %%ymm3 \n\t" // ymm3 += alpha14 * ymm13 - " \n\t" - "vbroadcastsd (1+2*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha12 - " \n\t" - "vfmadd231pd %%ymm0, %%ymm10, %%ymm2 \n\t" // ymm2 += alpha13 * ymm10 - "vfmadd231pd %%ymm0, %%ymm11, %%ymm3 \n\t" // ymm3 += alpha13 * ymm11 - " \n\t" - "vbroadcastsd (1+1*6)*8(%%rax), %%ymm0 \n\t" // ymm4 = (1/alpha11) - " \n\t" - "vfmadd231pd %%ymm1, %%ymm8, %%ymm2 \n\t" // ymm2 += alpha12 * ymm8 - "vfmadd231pd %%ymm1, %%ymm9, %%ymm3 \n\t" // ymm3 += alpha12 * ymm9 - " \n\t" - "vsubpd %%ymm2, %%ymm6, %%ymm6 \n\t" // ymm6 -= ymm2 - "vsubpd %%ymm3, %%ymm7, %%ymm7 \n\t" // ymm7 -= ymm3 - " \n\t" - "vmulpd %%ymm6, %%ymm0, %%ymm6 \n\t" // ymm6 *= (1/alpha44) - "vmulpd %%ymm7, %%ymm0, %%ymm7 \n\t" // ymm7 *= (1/alpha44) - " \n\t" - "vmovupd %%ymm6, (%%rcx) \n\t" // store ( beta10..beta13 ) = ymm6 - "vmovupd %%ymm7, (%%rdx) \n\t" // store ( beta14..beta17 ) = ymm7 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" // iteration 5 ------------- - " \n\t" - "vbroadcastsd (0+5*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha05 - "vbroadcastsd (0+4*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha04 - " \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm2 \n\t" // ymm2 = alpha05 * ymm14 - "vmulpd %%ymm0, %%ymm15, %%ymm3 \n\t" // ymm3 = alpha05 * ymm15 - " \n\t" - "vbroadcastsd (0+3*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha03 - " \n\t" - "vfmadd231pd %%ymm1, %%ymm12, %%ymm2 \n\t" // ymm2 += alpha04 * ymm12 - "vfmadd231pd %%ymm1, %%ymm13, %%ymm3 \n\t" // ymm3 += alpha04 * ymm13 - " \n\t" - "vbroadcastsd (0+2*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = alpha02 - " \n\t" - "vfmadd231pd %%ymm0, %%ymm10, %%ymm2 \n\t" // ymm2 += alpha03 * ymm10 - "vfmadd231pd %%ymm0, %%ymm11, %%ymm3 \n\t" // ymm3 += alpha03 * ymm11 - " \n\t" - "vbroadcastsd (0+1*6)*8(%%rax), %%ymm0 \n\t" // ymm0 = alpha01 - " \n\t" - "vfmadd231pd %%ymm1, %%ymm8, %%ymm2 \n\t" // ymm2 += alpha02 * ymm8 - "vfmadd231pd %%ymm1, %%ymm9, %%ymm3 \n\t" // ymm3 += alpha02 * ymm9 - " \n\t" - "vbroadcastsd (0+0*6)*8(%%rax), %%ymm1 \n\t" // ymm1 = (1/alpha00) - " \n\t" - "vfmadd231pd %%ymm0, %%ymm6, %%ymm2 \n\t" // ymm2 += alpha01 * ymm6 - "vfmadd231pd %%ymm0, %%ymm7, %%ymm3 \n\t" // ymm3 += alpha01 * ymm7 - " \n\t" - "vsubpd %%ymm2, %%ymm4, %%ymm4 \n\t" // ymm4 -= ymm2 - "vsubpd %%ymm3, %%ymm5, %%ymm5 \n\t" // ymm5 -= 
ymm3 - " \n\t" - "vmulpd %%ymm4, %%ymm1, %%ymm4 \n\t" // ymm4 *= (1/alpha00) - "vmulpd %%ymm5, %%ymm1, %%ymm5 \n\t" // ymm5 *= (1/alpha00) - " \n\t" - "vmovupd %%ymm4, (%%rcx) \n\t" // store ( beta00..beta03 ) = ymm4 - "vmovupd %%ymm5, (%%rdx) \n\t" // store ( beta04..beta07 ) = ymm5 - "subq %%rdi, %%rcx \n\t" // rcx -= rs_b - "subq %%rdi, %%rdx \n\t" // rdx -= rs_b - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %%r8, %%rcx \n\t" // load address of c11 from r8 - "movq %%r9, %%rdi \n\t" // load rs_c (in bytes) from r9 - "movq %%r10, %%rsi \n\t" // load cs_c (in bytes) from r10 - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c11 + 4*cs_c; - "leaq (%%rcx,%%rdi,4), %%r14 \n\t" // load address of c11 + 4*rs_c; - " \n\t" - " \n\t" // These are used in the macros below. - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; - //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; - //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; - " \n\t" - " \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "jz .DROWSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - "cmpq $8, %%rdi \n\t" // set ZF if (8*rs_c) == 8. - "jz .DCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - " \n\t" // if neither row- or column- - " \n\t" // stored, use general case. - ".DGENSTORED: \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm4, %%ymm0 \n\t" + + + + + // trsm computation begins here + + // Note: contents of b11 are stored as + // ymm4 ymm5 = ( beta00..03 ) ( beta04..07 ) + // ymm6 ymm7 = ( beta10..13 ) ( beta14..17 ) + // ymm8 ymm9 = ( beta20..23 ) ( beta24..27 ) + // ymm10 ymm11 = ( beta30..33 ) ( beta34..37 ) + // ymm12 ymm13 = ( beta40..43 ) ( beta44..47 ) + // ymm14 ymm15 = ( beta50..53 ) ( beta54..57 ) + + + mov(%6, rax) // load address of a11 + + mov(r11, rcx) // recall address of b11 + mov(r14, rdx) // recall address of b11+4*cs_b + + lea(mem(rcx, rdi, 4), rcx) // rcx = b11 + (6-1)*rs_b + lea(mem(rcx, rdi, 1), rcx) + lea(mem(rdx, rdi, 4), rdx) // rdx = b11 + (6-1)*rs_b + 4*cs_b + lea(mem(rdx, rdi, 1), rdx) + + + // iteration 0 ------------- + + vbroadcastsd(mem(5+5*6)*8(rax), ymm0) // ymm0 = (1/alpha55) + + vmulpd(ymm0, ymm14, ymm14) // ymm14 *= (1/alpha55) + vmulpd(ymm0, ymm15, ymm15) // ymm15 *= (1/alpha55) + + vmovupd(ymm14, mem(rcx)) // store ( beta50..beta53 ) = ymm14 + vmovupd(ymm15, mem(rdx)) // store ( beta54..beta57 ) = ymm15 + sub(rdi, rcx) // rcx -= rs_b + sub(rdi, rdx) // rdx -= rs_b + + // iteration 1 ------------- + + vbroadcastsd(mem(4+5*6)*8(rax), ymm0) // ymm0 = alpha45 + vbroadcastsd(mem(4+4*6)*8(rax), ymm1) // ymm1 = (1/alpha44) + + vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha45 * ymm14 + vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha45 * ymm15 + + vsubpd(ymm2, ymm12, ymm12) // ymm12 -= ymm2 + vsubpd(ymm3, ymm13, ymm13) // ymm13 -= ymm3 + + vmulpd(ymm12, ymm1, ymm12) // ymm12 *= (1/alpha44) + vmulpd(ymm13, ymm1, ymm13) // ymm13 *= (1/alpha44) + + vmovupd(ymm12, mem(rcx)) // store ( beta40..beta43 ) = ymm12 + vmovupd(ymm13, mem(rdx)) // store ( beta44..beta47 ) = ymm13 + sub(rdi, rcx) // rcx -= rs_b + sub(rdi, rdx) // rdx -= rs_b + + // iteration 2 ------------- + + vbroadcastsd(mem(3+5*6)*8(rax), ymm0) // ymm0 = alpha35 + vbroadcastsd(mem(3+4*6)*8(rax), ymm1) // ymm1 = alpha34 + + vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha35 * ymm14 + vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha35 * ymm15 + + vbroadcastsd(mem(3+3*6)*8(rax), ymm0) // ymm0 = (1/alpha33) + + vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha34 * ymm12 + 
vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha34 * ymm13
+
+ vsubpd(ymm2, ymm10, ymm10) // ymm10 -= ymm2
+ vsubpd(ymm3, ymm11, ymm11) // ymm11 -= ymm3
+
+ vmulpd(ymm10, ymm0, ymm10) // ymm10 *= (1/alpha33)
+ vmulpd(ymm11, ymm0, ymm11) // ymm11 *= (1/alpha33)
+
+ vmovupd(ymm10, mem(rcx)) // store ( beta30..beta33 ) = ymm10
+ vmovupd(ymm11, mem(rdx)) // store ( beta34..beta37 ) = ymm11
+ sub(rdi, rcx) // rcx -= rs_b
+ sub(rdi, rdx) // rdx -= rs_b
+
+ // iteration 3 -------------
+
+ vbroadcastsd(mem(2+5*6)*8(rax), ymm0) // ymm0 = alpha25
+ vbroadcastsd(mem(2+4*6)*8(rax), ymm1) // ymm1 = alpha24
+
+ vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha25 * ymm14
+ vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha25 * ymm15
+
+ vbroadcastsd(mem(2+3*6)*8(rax), ymm0) // ymm0 = alpha23
+
+ vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha24 * ymm12
+ vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha24 * ymm13
+
+ vbroadcastsd(mem(2+2*6)*8(rax), ymm1) // ymm1 = (1/alpha22)
+
+ vfmadd231pd(ymm0, ymm10, ymm2) // ymm2 += alpha23 * ymm10
+ vfmadd231pd(ymm0, ymm11, ymm3) // ymm3 += alpha23 * ymm11
+
+ vsubpd(ymm2, ymm8, ymm8) // ymm8 -= ymm2
+ vsubpd(ymm3, ymm9, ymm9) // ymm9 -= ymm3
+
+ vmulpd(ymm8, ymm1, ymm8) // ymm8 *= (1/alpha22)
+ vmulpd(ymm9, ymm1, ymm9) // ymm9 *= (1/alpha22)
+
+ vmovupd(ymm8, mem(rcx)) // store ( beta20..beta23 ) = ymm8
+ vmovupd(ymm9, mem(rdx)) // store ( beta24..beta27 ) = ymm9
+ sub(rdi, rcx) // rcx -= rs_b
+ sub(rdi, rdx) // rdx -= rs_b
+
+ // iteration 4 -------------
+
+ vbroadcastsd(mem(1+5*6)*8(rax), ymm0) // ymm0 = alpha15
+ vbroadcastsd(mem(1+4*6)*8(rax), ymm1) // ymm1 = alpha14
+
+ vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha15 * ymm14
+ vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha15 * ymm15
+
+ vbroadcastsd(mem(1+3*6)*8(rax), ymm0) // ymm0 = alpha13
+
+ vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha14 * ymm12
+ vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha14 * ymm13
+
+ vbroadcastsd(mem(1+2*6)*8(rax), ymm1) // ymm1 = alpha12
+
+ vfmadd231pd(ymm0, ymm10, ymm2) // ymm2 += alpha13 * ymm10
+ vfmadd231pd(ymm0, ymm11, ymm3) // ymm3 += alpha13 * ymm11
+
+ vbroadcastsd(mem(1+1*6)*8(rax), ymm0) // ymm0 = (1/alpha11)
+
+ vfmadd231pd(ymm1, ymm8, ymm2) // ymm2 += alpha12 * ymm8
+ vfmadd231pd(ymm1, ymm9, ymm3) // ymm3 += alpha12 * ymm9
+
+ vsubpd(ymm2, ymm6, ymm6) // ymm6 -= ymm2
+ vsubpd(ymm3, ymm7, ymm7) // ymm7 -= ymm3
+
+ vmulpd(ymm6, ymm0, ymm6) // ymm6 *= (1/alpha11)
+ vmulpd(ymm7, ymm0, ymm7) // ymm7 *= (1/alpha11)
+
+ vmovupd(ymm6, mem(rcx)) // store ( beta10..beta13 ) = ymm6
+ vmovupd(ymm7, mem(rdx)) // store ( beta14..beta17 ) = ymm7
+ sub(rdi, rcx) // rcx -= rs_b
+ sub(rdi, rdx) // rdx -= rs_b
+
+ // iteration 5 -------------
+
+ vbroadcastsd(mem(0+5*6)*8(rax), ymm0) // ymm0 = alpha05
+ vbroadcastsd(mem(0+4*6)*8(rax), ymm1) // ymm1 = alpha04
+
+ vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha05 * ymm14
+ vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha05 * ymm15
+
+ vbroadcastsd(mem(0+3*6)*8(rax), ymm0) // ymm0 = alpha03
+
+ vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha04 * ymm12
+ vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha04 * ymm13
+
+ vbroadcastsd(mem(0+2*6)*8(rax), ymm1) // ymm1 = alpha02
+
+ vfmadd231pd(ymm0, ymm10, ymm2) // ymm2 += alpha03 * ymm10
+ vfmadd231pd(ymm0, ymm11, ymm3) // ymm3 += alpha03 * ymm11
+
+ vbroadcastsd(mem(0+1*6)*8(rax), ymm0) // ymm0 = alpha01
+
+ vfmadd231pd(ymm1, ymm8, ymm2) // ymm2 += alpha02 * ymm8
+ vfmadd231pd(ymm1, ymm9, ymm3) // ymm3 += alpha02 * ymm9
+
+ vbroadcastsd(mem(0+0*6)*8(rax), ymm1) // ymm1 = (1/alpha00)
+
+ vfmadd231pd(ymm0, ymm6, ymm2) // ymm2 +=
alpha01 * ymm6 + vfmadd231pd(ymm0, ymm7, ymm3) // ymm3 += alpha01 * ymm7 + + vsubpd(ymm2, ymm4, ymm4) // ymm4 -= ymm2 + vsubpd(ymm3, ymm5, ymm5) // ymm5 -= ymm3 + + vmulpd(ymm4, ymm1, ymm4) // ymm4 *= (1/alpha00) + vmulpd(ymm5, ymm1, ymm5) // ymm5 *= (1/alpha00) + + vmovupd(ymm4, mem(rcx)) // store ( beta00..beta03 ) = ymm4 + vmovupd(ymm5, mem(rdx)) // store ( beta04..beta07 ) = ymm5 + sub(rdi, rcx) // rcx -= rs_b + sub(rdi, rdx) // rdx -= rs_b + + + + + mov(r8, rcx) // load address of c11 from r8 + mov(r9, rdi) // load rs_c (in bytes) from r9 + mov(r10, rsi) // load cs_c (in bytes) from r10 + + lea(mem(rcx, rsi, 4), rdx) // load address of c11 + 4*cs_c; + lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; + + // These are used in the macros below. + lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; + //lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; + //lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; + + + + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. + jz(.DROWSTORED) // jump to row storage case + + + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + + + // if neither row- or column- + // stored, use general case. + label(.DGENSTORED) + + + vmovapd(ymm4, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm6, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm6, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm8, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm8, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm10, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm12, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm14, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c11 + 4*cs_c - " \n\t" - " \n\t" - "vmovapd %%ymm5, %%ymm0 \n\t" + + + mov(rdx, rcx) // rcx = c11 + 4*cs_c + + + vmovapd(ymm5, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm7, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm7, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm9, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm9, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm11, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm13, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm13, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c11 += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm15, %%ymm0 \n\t" + add(rdi, rcx) // c11 += rs_c; + + + vmovapd(ymm15, ymm0) DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "jmp .DDONE \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DROWSTORED: \n\t" - " \n\t" - " \n\t" - "vmovupd %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - 
"vmovupd %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovupd %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovupd %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vunpcklpd %%ymm6, %%ymm4, %%ymm0 \n\t" - "vunpckhpd %%ymm6, %%ymm4, %%ymm1 \n\t" - "vunpcklpd %%ymm10, %%ymm8, %%ymm2 \n\t" - "vunpckhpd %%ymm10, %%ymm8, %%ymm3 \n\t" - "vinsertf128 $0x1, %%xmm2, %%ymm0, %%ymm4 \n\t" - "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm6 \n\t" - "vperm2f128 $0x31, %%ymm2, %%ymm0, %%ymm8 \n\t" - "vperm2f128 $0x31, %%ymm3, %%ymm1, %%ymm10 \n\t" - " \n\t" - "vmovupd %%ymm4, (%%rcx ) \n\t" - "vmovupd %%ymm6, (%%rcx,%%rsi ) \n\t" - "vmovupd %%ymm8, (%%rcx,%%rsi,2) \n\t" - "vmovupd %%ymm10, (%%rcx,%%r13 ) \n\t" - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rcx \n\t" - " \n\t" - "vunpcklpd %%ymm14, %%ymm12, %%ymm0 \n\t" - "vunpckhpd %%ymm14, %%ymm12, %%ymm1 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovupd %%xmm0, (%%r14 ) \n\t" - "vmovupd %%xmm1, (%%r14,%%rsi ) \n\t" - "vmovupd %%xmm2, (%%r14,%%rsi,2) \n\t" - "vmovupd %%xmm3, (%%r14,%%r13 ) \n\t" - " \n\t" - "leaq (%%r14,%%rsi,4), %%r14 \n\t" - " \n\t" - " \n\t" - "vunpcklpd %%ymm7, %%ymm5, %%ymm0 \n\t" - "vunpckhpd %%ymm7, %%ymm5, %%ymm1 \n\t" - "vunpcklpd %%ymm11, %%ymm9, %%ymm2 \n\t" - "vunpckhpd %%ymm11, %%ymm9, %%ymm3 \n\t" - "vinsertf128 $0x1, %%xmm2, %%ymm0, %%ymm5 \n\t" - "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm7 \n\t" - "vperm2f128 $0x31, %%ymm2, %%ymm0, %%ymm9 \n\t" - "vperm2f128 $0x31, %%ymm3, %%ymm1, %%ymm11 \n\t" - " \n\t" - "vmovupd %%ymm5, (%%rcx ) \n\t" - "vmovupd %%ymm7, (%%rcx,%%rsi ) \n\t" - "vmovupd %%ymm9, (%%rcx,%%rsi,2) \n\t" - "vmovupd %%ymm11, (%%rcx,%%r13 ) \n\t" - " \n\t" - //"leaq (%%rcx,%%rsi,4), %%rcx \n\t" - " \n\t" - "vunpcklpd %%ymm15, %%ymm13, %%ymm0 \n\t" - "vunpckhpd %%ymm15, %%ymm13, %%ymm1 \n\t" - "vextractf128 $0x1, %%ymm0, %%xmm2 \n\t" - "vextractf128 $0x1, %%ymm1, %%xmm3 \n\t" - " \n\t" - "vmovupd %%xmm0, (%%r14 ) \n\t" - "vmovupd %%xmm1, (%%r14,%%rsi ) \n\t" - "vmovupd %%xmm2, (%%r14,%%rsi,2) \n\t" - "vmovupd %%xmm3, (%%r14,%%r13 ) \n\t" - " \n\t" - //"leaq (%%r14,%%rsi,4), %%r14 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DDONE: \n\t" - " \n\t" - "vzeroupper \n\t" - " \n\t" + + + jmp(.DDONE) + + + + label(.DROWSTORED) + + + vmovupd(ymm4, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm5, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm6, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm7, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm8, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm9, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm10, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm11, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm12, mem(rcx)) + add(rdi, rcx) + vmovupd(ymm13, mem(rdx)) + add(rdi, rdx) + + vmovupd(ymm14, mem(rcx)) + //add(rdi, rcx) + vmovupd(ymm15, mem(rdx)) + //add(rdi, rdx) + + + jmp(.DDONE) + + + + label(.DCOLSTORED) + + + vunpcklpd(ymm6, ymm4, ymm0) + vunpckhpd(ymm6, ymm4, ymm1) + vunpcklpd(ymm10, ymm8, ymm2) + vunpckhpd(ymm10, ymm8, ymm3) + 
vinsertf128(imm(0x1), xmm2, ymm0, ymm4) + vinsertf128(imm(0x1), xmm3, ymm1, ymm6) + vperm2f128(imm(0x31), ymm2, ymm0, ymm8) + vperm2f128(imm(0x31), ymm3, ymm1, ymm10) + + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm6, mem(rcx, rsi, 1)) + vmovupd(ymm8, mem(rcx, rsi, 2)) + vmovupd(ymm10, mem(rcx, r13, 1)) + + lea(mem(rcx, rsi, 4), rcx) + + vunpcklpd(ymm14, ymm12, ymm0) + vunpckhpd(ymm14, ymm12, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm3, mem(r14, r13, 1)) + + lea(mem(r14, rsi, 4), r14) + + + vunpcklpd(ymm7, ymm5, ymm0) + vunpckhpd(ymm7, ymm5, ymm1) + vunpcklpd(ymm11, ymm9, ymm2) + vunpckhpd(ymm11, ymm9, ymm3) + vinsertf128(imm(0x1), xmm2, ymm0, ymm5) + vinsertf128(imm(0x1), xmm3, ymm1, ymm7) + vperm2f128(imm(0x31), ymm2, ymm0, ymm9) + vperm2f128(imm(0x31), ymm3, ymm1, ymm11) + + vmovupd(ymm5, mem(rcx)) + vmovupd(ymm7, mem(rcx, rsi, 1)) + vmovupd(ymm9, mem(rcx, rsi, 2)) + vmovupd(ymm11, mem(rcx, r13, 1)) + + //lea(mem(rcx, rsi, 4), rcx) + + vunpcklpd(ymm15, ymm13, ymm0) + vunpckhpd(ymm15, ymm13, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm3) + + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm3, mem(r14, r13, 1)) + + //lea(mem(r14, rsi, 4), r14) + + + + + + label(.DDONE) + + vzeroupper() + : // output operands (none) @@ -1520,3 +1523,4 @@ void bli_dgemmtrsm_u_zen_asm_6x8 } +
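
A note on the macro style used throughout the converted kernels above: each
lowercase macro (mov(), lea(), add(), imm(), mem(), label(), vbroadcastsd(),
vfmadd231pd(), ...) expands to a single "\n\t"-terminated AT&T-syntax
instruction string, so a kernel body concatenates into one extended-asm
template, with the operand and clobber lists following the macro body exactly
as in the string-based code being replaced. Below is a minimal sketch of the
same pattern, using only macros that appear in this patch; the function and
label names are hypothetical, and it assumes
frame/include/bli_x86_asm_macros.h can be included on its own.

    #include "bli_x86_asm_macros.h"

    // Hypothetical standalone example; not part of this patch.
    // Scale n_iter groups of 4 contiguous doubles at x by *alpha.
    // (Assumes n_iter > 0, since the loop tests at the bottom.)
    void scale_doubles( double* x, double* alpha, unsigned long n_iter )
    {
        __asm__ volatile
        (
            mov(%0, rax)                  // load address of x.
            mov(%1, rbx)                  // load address of alpha.
            mov(%2, rsi)                  // i = n_iter;
            vbroadcastsd(mem(rbx), ymm3)  // load alpha and duplicate

            label(.DSCALELOOP)

            vmovupd(mem(rax), ymm0)       // load 4 doubles from x
            vmulpd(ymm0, ymm3, ymm0)      // ymm0 *= alpha
            vmovupd(ymm0, mem(rax))       // store them back
            add(imm(4*8), rax)            // x += 4 doubles

            dec(rsi)                      // i -= 1;
            jne(.DSCALELOOP)              // iterate again if i != 0.

            vzeroupper()

            : // output operands (none)
            : // input operands
              "m" (x),
              "m" (alpha),
              "m" (n_iter)
            : // register clobber list
              "rax", "rbx", "rsi",
              "xmm0", "xmm3", "memory"
        );
    }

As in the kernels, operands keep AT&T source-before-destination order, so
vmovupd(mem(rax), ymm0) is a load while vmovupd(ymm0, mem(rax)) is the
matching store, and mem(rbx) / mem(rax, disp) map to (%rbx) / disp(%rax).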