diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index 5216d6ff8..4fed2a25a 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -64,12 +64,14 @@ #endif #if 0 -#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_24x4 -#define BLIS_DEFAULT_MC_S 264 -#define BLIS_DEFAULT_KC_S 128 +#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 +#define BLIS_DEFAULT_MC_S 256 +#define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 -#define BLIS_DEFAULT_MR_S 24 -#define BLIS_DEFAULT_NR_S 4 +#define BLIS_DEFAULT_MR_S 4 +#define BLIS_DEFAULT_NR_S 24 + +#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 @@ -95,12 +97,14 @@ #endif #if 0 -#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_12x4 -#define BLIS_DEFAULT_MC_D 96 -#define BLIS_DEFAULT_KC_D 192 +#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 +#define BLIS_DEFAULT_MC_D 152 +#define BLIS_DEFAULT_KC_D 160 #define BLIS_DEFAULT_NC_D 4080 -#define BLIS_DEFAULT_MR_D 12 -#define BLIS_DEFAULT_NR_D 4 +#define BLIS_DEFAULT_MR_D 4 +#define BLIS_DEFAULT_NR_D 12 + +#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index eda248b6b..42bda8a51 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -85,7 +85,7 @@ void bli_trsm_front bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local ); } -#if 0 +#if 1 // If A is being solved against from the right, transpose all operands // so that we can perform the computation as if A were being solved diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index e66851194..af0ad728d 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -101,6 +101,14 @@ // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- +// Do NOT require the cross-blocksize constraints. That is, do not enforce +// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY +// needed when implementing trsm_r by allowing the right-hand matrix B to +// be triangular. +#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS + #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS +#endif + // Stay initialized after auto-initialization, unless and until the user // explicitly calls bli_finalize(). #ifdef BLIS_DISABLE_STAY_AUTO_INITIALIZED diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c index a1d2fc940..c92612b07 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c @@ -93,11 +93,10 @@ void bli_sgemm_asm_4x24 "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. 
" \n\t" - "addq $32 * 4, %%rbx \n\t" " \n\t" // initialize loop by pre-loading - "vmovaps -4 * 32(%%rbx), %%ymm1 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm2 \n\t" - "vmovaps -2 * 32(%%rbx), %%ymm3 \n\t" + "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" + "vmovaps 2 * 32(%%rbx), %%ymm2 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %7, %%rdi \n\t" // load rs_c @@ -122,92 +121,110 @@ void bli_sgemm_asm_4x24 " \n\t" " \n\t" " \n\t" // iteration 0 - "prefetcht0 24 * 4(%%rax) \n\t" + "prefetcht0 16 * 32(%%rax) \n\t" + " \n\t" + "vbroadcastss 0 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastss 2 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovaps 3 * 32(%%rbx), %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovaps 4 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovaps 5 * 32(%%rbx), %%ymm2 \n\t" + " \n\t" " \n\t" - "vbroadcastf128 0 * 4(%%rax), %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm9 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovaps 0 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovaps 1 * 32(%%rbx), %%ymm3 \n\t" " \n\t" " \n\t" // iteration 1 - "vbroadcastf128 4 * 4(%%rax), %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm9 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovaps 2 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovaps 3 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovaps 4 * 32(%%rbx), %%ymm3 \n\t" + "vbroadcastss 4 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastss 6 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, 
%%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastss 7 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovaps 6 * 32(%%rbx), %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovaps 7 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovaps 8 * 32(%%rbx), %%ymm2 \n\t" + " \n\t" + " \n\t" " \n\t" " \n\t" // iteration 2 - //"prefetcht0 32 * 4(%%rax) \n\t" + "prefetcht0 22 * 32(%%rax) \n\t" + " \n\t" + "vbroadcastss 8 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastss 10 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovaps 9 * 32(%%rbx), %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovaps 10 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovaps 11 * 32(%%rbx), %%ymm2 \n\t" + " \n\t" " \n\t" - "vbroadcastf128 8 * 4(%%rax), %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm9 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovaps 5 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovaps 6 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovaps 7 * 32(%%rbx), %%ymm3 \n\t" " \n\t" " \n\t" // iteration 3 - "vbroadcastf128 12 * 4(%%rax), %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm9 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovaps 8 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovaps 9 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovaps 10 * 32(%%rbx), %%ymm3 \n\t" + "vbroadcastss 12 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastss 14 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + 
"vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovaps 12 * 32(%%rbx), %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovaps 13 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovaps 14 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" @@ -233,27 +250,30 @@ void bli_sgemm_asm_4x24 " \n\t" ".SLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" - "prefetcht0 24 * 4(%%rax) \n\t" + "prefetcht0 16 * 32(%%rax) \n\t" " \n\t" - "vbroadcastf128 0 * 4(%%rax), %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm9 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" - "vfmadd231ps %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231ps %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovaps 0 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231ps %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovaps 1 * 32(%%rbx), %%ymm3 \n\t" + "vbroadcastss 0 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastss 2 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovaps 3 * 32(%%rbx), %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovaps 4 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovaps 5 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" @@ -291,69 +311,6 @@ void bli_sgemm_asm_4x24 " \n\t" " \n\t" " \n\t" - " \n\t" // ymm4 : ( ab00 ab11 ab22 ab33 ... ) - " \n\t" // ymm7 : ( ab30 ab01 ab12 ab23 ... ) - " \n\t" // ymm10: ( ab20 ab31 ab02 ab13 ... ) - " \n\t" // ymm13: ( ab10 ab21 ab32 ab03 ... ) - " \n\t" - "vmovaps %%ymm4, %%ymm0 \n\t" - "vmovaps %%ymm7, %%ymm1 \n\t" - "vmovaps %%ymm10, %%ymm2 \n\t" - "vshufps $0xd8, %%ymm7, %%ymm4, %%ymm4 \n\t" - "vshufps $0x72, %%ymm13, %%ymm10, %%ymm7 \n\t" - "vshufps $0x8d, %%ymm13, %%ymm0, %%ymm10 \n\t" - "vshufps $0x27, %%ymm1, %%ymm2, %%ymm13 \n\t" - " \n\t" - "vmovaps %%ymm5, %%ymm0 \n\t" - "vmovaps %%ymm8, %%ymm1 \n\t" - "vmovaps %%ymm11, %%ymm2 \n\t" - "vshufps $0xd8, %%ymm8, %%ymm5, %%ymm5 \n\t" - "vshufps $0x72, %%ymm14, %%ymm11, %%ymm8 \n\t" - "vshufps $0x8d, %%ymm14, %%ymm0, %%ymm11 \n\t" - "vshufps $0x27, %%ymm1, %%ymm2, %%ymm14 \n\t" - " \n\t" - "vmovaps %%ymm6, %%ymm0 \n\t" - "vmovaps %%ymm9, %%ymm1 \n\t" - "vmovaps %%ymm12, %%ymm2 \n\t" - "vshufps $0xd8, %%ymm9, %%ymm6, %%ymm6 \n\t" - "vshufps $0x72, %%ymm15, %%ymm12, %%ymm9 \n\t" - "vshufps $0x8d, %%ymm15, %%ymm0, %%ymm12 \n\t" - "vshufps $0x27, %%ymm1, %%ymm2, %%ymm15 \n\t" - " \n\t" - " \n\t" // ymm4 : ( ab00 ab22 ab01 ab23 ... ) - " \n\t" // ymm7 : ( ab02 ab20 ab03 ab21 ... 
) - " \n\t" // ymm10: ( ab11 ab33 ab10 ab32 ... ) - " \n\t" // ymm13: ( ab13 ab31 ab12 ab30 ... ) - " \n\t" - "vmovaps %%ymm4, %%ymm0 \n\t" - "vmovaps %%ymm7, %%ymm1 \n\t" - "vmovaps %%ymm10, %%ymm2 \n\t" - "vshufps $0x88, %%ymm7, %%ymm4, %%ymm4 \n\t" - "vshufps $0x22, %%ymm13, %%ymm10, %%ymm7 \n\t" - "vshufps $0xdd, %%ymm0, %%ymm1, %%ymm10 \n\t" - "vshufps $0x77, %%ymm2, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmovaps %%ymm5, %%ymm0 \n\t" - "vmovaps %%ymm8, %%ymm1 \n\t" - "vmovaps %%ymm11, %%ymm2 \n\t" - "vshufps $0x88, %%ymm8, %%ymm5, %%ymm5 \n\t" - "vshufps $0x22, %%ymm14, %%ymm11, %%ymm8 \n\t" - "vshufps $0xdd, %%ymm0, %%ymm1, %%ymm11 \n\t" - "vshufps $0x77, %%ymm2, %%ymm14, %%ymm14 \n\t" - " \n\t" - "vmovaps %%ymm6, %%ymm0 \n\t" - "vmovaps %%ymm9, %%ymm1 \n\t" - "vmovaps %%ymm12, %%ymm2 \n\t" - "vshufps $0x88, %%ymm9, %%ymm6, %%ymm6 \n\t" - "vshufps $0x22, %%ymm15, %%ymm12, %%ymm9 \n\t" - "vshufps $0xdd, %%ymm0, %%ymm1, %%ymm12 \n\t" - "vshufps $0x77, %%ymm2, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" // ymm4 : ( ab00 ab01 ab02 ab03 ... ) - " \n\t" // ymm7 : ( ab10 ab11 ab12 ab13 ... ) - " \n\t" // ymm10: ( ab20 ab21 ab22 ab23 ... ) - " \n\t" // ymm13: ( ab30 ab31 ab32 ab33 ... ) - " \n\t" " \n\t" " \n\t" " \n\t" @@ -368,23 +325,6 @@ void bli_sgemm_asm_4x24 "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; " \n\t" " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*rs_c % 32 == 0, AND - " \n\t" // cs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // row-stored - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (4*rs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -392,10 +332,8 @@ void bli_sgemm_asm_4x24 "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/row-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .SROWSTORED \n\t" // jump to row storage case + "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. 
+ "jz .SROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" @@ -490,74 +428,59 @@ void bli_sgemm_asm_4x24 ".SROWSTORED: \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231ps (%%rcx), %%ymm3, %%ymm4 \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213ps %%ymm5, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231ps (%%rdx), %%ymm3, %%ymm5 \n\t" + "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovaps (%%r12), %%ymm2 \n\t" - "vfmadd213ps %%ymm6, %%ymm3, %%ymm2 \n\t" - "vmovaps %%ymm2, (%%r12) \n\t" + "vfmadd231ps (%%r12), %%ymm3, %%ymm6 \n\t" + "vmovups %%ymm6, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213ps %%ymm7, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231ps (%%rcx), %%ymm3, %%ymm7 \n\t" + "vmovups %%ymm7, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213ps %%ymm8, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231ps (%%rdx), %%ymm3, %%ymm8 \n\t" + "vmovups %%ymm8, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovaps (%%r12), %%ymm2 \n\t" - "vfmadd213ps %%ymm9, %%ymm3, %%ymm2 \n\t" - "vmovaps %%ymm2, (%%r12) \n\t" + "vfmadd231ps (%%r12), %%ymm3, %%ymm9 \n\t" + "vmovups %%ymm9, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231ps (%%rcx), %%ymm3, %%ymm10 \n\t" + "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213ps %%ymm11, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231ps (%%rdx), %%ymm3, %%ymm11 \n\t" + "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovaps (%%r12), %%ymm2 \n\t" - "vfmadd213ps %%ymm12, %%ymm3, %%ymm2 \n\t" - "vmovaps %%ymm2, (%%r12) \n\t" + "vfmadd231ps (%%r12), %%ymm3, %%ymm12 \n\t" + "vmovups %%ymm12, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213ps %%ymm13, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231ps (%%rcx), %%ymm3, %%ymm13 \n\t" + "vmovups %%ymm13, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213ps %%ymm14, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231ps (%%rdx), %%ymm3, %%ymm14 \n\t" + "vmovups %%ymm14, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" - "vmovaps (%%r12), %%ymm2 \n\t" - "vfmadd213ps %%ymm15, %%ymm3, %%ymm2 \n\t" - "vmovaps %%ymm2, (%%r12) \n\t" + "vfmadd231ps (%%r12), %%ymm3, %%ymm15 \n\t" + "vmovups %%ymm15, (%%r12) \n\t" //"addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" " \n\t" - " \n\t" - " \n\t" "jmp .SDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".SBETAZERO: \n\t" - " \n\t" // check if aligned/row-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .SROWSTORBZ \n\t" // jump to row storage case + " \n\t" + "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. 
+ "jz .SROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" @@ -642,32 +565,32 @@ void bli_sgemm_asm_4x24 ".SROWSTORBZ: \n\t" " \n\t" " \n\t" - "vmovaps %%ymm4, (%%rcx) \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm5, (%%rdx) \n\t" + "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovaps %%ymm6, (%%r12) \n\t" + "vmovups %%ymm6, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" - "vmovaps %%ymm7, (%%rcx) \n\t" + "vmovups %%ymm7, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm8, (%%rdx) \n\t" + "vmovups %%ymm8, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovaps %%ymm9, (%%r12) \n\t" + "vmovups %%ymm9, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" - "vmovaps %%ymm10, (%%rcx) \n\t" + "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm11, (%%rdx) \n\t" + "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovaps %%ymm12, (%%r12) \n\t" + "vmovups %%ymm12, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" - "vmovaps %%ymm13, (%%rcx) \n\t" + "vmovups %%ymm13, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm14, (%%rdx) \n\t" + "vmovups %%ymm14, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" - "vmovaps %%ymm15, (%%r12) \n\t" + "vmovups %%ymm15, (%%r12) \n\t" //"addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" @@ -703,8 +626,6 @@ void bli_sgemm_asm_4x24 ); } - - #define DGEMM_INPUT_GS_BETA_NZ \ "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ @@ -757,11 +678,10 @@ void bli_dgemm_asm_4x12 "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. " \n\t" - "addq $32 * 4, %%rbx \n\t" " \n\t" // initialize loop by pre-loading - "vmovapd -4 * 32(%%rbx), %%ymm1 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm2 \n\t" - "vmovapd -2 * 32(%%rbx), %%ymm3 \n\t" + "vmovapd 0 * 32(%%rbx), %%ymm0 \n\t" + "vmovapd 1 * 32(%%rbx), %%ymm1 \n\t" + "vmovapd 2 * 32(%%rbx), %%ymm2 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %7, %%rdi \n\t" // load rs_c @@ -786,92 +706,110 @@ void bli_dgemm_asm_4x12 " \n\t" " \n\t" " \n\t" // iteration 0 - "prefetcht0 24 * 8(%%rax) \n\t" + "prefetcht0 16 * 32(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovapd 3 * 32(%%rbx), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovapd 4 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovapd 5 * 32(%%rbx), %%ymm2 \n\t" + " \n\t" " \n\t" - "vbroadcastf128 0 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" - "vbroadcastf128 2 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" - 
"vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm3 \n\t" " \n\t" " \n\t" // iteration 1 - "vbroadcastf128 4 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" - "vbroadcastf128 6 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovapd 2 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovapd 4 * 32(%%rbx), %%ymm3 \n\t" + "vbroadcastsd 4 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastsd 6 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovapd 6 * 32(%%rbx), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovapd 7 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovapd 8 * 32(%%rbx), %%ymm2 \n\t" + " \n\t" + " \n\t" " \n\t" - "prefetcht0 32 * 8(%%rax) \n\t" " \n\t" // iteration 2 - "vbroadcastf128 8 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" - "vbroadcastf128 10 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovapd 5 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovapd 6 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovapd 7 * 32(%%rbx), %%ymm3 \n\t" + "prefetcht0 22 * 32(%%rax) \n\t" + " \n\t" + "vbroadcastsd 8 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastsd 10 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + 
"vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovapd 9 * 32(%%rbx), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovapd 10 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovapd 11 * 32(%%rbx), %%ymm2 \n\t" + " \n\t" + " \n\t" " \n\t" " \n\t" // iteration 3 - "vbroadcastf128 12 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" - "vbroadcastf128 14 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovapd 8 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovapd 9 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovapd 10 * 32(%%rbx), %%ymm3 \n\t" + "vbroadcastsd 12 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" + "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastsd 14 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovapd 12 * 32(%%rbx), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovapd 13 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovapd 14 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" @@ -897,27 +835,30 @@ void bli_dgemm_asm_4x12 " \n\t" ".DLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" - "prefetcht0 24 * 8(%%rax) \n\t" + "prefetcht0 16 * 32(%%rax) \n\t" " \n\t" - "vbroadcastf128 0 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" - "vbroadcastf128 2 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm3 \n\t" + "vbroadcastsd 0 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + 
"vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovapd 3 * 32(%%rbx), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovapd 4 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovapd 5 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" @@ -953,65 +894,11 @@ void bli_dgemm_asm_4x12 "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" " \n\t" - " \n\t" // ymm4 : ( ab00 ab11 ab02 ab13 ) - " \n\t" // ymm7 : ( ab10 ab01 ab12 ab03 ) - " \n\t" // ymm10: ( ab20 ab31 ab22 ab33 ) - " \n\t" // ymm13: ( ab30 ab21 ab32 ab23 ) - " \n\t" - " \n\t" // ymm5 : ( ab04 ab15 ab06 ab17 ) - " \n\t" // ymm8 : ( ab14 ab05 ab16 ab07 ) - " \n\t" // ymm11: ( ab24 ab35 ab26 ab37 ) - " \n\t" // ymm14: ( ab34 ab25 ab36 ab27 ) - " \n\t" - " \n\t" // ymm6 : ( ab08 ab19 ab0A ab1B ) - " \n\t" // ymm9 : ( ab18 ab09 ab1A ab0B ) - " \n\t" // ymm12: ( ab28 ab39 ab2A ab3B ) - " \n\t" // ymm15: ( ab38 ab29 ab3A ab2B ) - "vmovapd %%ymm4, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm7, %%ymm4, %%ymm4 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm7, %%ymm7 \n\t" - " \n\t" - "vmovapd %%ymm5, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm8, %%ymm5, %%ymm5 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vmovapd %%ymm6, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm9, %%ymm6, %%ymm6 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm13, %%ymm10, %%ymm10 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm14, %%ymm11, %%ymm11 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm14, %%ymm14 \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm15, %%ymm12, %%ymm12 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" // ymm4 : ( ab00 ab01 ab02 ab03 ) - " \n\t" // ymm7 : ( ab10 ab11 ab12 ab13 ) - " \n\t" // ymm10: ( ab20 ab21 ab22 ab23 ) - " \n\t" // ymm13: ( ab30 ab31 ab32 ab33 ) - " \n\t" - " \n\t" // ymm5 : ( ab04 ab05 ab06 ab07 ) - " \n\t" // ymm8 : ( ab14 ab15 ab16 ab17 ) - " \n\t" // ymm11: ( ab24 ab25 ab26 ab27 ) - " \n\t" // ymm14: ( ab34 ab35 ab36 ab37 ) - " \n\t" - " \n\t" // ymm6 : ( ab08 ab09 ab0A ab0B ) - " \n\t" // ymm9 : ( ab18 ab19 ab1A ab1B ) - " \n\t" // ymm12: ( ab28 ab29 ab2A ab2B ) - " \n\t" // ymm15: ( ab38 ab39 ab3A ab3B ) " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" - " \n\t" - //"m" (rs_c), // 7 rdi - //"m" (cs_c), // 8 rsi "movq %8, %%rsi \n\t" // load cs_c "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(double) " \n\t" @@ -1019,25 +906,11 @@ void bli_dgemm_asm_4x12 "leaq (%%rcx,%%rsi,8), %%r12 \n\t" // r12 = c + 8*cs_c; " \n\t" "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; + //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; + //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; " \n\t" " \n\t" " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*rs_c % 32 == 0, AND - " \n\t" // cs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // row-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*rs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 
1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -1045,10 +918,8 @@ void bli_dgemm_asm_4x12 "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/row-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DROWSTORED \n\t" // jump to row storage case + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "jz .DROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" @@ -1143,74 +1014,59 @@ void bli_dgemm_asm_4x12 ".DROWSTORED: \n\t" " \n\t" " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t" + "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm6, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" + "vfmadd231pd (%%r12), %%ymm3, %%ymm6 \n\t" + "vmovups %%ymm6, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm7 \n\t" + "vmovups %%ymm7, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm8, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm8 \n\t" + "vmovups %%ymm8, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm9, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" + "vfmadd231pd (%%r12), %%ymm3, %%ymm9 \n\t" + "vmovups %%ymm9, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t" + "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t" + "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm12, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" + "vfmadd231pd (%%r12), %%ymm3, %%ymm12 \n\t" + "vmovups %%ymm12, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm13 \n\t" + "vmovups %%ymm13, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm14, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm14 \n\t" + "vmovups %%ymm14, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm15, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" + "vfmadd231pd (%%r12), %%ymm3, %%ymm15 \n\t" + "vmovups %%ymm15, (%%r12) \n\t" //"addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" " \n\t" - " \n\t" - " \n\t" "jmp .DDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".DBETAZERO: \n\t" - " \n\t" // check if aligned/row-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DROWSTORBZ \n\t" // jump to row storage case + " \n\t" // check if aligned/column-stored + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "jz .DROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" @@ -1295,32 +1151,32 @@ void bli_dgemm_asm_4x12 ".DROWSTORBZ: \n\t" " \n\t" " \n\t" - "vmovapd %%ymm4, (%%rcx) \n\t" + "vmovupd %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm5, (%%rdx) \n\t" + "vmovupd %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm6, (%%r12) \n\t" + "vmovupd %%ymm6, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" - "vmovapd %%ymm7, (%%rcx) \n\t" + "vmovupd %%ymm7, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm8, (%%rdx) \n\t" + "vmovupd %%ymm8, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm9, (%%r12) \n\t" + "vmovupd %%ymm9, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" - "vmovapd %%ymm10, (%%rcx) \n\t" + "vmovupd %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm11, (%%rdx) \n\t" + "vmovupd %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm12, (%%r12) \n\t" + "vmovupd %%ymm12, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" - "vmovapd %%ymm13, (%%rcx) \n\t" + "vmovupd %%ymm13, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm14, (%%rdx) \n\t" + "vmovupd %%ymm14, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm15, (%%r12) \n\t" + "vmovupd %%ymm15, (%%r12) \n\t" //"addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" diff --git a/kernels/x86_64/haswell/3/old/bli_gemm_asm_d4x12.c b/kernels/x86_64/haswell/3/old/bli_gemm_asm_d4x12.c new file mode 100644 index 000000000..a1d2fc940 --- /dev/null +++ b/kernels/x86_64/haswell/3/old/bli_gemm_asm_d4x12.c @@ -0,0 +1,1403 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + + +#define SGEMM_INPUT_GS_BETA_NZ \ + "vmovlps (%%rcx ), %%xmm0, %%xmm0 \n\t" \ + "vmovhps (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ + "vmovlps (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhps (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ + "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" \ + "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ + "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ + "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ + "vshufps $0x88, %%xmm1, %%xmm2, %%xmm2 \n\t" \ + "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" + +#define SGEMM_OUTPUT_GS_BETA_NZ \ + "vextractf128 $1, %%ymm0, %%xmm2 \n\t" \ + "vmovss %%xmm0, (%%rcx ) \n\t" \ + "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ + "vmovss %%xmm1, (%%rcx,%%rsi,1) \n\t" \ + "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" \ + "vmovss %%xmm0, (%%rcx,%%rsi,2) \n\t" \ + "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ + "vmovss %%xmm1, (%%rcx,%%r13 ) \n\t" \ + "vmovss %%xmm2, (%%rcx,%%rsi,4) \n\t" \ + "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ + "vmovss %%xmm1, (%%rcx,%%r15 ) \n\t" \ + "vpermilps $0x39, %%xmm1, %%xmm2 \n\t" \ + "vmovss %%xmm2, (%%rcx,%%r13,2) \n\t" \ + "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ + "vmovss %%xmm1, (%%rcx,%%r10 ) \n\t" + +void bli_sgemm_asm_4x24 + ( + dim_t k, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + dim_t k_iter = k / 4; + dim_t k_left = k % 4; + + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. + " \n\t" + "addq $32 * 4, %%rbx \n\t" + " \n\t" // initialize loop by pre-loading + "vmovaps -4 * 32(%%rbx), %%ymm1 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm2 \n\t" + "vmovaps -2 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %7, %%rdi \n\t" // load rs_c + "leaq (,%%rdi,4), %%rdi \n\t" // rs_c *= sizeof(float) + " \n\t" + "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; + "prefetcht0 7 * 4(%%rcx) \n\t" // prefetch c + 0*rs_c + "prefetcht0 7 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c + "prefetcht0 7 * 4(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c + "prefetcht0 7 * 4(%%rcx,%%r13) \n\t" // prefetch c + 3*rs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. 
+ " \n\t" + " \n\t" + ".SLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 24 * 4(%%rax) \n\t" + " \n\t" + "vbroadcastf128 0 * 4(%%rax), %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm9 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovaps 0 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovaps 1 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastf128 4 * 4(%%rax), %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm9 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovaps 2 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovaps 3 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovaps 4 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + " \n\t" // iteration 2 + //"prefetcht0 32 * 4(%%rax) \n\t" + " \n\t" + "vbroadcastf128 8 * 4(%%rax), %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm9 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovaps 5 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovaps 6 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovaps 7 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + " \n\t" // iteration 3 + "vbroadcastf128 12 * 4(%%rax), %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm9 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovaps 8 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovaps 9 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovaps 10 * 
32(%%rbx), %%ymm3 \n\t" + " \n\t" + " \n\t" + " \n\t" + "addq $4 * 4 * 4, %%rax \n\t" // a += 4*4 (unroll x mr) + "addq $4 * 24 * 4, %%rbx \n\t" // b += 4*24 (unroll x nr) + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .SLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".SCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. + " \n\t" + " \n\t" + ".SLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "prefetcht0 24 * 4(%%rax) \n\t" + " \n\t" + "vbroadcastf128 0 * 4(%%rax), %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm9 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilps $0x93, %%ymm0, %%ymm0 \n\t" + "vfmadd231ps %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231ps %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovaps 0 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231ps %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovaps 1 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + " \n\t" + " \n\t" + "addq $1 * 4 * 4, %%rax \n\t" // a += 1*4 (unroll x mr) + "addq $1 * 24 * 4, %%rbx \n\t" // b += 1*24 (unroll x nr) + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .SLOOPKLEFT \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + ".SPOSTACCUM: \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate + "vbroadcastss (%%rbx), %%ymm3 \n\t" // load beta and duplicate + " \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha + "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" + "vmulps %%ymm0, %%ymm7, %%ymm7 \n\t" + "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" + "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t" + "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" // ymm4 : ( ab00 ab11 ab22 ab33 ... ) + " \n\t" // ymm7 : ( ab30 ab01 ab12 ab23 ... ) + " \n\t" // ymm10: ( ab20 ab31 ab02 ab13 ... ) + " \n\t" // ymm13: ( ab10 ab21 ab32 ab03 ... 
) + " \n\t" + "vmovaps %%ymm4, %%ymm0 \n\t" + "vmovaps %%ymm7, %%ymm1 \n\t" + "vmovaps %%ymm10, %%ymm2 \n\t" + "vshufps $0xd8, %%ymm7, %%ymm4, %%ymm4 \n\t" + "vshufps $0x72, %%ymm13, %%ymm10, %%ymm7 \n\t" + "vshufps $0x8d, %%ymm13, %%ymm0, %%ymm10 \n\t" + "vshufps $0x27, %%ymm1, %%ymm2, %%ymm13 \n\t" + " \n\t" + "vmovaps %%ymm5, %%ymm0 \n\t" + "vmovaps %%ymm8, %%ymm1 \n\t" + "vmovaps %%ymm11, %%ymm2 \n\t" + "vshufps $0xd8, %%ymm8, %%ymm5, %%ymm5 \n\t" + "vshufps $0x72, %%ymm14, %%ymm11, %%ymm8 \n\t" + "vshufps $0x8d, %%ymm14, %%ymm0, %%ymm11 \n\t" + "vshufps $0x27, %%ymm1, %%ymm2, %%ymm14 \n\t" + " \n\t" + "vmovaps %%ymm6, %%ymm0 \n\t" + "vmovaps %%ymm9, %%ymm1 \n\t" + "vmovaps %%ymm12, %%ymm2 \n\t" + "vshufps $0xd8, %%ymm9, %%ymm6, %%ymm6 \n\t" + "vshufps $0x72, %%ymm15, %%ymm12, %%ymm9 \n\t" + "vshufps $0x8d, %%ymm15, %%ymm0, %%ymm12 \n\t" + "vshufps $0x27, %%ymm1, %%ymm2, %%ymm15 \n\t" + " \n\t" + " \n\t" // ymm4 : ( ab00 ab22 ab01 ab23 ... ) + " \n\t" // ymm7 : ( ab02 ab20 ab03 ab21 ... ) + " \n\t" // ymm10: ( ab11 ab33 ab10 ab32 ... ) + " \n\t" // ymm13: ( ab13 ab31 ab12 ab30 ... ) + " \n\t" + "vmovaps %%ymm4, %%ymm0 \n\t" + "vmovaps %%ymm7, %%ymm1 \n\t" + "vmovaps %%ymm10, %%ymm2 \n\t" + "vshufps $0x88, %%ymm7, %%ymm4, %%ymm4 \n\t" + "vshufps $0x22, %%ymm13, %%ymm10, %%ymm7 \n\t" + "vshufps $0xdd, %%ymm0, %%ymm1, %%ymm10 \n\t" + "vshufps $0x77, %%ymm2, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmovaps %%ymm5, %%ymm0 \n\t" + "vmovaps %%ymm8, %%ymm1 \n\t" + "vmovaps %%ymm11, %%ymm2 \n\t" + "vshufps $0x88, %%ymm8, %%ymm5, %%ymm5 \n\t" + "vshufps $0x22, %%ymm14, %%ymm11, %%ymm8 \n\t" + "vshufps $0xdd, %%ymm0, %%ymm1, %%ymm11 \n\t" + "vshufps $0x77, %%ymm2, %%ymm14, %%ymm14 \n\t" + " \n\t" + "vmovaps %%ymm6, %%ymm0 \n\t" + "vmovaps %%ymm9, %%ymm1 \n\t" + "vmovaps %%ymm12, %%ymm2 \n\t" + "vshufps $0x88, %%ymm9, %%ymm6, %%ymm6 \n\t" + "vshufps $0x22, %%ymm15, %%ymm12, %%ymm9 \n\t" + "vshufps $0xdd, %%ymm0, %%ymm1, %%ymm12 \n\t" + "vshufps $0x77, %%ymm2, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" // ymm4 : ( ab00 ab01 ab02 ab03 ... ) + " \n\t" // ymm7 : ( ab10 ab11 ab12 ab13 ... ) + " \n\t" // ymm10: ( ab20 ab21 ab22 ab23 ... ) + " \n\t" // ymm13: ( ab30 ab31 ab32 ab33 ... ) + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %8, %%rsi \n\t" // load cs_c + "leaq (,%%rsi,4), %%rsi \n\t" // rsi = cs_c * sizeof(float) + " \n\t" + "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // rdx = c + 8*cs_c; + "leaq (%%rdx,%%rsi,8), %%r12 \n\t" // r12 = c + 16*cs_c; + " \n\t" + "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; + "leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; + "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // determine if + " \n\t" // c % 32 == 0, AND + " \n\t" // 8*rs_c % 32 == 0, AND + " \n\t" // cs_c == 1 + " \n\t" // ie: aligned, ldim aligned, and + " \n\t" // row-stored + " \n\t" + "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. + "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); + "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. + "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); + "testq $31, %%rdi \n\t" // set ZF if (4*rs_c) & 32 is zero. + "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); + " \n\t" // and(bl,bh) followed by + " \n\t" // and(bh,al) will reveal result + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + " \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomiss %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. 
+ "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case + " \n\t" + " \n\t" + " \n\t" // check if aligned/row-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. + "jne .SROWSTORED \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".SGENSTORED: \n\t" + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm7, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm13, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 8*cs_c + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm5, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm8, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm11, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%r12, %%rcx \n\t" // rcx = c + 16*cs_c + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm6, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm9, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm12, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + SGEMM_INPUT_GS_BETA_NZ + "vfmadd213ps %%ymm15, %%ymm3, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .SDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".SROWSTORED: \n\t" + " \n\t" + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" + "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" + "vmovaps %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps (%%rdx), %%ymm1 \n\t" + "vfmadd213ps %%ymm5, %%ymm3, %%ymm1 \n\t" + "vmovaps %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovaps (%%r12), %%ymm2 \n\t" + "vfmadd213ps %%ymm6, %%ymm3, %%ymm2 \n\t" + "vmovaps %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" + "vfmadd213ps %%ymm7, %%ymm3, %%ymm0 \n\t" + "vmovaps %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps (%%rdx), %%ymm1 \n\t" + "vfmadd213ps %%ymm8, %%ymm3, %%ymm1 \n\t" + "vmovaps %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovaps (%%r12), %%ymm2 \n\t" + "vfmadd213ps %%ymm9, %%ymm3, %%ymm2 \n\t" + "vmovaps %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" + "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t" + "vmovaps %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps (%%rdx), %%ymm1 \n\t" + "vfmadd213ps %%ymm11, %%ymm3, %%ymm1 \n\t" + "vmovaps %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovaps (%%r12), %%ymm2 \n\t" + "vfmadd213ps %%ymm12, %%ymm3, %%ymm2 \n\t" + "vmovaps %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" + "vfmadd213ps %%ymm13, %%ymm3, %%ymm0 \n\t" + "vmovaps %%ymm0, (%%rcx) \n\t" + //"addq %%rdi, %%rcx \n\t" + "vmovaps (%%rdx), %%ymm1 \n\t" + "vfmadd213ps %%ymm14, %%ymm3, %%ymm1 \n\t" + "vmovaps %%ymm1, (%%rdx) \n\t" + //"addq %%rdi, %%rdx \n\t" + "vmovaps (%%r12), %%ymm2 \n\t" + "vfmadd213ps %%ymm15, %%ymm3, %%ymm2 \n\t" + "vmovaps %%ymm2, (%%r12) \n\t" + //"addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .SDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".SBETAZERO: \n\t" + " \n\t" // check if aligned/row-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
+ "jne .SROWSTORBZ \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".SGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm4, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm7, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm10, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm13, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 8*cs_c + " \n\t" + " \n\t" + "vmovaps %%ymm5, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm8, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm11, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm14, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%r12, %%rcx \n\t" // rcx = c + 16*cs_c + " \n\t" + " \n\t" + "vmovaps %%ymm6, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm9, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm12, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm15, %%ymm0 \n\t" + SGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .SDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".SROWSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm4, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps %%ymm5, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovaps %%ymm6, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovaps %%ymm7, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps %%ymm8, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovaps %%ymm9, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovaps %%ymm10, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps %%ymm11, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovaps %%ymm12, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovaps %%ymm13, (%%rcx) \n\t" + //"addq %%rdi, %%rcx \n\t" + "vmovaps %%ymm14, (%%rdx) \n\t" + //"addq %%rdi, %%rdx \n\t" + "vmovaps %%ymm15, (%%r12) \n\t" + //"addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".SDONE: \n\t" + " \n\t" + + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); +} + + + +#define DGEMM_INPUT_GS_BETA_NZ \ + "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ + "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ + "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ + "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\ + "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ + "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ + "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ + "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ + +#define DGEMM_OUTPUT_GS_BETA_NZ \ + "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ + "vmovlpd %%xmm0, (%%rcx ) \n\t" \ + "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ + "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ + "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ + "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ + "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ + "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ + "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ + "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ + +void bli_dgemm_asm_4x12 + ( + dim_t k, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + dim_t k_iter = k / 4; + dim_t k_left = k % 4; + + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. 
+ " \n\t" + "addq $32 * 4, %%rbx \n\t" + " \n\t" // initialize loop by pre-loading + "vmovapd -4 * 32(%%rbx), %%ymm1 \n\t" + "vmovapd -3 * 32(%%rbx), %%ymm2 \n\t" + "vmovapd -2 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %7, %%rdi \n\t" // load rs_c + "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(double) + " \n\t" + "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; + "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c + "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c + "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c + "prefetcht0 7 * 8(%%rcx,%%r13) \n\t" // prefetch c + 3*rs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".DLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 24 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastf128 0 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" + "vbroadcastf128 2 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovapd 1 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastf128 4 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" + "vbroadcastf128 6 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovapd 2 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovapd 4 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + "prefetcht0 32 * 8(%%rax) \n\t" + " \n\t" // iteration 2 + "vbroadcastf128 8 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" + "vbroadcastf128 10 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovapd 5 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovapd 6 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, 
+ "vmovapd 7 * 32(%%rbx), %%ymm3 \n\t"
+ " \n\t"
+ " \n\t" // iteration 3
+ "vbroadcastf128 12 * 8(%%rax), %%ymm0 \n\t"
+ "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t"
+ "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t"
+ "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t"
+ "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t"
+ "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t"
+ "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t"
+ "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t"
+ "vbroadcastf128 14 * 8(%%rax), %%ymm0 \n\t"
+ "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t"
+ "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t"
+ "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t"
+ "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t"
+ "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t"
+ "vmovapd 8 * 32(%%rbx), %%ymm1 \n\t"
+ "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t"
+ "vmovapd 9 * 32(%%rbx), %%ymm2 \n\t"
+ "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t"
+ "vmovapd 10 * 32(%%rbx), %%ymm3 \n\t"
+ " \n\t"
+ " \n\t"
+ " \n\t"
+ " \n\t"
+ "addq $4 * 4 * 8, %%rax \n\t" // a += 4*4 (unroll x mr)
+ "addq $4 * 12 * 8, %%rbx \n\t" // b += 4*12 (unroll x nr)
+ " \n\t"
+ " \n\t"
+ "decq %%rsi \n\t" // i -= 1;
+ "jne .DLOOPKITER \n\t" // iterate again if i != 0.
+ " \n\t"
+ " \n\t"
+ " \n\t"
+ " \n\t"
+ " \n\t"
+ " \n\t"
+ ".DCONSIDKLEFT: \n\t"
+ " \n\t"
+ "movq %1, %%rsi \n\t" // i = k_left;
+ "testq %%rsi, %%rsi \n\t" // check i via logical AND.
+ "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
+ " \n\t" // else, we prepare to enter k_left loop.
+ " \n\t"
+ " \n\t"
+ ".DLOOPKLEFT: \n\t" // EDGE LOOP
+ " \n\t"
+ "prefetcht0 24 * 8(%%rax) \n\t"
+ " \n\t"
+ "vbroadcastf128 0 * 8(%%rax), %%ymm0 \n\t"
+ "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t"
+ "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t"
+ "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t"
+ "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t"
+ "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t"
+ "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t"
+ "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t"
+ "vbroadcastf128 2 * 8(%%rax), %%ymm0 \n\t"
+ "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t"
+ "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t"
+ "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t"
+ "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t"
+ "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t"
+ "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t"
+ "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t"
+ "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t"
+ "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t"
+ "vmovapd 1 * 32(%%rbx), %%ymm3 \n\t"
+ " \n\t"
+ " \n\t"
+ " \n\t"
+ "addq $1 * 4 * 8, %%rax \n\t" // a += 1*4 (unroll x mr)
+ "addq $1 * 12 * 8, %%rbx \n\t" // b += 1*12 (unroll x nr)
+ " \n\t"
+ " \n\t"
+ "decq %%rsi \n\t" // i -= 1;
+ "jne .DLOOPKLEFT \n\t" // iterate again if i != 0.
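+ " \n\t" // (At this point ymm4..ymm15 hold the unscaled 4x12 product in
+ " \n\t" // the interleaved layout noted below; .DPOSTACCUM scales by alpha
+ " \n\t" // and un-interleaves via vshufpd before updating c.)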
+ " \n\t" + " \n\t" + " \n\t" + ".DPOSTACCUM: \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate + "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate + " \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" + "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" // ymm4 : ( ab00 ab11 ab02 ab13 ) + " \n\t" // ymm7 : ( ab10 ab01 ab12 ab03 ) + " \n\t" // ymm10: ( ab20 ab31 ab22 ab33 ) + " \n\t" // ymm13: ( ab30 ab21 ab32 ab23 ) + " \n\t" + " \n\t" // ymm5 : ( ab04 ab15 ab06 ab17 ) + " \n\t" // ymm8 : ( ab14 ab05 ab16 ab07 ) + " \n\t" // ymm11: ( ab24 ab35 ab26 ab37 ) + " \n\t" // ymm14: ( ab34 ab25 ab36 ab27 ) + " \n\t" + " \n\t" // ymm6 : ( ab08 ab19 ab0A ab1B ) + " \n\t" // ymm9 : ( ab18 ab09 ab1A ab0B ) + " \n\t" // ymm12: ( ab28 ab39 ab2A ab3B ) + " \n\t" // ymm15: ( ab38 ab29 ab3A ab2B ) + "vmovapd %%ymm4, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm7, %%ymm4, %%ymm4 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm7, %%ymm7 \n\t" + " \n\t" + "vmovapd %%ymm5, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm8, %%ymm5, %%ymm5 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vmovapd %%ymm6, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm9, %%ymm6, %%ymm6 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmovapd %%ymm10, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm13, %%ymm10, %%ymm10 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmovapd %%ymm11, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm14, %%ymm11, %%ymm11 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm14, %%ymm14 \n\t" + " \n\t" + "vmovapd %%ymm12, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm15, %%ymm12, %%ymm12 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm15, %%ymm15 \n\t" + " \n\t" // ymm4 : ( ab00 ab01 ab02 ab03 ) + " \n\t" // ymm7 : ( ab10 ab11 ab12 ab13 ) + " \n\t" // ymm10: ( ab20 ab21 ab22 ab23 ) + " \n\t" // ymm13: ( ab30 ab31 ab32 ab33 ) + " \n\t" + " \n\t" // ymm5 : ( ab04 ab05 ab06 ab07 ) + " \n\t" // ymm8 : ( ab14 ab15 ab16 ab17 ) + " \n\t" // ymm11: ( ab24 ab25 ab26 ab27 ) + " \n\t" // ymm14: ( ab34 ab35 ab36 ab37 ) + " \n\t" + " \n\t" // ymm6 : ( ab08 ab09 ab0A ab0B ) + " \n\t" // ymm9 : ( ab18 ab19 ab1A ab1B ) + " \n\t" // ymm12: ( ab28 ab29 ab2A ab2B ) + " \n\t" // ymm15: ( ab38 ab39 ab3A ab3B ) + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + //"m" (rs_c), // 7 rdi + //"m" (cs_c), // 8 rsi + "movq %8, %%rsi \n\t" // load cs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(double) + " \n\t" + "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // rdx = c + 4*cs_c; + "leaq (%%rcx,%%rsi,8), %%r12 \n\t" // r12 = c + 8*cs_c; + " \n\t" + "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // determine if + " \n\t" // c % 32 == 0, AND + " \n\t" // 8*rs_c % 32 == 0, AND + " \n\t" // cs_c == 1 + " \n\t" // ie: aligned, ldim aligned, and + " \n\t" // row-stored + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); + "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. 
+ "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); + "testq $31, %%rdi \n\t" // set ZF if (8*rs_c) & 32 is zero. + "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); + " \n\t" // and(bl,bh) followed by + " \n\t" // and(bh,al) will reveal result + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + " \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. + "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case + " \n\t" + " \n\t" + " \n\t" // check if aligned/row-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. + "jne .DROWSTORED \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".DGENSTORED: \n\t" + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%r12, %%rcx \n\t" // rcx = c + 8*cs_c + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".DROWSTORED: \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm6, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm8, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm9, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm12, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + //"addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm14, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + //"addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm15, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + //"addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".DBETAZERO: \n\t" + " \n\t" // check if aligned/row-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
+ "jne .DROWSTORBZ \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".DGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovapd %%ymm4, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm7, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm10, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm13, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c + " \n\t" + " \n\t" + "vmovapd %%ymm5, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm8, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm11, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm14, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%r12, %%rcx \n\t" // rcx = c + 8*cs_c + " \n\t" + " \n\t" + "vmovapd %%ymm6, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm9, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm12, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm15, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".DROWSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovapd %%ymm4, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm5, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm6, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovapd %%ymm7, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm8, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm9, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovapd %%ymm10, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm11, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm12, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovapd %%ymm13, (%%rcx) \n\t" + //"addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm14, (%%rdx) \n\t" + //"addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm15, (%%r12) \n\t" + //"addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".DDONE: \n\t" + " \n\t" + + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); +} + +#if 0 + +void bli_cgemm_asm_ + ( + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* restrict beta, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + //dim_t k_iter = k / 4; + //dim_t k_left = k % 4; + +} + + + +void bli_zgemm_asm_ + ( + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + //void* a_next = bli_auxinfo_next_a( data ); + //void* b_next = bli_auxinfo_next_b( data ); + + //dim_t k_iter = k / 4; + //dim_t k_left = k % 4; + +} + +#endif