From 627d59b5ba06866b26f46e4434a0435b600925e3 Mon Sep 17 00:00:00 2001 From: Etienne Sauvage Date: Mon, 29 Feb 2016 21:53:12 +0100 Subject: [PATCH 01/10] symbolic link for bulldozer configuration to kernels --- config/bulldozer/kernels | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/bulldozer/kernels b/config/bulldozer/kernels index 71a8d8d34..9ed7ea19a 120000 --- a/config/bulldozer/kernels +++ b/config/bulldozer/kernels @@ -1 +1 @@ -../../kernels/x86_64/bulldozer \ No newline at end of file +../../kernels/x86_64/bulldozer/ \ No newline at end of file From 4ca5d5b1fd6f2e4a8b2e139c5405475239581e51 Mon Sep 17 00:00:00 2001 From: Etienne Sauvage Date: Tue, 1 Mar 2016 21:33:01 +0100 Subject: [PATCH 02/10] sgemm micro-kernel for FMA4 instruction set (bulldozer configuration), based on x86_64/avx micro-kernel --- config/bulldozer/bli_kernel.h | 9 +- .../x86_64/bulldozer/3/bli_gemm_4x6_FMA4.c | 927 ++++++++++++++++++ 2 files changed, 932 insertions(+), 4 deletions(-) diff --git a/config/bulldozer/bli_kernel.h b/config/bulldozer/bli_kernel.h index 388c6a1b4..c2b1e313a 100644 --- a/config/bulldozer/bli_kernel.h +++ b/config/bulldozer/bli_kernel.h @@ -51,9 +51,9 @@ // (b) MR (for zero-padding purposes when MR and NR are "swapped") // -#define BLIS_DEFAULT_MC_S 256 -#define BLIS_DEFAULT_KC_S 256 -#define BLIS_DEFAULT_NC_S 8192 +#define BLIS_DEFAULT_MC_S 128 +#define BLIS_DEFAULT_KC_S 384 +#define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 @@ -70,7 +70,7 @@ // -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 8 -#define BLIS_DEFAULT_NR_S 4 +#define BLIS_DEFAULT_NR_S 8 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 6 @@ -149,6 +149,7 @@ // -- gemm -- +#define BLIS_SGEMM_UKERNEL bli_sgemm_8x8_FMA4 #define BLIS_DGEMM_UKERNEL bli_dgemm_4x6_FMA4 // -- trsm-related -- diff --git a/kernels/x86_64/bulldozer/3/bli_gemm_4x6_FMA4.c b/kernels/x86_64/bulldozer/3/bli_gemm_4x6_FMA4.c index c140d7ced..8fc716a9e 100644 
--- a/kernels/x86_64/bulldozer/3/bli_gemm_4x6_FMA4.c
+++ b/kernels/x86_64/bulldozer/3/bli_gemm_4x6_FMA4.c
@@ -34,6 +34,933 @@
 
 #include "blis.h"
 
+void bli_sgemm_8x8_FMA4( // C := beta*C + alpha*a*b on one 8x8 float tile, AVX/FMA4 inline asm
+    dim_t k, // number of steps accumulated into the tile
+    float* restrict alpha, // scalar multiplying the accumulated a*b product
+    float* restrict a, // a panel: 8 floats per step, loaded with vmovaps (32-byte aligned)
+    float* restrict b, // b panel: 8 floats per step
+    float* restrict beta, // scalar multiplying c; c is not loaded when it compares equal to 0
+    float* restrict c, inc_t rs_c, inc_t cs_c, // output tile; row and column strides in floats
+    auxinfo_t* data // not referenced by this kernel
+    )
+{
+    dim_t k_iter = k / 4; // passes of the 4x unrolled main loop
+    dim_t k_left = k % 4; // leftover steps handled by the edge loop
+
+    __asm__ volatile
+    (
+    " \n\t"
+    "movq %2, %%rax \n\t" // load address of a.
+    "movq %3, %%rbx \n\t" // load address of b.
+    " \n\t"
+    "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading
+    "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b.
+    "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
+    " \n\t"
+    "movq %6, %%rcx \n\t" // load address of c
+    "movq %8, %%rdi \n\t" // load cs_c
+    "leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float)
+    "leaq (%%rcx,%%rdi,4), %%r10 \n\t" // load address of c + 4*cs_c;
+    " \n\t"
+    "leaq (%%rdi,%%rdi,2), %%r14 \n\t" // r14 = 3*cs_c;
+    "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c
+    "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
+    "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c
+    "prefetcht0 7 * 8(%%rcx,%%r14) \n\t" // prefetch c + 3*cs_c
+    "prefetcht0 7 * 8(%%r10) \n\t" // prefetch c + 4*cs_c
+    "prefetcht0 7 * 8(%%r10,%%rdi) \n\t" // prefetch c + 5*cs_c
+    "prefetcht0 7 * 8(%%r10,%%rdi,2) \n\t" // prefetch c + 6*cs_c
+    "prefetcht0 7 * 8(%%r10,%%r14) \n\t" // prefetch c + 7*cs_c
+    " \n\t"
+    "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" // zero the eight accumulators ymm8..ymm15
+    "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t"
+    "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t"
+    "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t"
+    "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t"
+    "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t"
+    "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t"
+    "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
+    " \n\t"
+    " \n\t"
+    "movq %0, %%rsi \n\t" // i = k_iter;
+    "testq %%rsi, %%rsi \n\t" // check i via logical AND.
+    "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that
+    " \n\t" // contains the k_left loop.
+    " \n\t"
+    ".SLOOPKITER: \n\t" // MAIN LOOP
+    " \n\t"
+    " \n\t" // iteration 0
+    "prefetcht0 16 * 32(%%rax) \n\t"
+    "vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t"
+    "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
+    "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t"
+    "vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t"
+    "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
+    " \n\t"
+    "vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
+    "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
+    "vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t"
+    "vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9 \n\t"
+    " \n\t"
+    "vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t"
+    "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
+    "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t"
+    "vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t"
+    "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
+    " \n\t"
+    "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
+    "vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t"
+    "vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t"
+    " \n\t"
+    " \n\t" // iteration 1
+    "vfmaddps %%ymm15, %%ymm1, %%ymm2, %%ymm15\n\t"
+    "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
+    "vmovshdup 1 * 32(%%rbx), %%ymm2 \n\t"
+    "vfmaddps %%ymm13, %%ymm1, %%ymm3, %%ymm13\n\t"
+    "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
+    " \n\t"
+    "vmovaps 2 * 32(%%rax), %%ymm0 \n\t"
+    "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
+    "vfmaddps %%ymm11, %%ymm1, %%ymm4, %%ymm11\n\t"
+    "vfmaddps %%ymm9, %%ymm1, %%ymm5, %%ymm9\n\t"
+    " \n\t"
+    "vfmaddps %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t"
+    "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
+    "vmovsldup 2 * 32(%%rbx), %%ymm2 \n\t"
+    "vfmaddps %%ymm12, %%ymm1, %%ymm3, %%ymm12\n\t"
+    "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
+    " \n\t"
+    "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
+    "vfmaddps %%ymm10, %%ymm1, %%ymm4, %%ymm10\n\t"
+    "vfmaddps %%ymm8, %%ymm1, %%ymm5, %%ymm8\n\t"
+    " \n\t"
+    " \n\t" // iteration 2
+    "prefetcht0 18 * 32(%%rax) \n\t"
+    "vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t"
+    "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
+    "vmovshdup 2 * 32(%%rbx), %%ymm2 \n\t"
+    "vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t"
+    "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
+    " \n\t"
+    "vmovaps 3 * 32(%%rax), %%ymm1 \n\t"
+    "addq $4 * 8 * 4, %%rax \n\t" // a += 4*8 (unroll x mr)
+    "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
+    "vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t"
+    "vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9\n\t"
+    " \n\t"
+    "vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t"
+    "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
+    "vmovsldup 3 * 32(%%rbx), %%ymm2 \n\t"
+    "vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t"
+    "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
+    " \n\t"
+    "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
+    "vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t"
+    "vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t"
+    " \n\t"
+    " \n\t" // iteration 3
+    "vfmaddps %%ymm15, %%ymm1, %%ymm2, %%ymm15\n\t"
+    "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
+    "vmovshdup 3 * 32(%%rbx), %%ymm2 \n\t"
+    "addq $4 * 8 * 4, %%rbx \n\t" // b += 4*8 (unroll x nr)
+    "vfmaddps %%ymm13, %%ymm1, %%ymm3, %%ymm13\n\t"
+    "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
+    " \n\t"
+    "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" // pre-load a for the next pass
+    "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
+    "vfmaddps %%ymm11, %%ymm1, %%ymm4, %%ymm11\n\t"
+    "vfmaddps %%ymm9, %%ymm1, %%ymm5, %%ymm9\n\t"
+    " \n\t"
+    "vfmaddps %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t"
+    "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
+    "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" // pre-load b for the next pass
+    "vfmaddps %%ymm12, %%ymm1, %%ymm3, %%ymm12\n\t"
+    "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
+    " \n\t"
+    "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
+    "vfmaddps %%ymm10, %%ymm1, %%ymm4, %%ymm10\n\t"
+    "vfmaddps %%ymm8, %%ymm1, %%ymm5, %%ymm8\n\t"
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    "decq %%rsi \n\t" // i -= 1;
+    "jne .SLOOPKITER \n\t" // iterate again if i != 0.
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    ".SCONSIDKLEFT: \n\t" // handle the k % 4 leftover steps
+    " \n\t"
+    "movq %1, %%rsi \n\t" // i = k_left;
+    "testq %%rsi, %%rsi \n\t" // check i via logical AND.
+    "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
+    " \n\t" // else, we prepare to enter k_left loop.
+    " \n\t"
+    ".SLOOPKLEFT: \n\t" // EDGE LOOP
+    " \n\t" // NOTE(review): a and b for the following step are loaded unconditionally, so one step past the consumed data is read.
+    "prefetcht0 16 * 32(%%rax) \n\t"
+    "vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t"
+    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
+    "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t"
+    "vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t"
+    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
+    " \n\t"
+    "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" // pre-load a for the next step
+    "addq $8 * 1 * 4, %%rax \n\t" // a += 8 (1 x mr)
+    "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
+    "vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t"
+    "vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9\n\t"
+    " \n\t"
+    "vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t"
+    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
+    "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t" // pre-load b for the next step
+    "addq $8 * 1 * 4, %%rbx \n\t" // b += 8 (1 x nr)
+    "vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t"
+    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" // ymm5 = lane-swapped ymm3, consumed by the ymm8 update below
+    " \n\t"
+    "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
+    "vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t"
+    "vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t"
+    "vmovaps %%ymm1, %%ymm0 \n\t"
+    " \n\t"
+    " \n\t"
+    "decq %%rsi \n\t" // i -= 1;
+    "jne .SLOOPKLEFT \n\t" // iterate again if i != 0.
+    " \n\t"
+    " \n\t"
+    ".SPOSTACCUM: \n\t" // un-permute the accumulators; layouts are shown before and after each step
+    " \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
+    " \n\t" // ( ab00  ( ab02  ( ab04  ( ab06
+    " \n\t" //   ab10    ab12    ab14    ab16
+    " \n\t" //   ab22    ab20    ab26    ab24
+    " \n\t" //   ab32    ab30    ab36    ab34
+    " \n\t" //   ab44    ab46    ab40    ab42
+    " \n\t" //   ab54    ab56    ab50    ab52
+    " \n\t" //   ab66    ab64    ab62    ab60
+    " \n\t" //   ab76 )  ab74 )  ab72 )  ab70 )
+    " \n\t"
+    " \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
+    " \n\t" // ( ab01  ( ab03  ( ab05  ( ab07
+    " \n\t" //   ab11    ab13    ab15    ab17
+    " \n\t" //   ab23    ab21    ab27    ab25
+    " \n\t" //   ab33    ab31    ab37    ab35
+    " \n\t" //   ab45    ab47    ab41    ab43
+    " \n\t" //   ab55    ab57    ab51    ab53
+    " \n\t" //   ab67    ab65    ab63    ab61
+    " \n\t" //   ab77 )  ab75 )  ab73 )  ab71 )
+    "vmovaps %%ymm15, %%ymm7 \n\t"
+    "vshufps $0xe4, %%ymm13, %%ymm15, %%ymm15 \n\t"
+    "vshufps $0xe4, %%ymm7, %%ymm13, %%ymm13 \n\t"
+    " \n\t"
+    "vmovaps %%ymm11, %%ymm7 \n\t"
+    "vshufps $0xe4, %%ymm9, %%ymm11, %%ymm11 \n\t"
+    "vshufps $0xe4, %%ymm7, %%ymm9, %%ymm9 \n\t"
+    " \n\t"
+    "vmovaps %%ymm14, %%ymm7 \n\t"
+    "vshufps $0xe4, %%ymm12, %%ymm14, %%ymm14 \n\t"
+    "vshufps $0xe4, %%ymm7, %%ymm12, %%ymm12 \n\t"
+    " \n\t"
+    "vmovaps %%ymm10, %%ymm7 \n\t"
+    "vshufps $0xe4, %%ymm8, %%ymm10, %%ymm10 \n\t"
+    "vshufps $0xe4, %%ymm7, %%ymm8, %%ymm8 \n\t"
+    " \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
+    " \n\t" // ( ab00  ( ab02  ( ab04  ( ab06
+    " \n\t" //   ab10    ab12    ab14    ab16
+    " \n\t" //   ab20    ab22    ab24    ab26
+    " \n\t" //   ab30    ab32    ab34    ab36
+    " \n\t" //   ab44    ab46    ab40    ab42
+    " \n\t" //   ab54    ab56    ab50    ab52
+    " \n\t" //   ab64    ab66    ab60    ab62
+    " \n\t" //   ab74 )  ab76 )  ab70 )  ab72 )
+    " \n\t"
+    " \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
+    " \n\t" // ( ab01  ( ab03  ( ab05  ( ab07
+    " \n\t" //   ab11    ab13    ab15    ab17
+    " \n\t" //   ab21    ab23    ab25    ab27
+    " \n\t" //   ab31    ab33    ab35    ab37
+    " \n\t" //   ab45    ab47    ab41    ab43
+    " \n\t" //   ab55    ab57    ab51    ab53
+    " \n\t" //   ab65    ab67    ab61    ab63
+    " \n\t" //   ab75 )  ab77 )  ab71 )  ab73 )
+    "vmovaps %%ymm15, %%ymm7 \n\t"
+    "vperm2f128 $0x30, %%ymm11, %%ymm15, %%ymm15 \n\t"
+    "vperm2f128 $0x12, %%ymm11, %%ymm7, %%ymm11 \n\t"
+    " \n\t"
+    "vmovaps %%ymm13, %%ymm7 \n\t"
+    "vperm2f128 $0x30, %%ymm9, %%ymm13, %%ymm13 \n\t"
+    "vperm2f128 $0x12, %%ymm9, %%ymm7, %%ymm9 \n\t"
+    " \n\t"
+    "vmovaps %%ymm14, %%ymm7 \n\t"
+    "vperm2f128 $0x30, %%ymm10, %%ymm14, %%ymm14 \n\t"
+    "vperm2f128 $0x12, %%ymm10, %%ymm7, %%ymm10 \n\t"
+    " \n\t"
+    "vmovaps %%ymm12, %%ymm7 \n\t"
+    "vperm2f128 $0x30, %%ymm8, %%ymm12, %%ymm12 \n\t"
+    "vperm2f128 $0x12, %%ymm8, %%ymm7, %%ymm8 \n\t"
+    " \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
+    " \n\t" // ( ab00  ( ab02  ( ab04  ( ab06
+    " \n\t" //   ab10    ab12    ab14    ab16
+    " \n\t" //   ab20    ab22    ab24    ab26
+    " \n\t" //   ab30    ab32    ab34    ab36
+    " \n\t" //   ab40    ab42    ab44    ab46
+    " \n\t" //   ab50    ab52    ab54    ab56
+    " \n\t" //   ab60    ab62    ab64    ab66
+    " \n\t" //   ab70 )  ab72 )  ab74 )  ab76 )
+    " \n\t"
+    " \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
+    " \n\t" // ( ab01  ( ab03  ( ab05  ( ab07
+    " \n\t" //   ab11    ab13    ab15    ab17
+    " \n\t" //   ab21    ab23    ab25    ab27
+    " \n\t" //   ab31    ab33    ab35    ab37
+    " \n\t" //   ab41    ab43    ab45    ab47
+    " \n\t" //   ab51    ab53    ab55    ab57
+    " \n\t" //   ab61    ab63    ab65    ab67
+    " \n\t" //   ab71 )  ab73 )  ab75 )  ab77 )
+    " \n\t"
+    "movq %4, %%rax \n\t" // load address of alpha
+    "movq %5, %%rbx \n\t" // load address of beta
+    "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate
+    "vbroadcastss (%%rbx), %%ymm4 \n\t" // load beta and duplicate
+    " \n\t"
+    "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" // scale by alpha
+    "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t"
+    "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t"
+    "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t"
+    "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t"
+    "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t"
+    "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t"
+    "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t"
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    "movq %7, %%rsi \n\t" // load rs_c
+    "leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float)
+    " \n\t"
+    "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c;
+    " \n\t"
+    "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c;
+    "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // determine if
+    " \n\t" // c % 32 == 0, AND
+    " \n\t" // 4*cs_c % 32 == 0, AND
+    " \n\t" // rs_c == 1
+    " \n\t" // ie: aligned, ldim aligned, and
+    " \n\t" // column-stored
+    " \n\t"
+    "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4.
+    "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
+    "testq $31, %%rcx \n\t" // set ZF if (c & 31) == 0.
+    "setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 );
+    "testq $31, %%rdi \n\t" // set ZF if ((4*cs_c) & 31) == 0.
+    "setz %%al \n\t" // al = ( ZF == 1 ? 1 : 0 );
+    " \n\t" // and(bl,bh) followed by
+    " \n\t" // and(bh,al) will reveal result
+    " \n\t"
+    " \n\t" // now avoid loading C if beta == 0
+    " \n\t"
+    "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
+    "vucomiss %%xmm0, %%xmm4 \n\t" // set ZF if beta == 0.
+    "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
+    " \n\t"
+    " \n\t"
+    " \n\t" // check if aligned/column-stored
+    "andb %%bl, %%bh \n\t" // bh &= bl; ZF is cleared only if both were 1.
+    "andb %%bh, %%al \n\t" // al &= bh; ZF is cleared only if all three were 1.
+    "jne .SCOLSTORED \n\t" // jump to column storage case
+    " \n\t"
+    " \n\t" // NOTE(review): vmovlps/vmovhps below load 8 bytes at each element of c; only the low float is kept by vshufps $0x88.
+    ".SGENSTORED: \n\t" // general-stride update of c
+    " \n\t" // update c00:c70
+    "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
+    "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
+    "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
+    "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
+    "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
+    "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
+    "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
+    "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
+    "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
+    " \n\t"
+    "vfmaddps %%ymm15, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result,
+    " \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t" // update c01:c71
+    "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
+    "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
+    "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
+    "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
+    "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
+    "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
+    "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
+    "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
+    "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
+    " \n\t"
+    "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
+    "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result,
+    " \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c02:c72
+    "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
+    "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
+    "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
+    "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
+    "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
+    "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
+    "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
+    "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
+    "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
+    " \n\t"
+    "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
+    "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result,
+    " \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c03:c73
+    "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
+    "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
+    "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
+    "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
+    "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
+    "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
+    "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
+    "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
+    "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
+    " \n\t"
+    "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
+    "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result,
+    " \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c04:c74
+    "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
+    "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
+    "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
+    "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
+    "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
+    "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
+    "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
+    "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
+    "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
+    " \n\t"
+    "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
+    "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result,
+    " \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c05:c75
+    "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
+    "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
+    "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
+    "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
+    "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
+    "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
+    "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
+    "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
+    "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
+    " \n\t"
+    "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
+    "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result,
+    " \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c06:c76
+    "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
+    "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
+    "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
+    "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
+    "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
+    "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
+    "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
+    "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
+    "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
+    " \n\t"
+    "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
+    "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result,
+    " \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c07:c77
+    "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
+    "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
+    "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
+    "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
+    "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
+    "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
+    "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
+    "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
+    "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
+    "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
+    " \n\t"
+    "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
+    "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result,
+    " \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    "jmp .SDONE \n\t" // jump to end.
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    ".SCOLSTORED: \n\t" // aligned, contiguous-column update of c
+    " \n\t"
+    " \n\t"
+    "vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70,
+    "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
+    "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result,
+    "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps (%%rcx), %%ymm1 \n\t" // load c01:c71,
+    "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
+    "vaddps %%ymm14, %%ymm1, %%ymm1 \n\t" // add the gemm result,
+    "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72,
+    "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
+    "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result,
+    "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps (%%rcx), %%ymm1 \n\t" // load c03:c73,
+    "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
+    "vaddps %%ymm12, %%ymm1, %%ymm1 \n\t" // add the gemm result,
+    "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps (%%rcx), %%ymm0 \n\t" // load c04:c74,
+    "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
+    "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result,
+    "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps (%%rcx), %%ymm1 \n\t" // load c05:c75,
+    "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
+    "vaddps %%ymm10, %%ymm1, %%ymm1 \n\t" // add the gemm result,
+    "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps (%%rcx), %%ymm0 \n\t" // load c06:c76,
+    "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
+    "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result,
+    "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps (%%rcx), %%ymm1 \n\t" // load c07:c77,
+    "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
+    "vaddps %%ymm8, %%ymm1, %%ymm1 \n\t" // add the gemm result,
+    "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
+    " \n\t"
+    " \n\t"
+    "jmp .SDONE \n\t" // jump to end.
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    ".SBETAZERO: \n\t" // beta == 0: c is written without being loaded
+    " \n\t" // check if aligned/column-stored
+    "andb %%bl, %%bh \n\t" // bh &= bl; ZF is cleared only if both were 1.
+    "andb %%bh, %%al \n\t" // al &= bh; ZF is cleared only if all three were 1.
+    "jne .SCOLSTORBZ \n\t" // jump to column storage case
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    ".SGENSTORBZ: \n\t" // general-stride store
+    " \n\t"
+    " \n\t" // update c00:c70
+    "vmovapd %%ymm15, %%ymm0 \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c01:c71
+    "vmovapd %%ymm14, %%ymm0 \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c02:c72
+    "vmovapd %%ymm13, %%ymm0 \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c03:c73
+    "vmovapd %%ymm12, %%ymm0 \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c04:c74
+    "vmovapd %%ymm11, %%ymm0 \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c05:c75
+    "vmovapd %%ymm10, %%ymm0 \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c06:c76
+    "vmovapd %%ymm9, %%ymm0 \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    "addq %%rdi, %%rdx \n\t" // c += cs_c;
+    " \n\t"
+    " \n\t"
+    " \n\t" // update c07:c77
+    "vmovapd %%ymm8, %%ymm0 \n\t"
+    "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
+    "vmovss %%xmm0, (%%rcx) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
+    "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
+    "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
+    "vmovss %%xmm2, (%%rdx) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
+    "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
+    "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
+    "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
+    "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
+    " \n\t"
+    " \n\t"
+    "jmp .SDONE \n\t" // jump to end.
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    ".SCOLSTORBZ: \n\t" // aligned, contiguous-column store
+    " \n\t"
+    " \n\t"
+    "vmovaps %%ymm15, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps %%ymm14, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps %%ymm13, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps %%ymm12, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps %%ymm11, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps %%ymm10, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps %%ymm9, (%%rcx) \n\t" // and store back to memory.
+    "addq %%rdi, %%rcx \n\t" // c += cs_c;
+    " \n\t"
+    "vmovaps %%ymm8, (%%rcx) \n\t" // and store back to memory.
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    " \n\t"
+    ".SDONE: \n\t"
+    " \n\t"
+
+    : // output operands (none)
+    : // input operands
+      "m" (k_iter), // 0
+      "m" (k_left), // 1
+      "m" (a), // 2
+      "m" (b), // 3
+      "m" (alpha), // 4
+      "m" (beta), // 5
+      "m" (c), // 6
+      "m" (rs_c), // 7
+      "m" (cs_c)/*, // 8
+      "m" (b_next), // 9
+      "m" (a_next)*/ // 10
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "memory"
+    );
+}
+
 #undef KERNEL4x6_1
 #undef KERNEL4x6_2
 #undef KERNEL4x6_3

From af92773f4f85a2441fe0c6e3a52c31b07253d08e Mon Sep 17 00:00:00 2001
From: figual
Date: Wed, 23 Mar 2016 22:07:02 +0100
Subject: [PATCH 03/10] Updated and improved ARMv8 micro-kernels.

--- config/armv8a/bli_kernel.h | 26 +- kernels/armv8a/neon/3/bli_gemm_opt_4x4.c | 2109 +++++++++++++++++----- 2 files changed, 1701 insertions(+), 434 deletions(-) diff --git a/config/armv8a/bli_kernel.h b/config/armv8a/bli_kernel.h index 3bd7da722..38eaef60d 100644 --- a/config/armv8a/bli_kernel.h +++ b/config/armv8a/bli_kernel.h @@ -51,13 +51,13 @@ // (b) MR (for zero-padding purposes when MR and NR are "swapped") // -#define BLIS_DEFAULT_MC_S 336 -#define BLIS_DEFAULT_KC_S 336 -#define BLIS_DEFAULT_NC_S 4096 +#define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336 +#define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528 +#define BLIS_DEFAULT_NC_S 3072 -#define BLIS_DEFAULT_MC_D 160 -#define BLIS_DEFAULT_KC_D 304 -#define BLIS_DEFAULT_NC_D 4096 +#define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176 +#define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368 +#define BLIS_DEFAULT_NC_D 3072 #define BLIS_DEFAULT_MC_C 64 #define BLIS_DEFAULT_KC_C 128 @@ -69,11 +69,11 @@ // -- Register blocksizes -- -#define BLIS_DEFAULT_MR_S 4 -#define BLIS_DEFAULT_NR_S 4 +#define BLIS_DEFAULT_MR_S 8 +#define BLIS_DEFAULT_NR_S 12 -#define BLIS_DEFAULT_MR_D 4 -#define BLIS_DEFAULT_NR_D 4 +#define BLIS_DEFAULT_MR_D 6 +#define BLIS_DEFAULT_NR_D 8 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 @@ -132,6 +132,8 @@ //#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...) //#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...) 
+ + // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- @@ -146,8 +148,8 @@ // -- gemm -- -#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4 -#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4 +#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12 +#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8 // -- trsm-related -- diff --git a/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c b/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c index 2a54fe825..e010d188f 100644 --- a/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c +++ b/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c @@ -36,9 +36,21 @@ #include "blis.h" /* + o 4x4 Single precision micro-kernel fully functional. + o Runnable on ARMv8, compiled with aarch64 GCC. + o Use it together with the armv8 BLIS configuration. o Tested on Juno board. Around 7.3 GFLOPS @ 1.1 GHz. + + December 2014. + + * UPDATE NOVEMBER 2015 + * Micro-kernel changed to 8x12 + * Tested on Juno Board. Around 8.1 GFLOPS, 1 x A57 core @ 1.1 GHz. + * Tested on Juno Board. Around 15.9 GFLOPS, 2 x A57 cores @ 1.1 GHz. + * Tested on Juno board. Around 3.1 GFLOPS, 1 x A53 core @ 850 MHz. + * Tested on Juno board. Around 12 GFLOPS, 4 x A53 cores @ 850 MHz. */ -void bli_sgemm_opt_4x4( +void bli_sgemm_opt_8x12( dim_t k, float* restrict alpha, float* restrict a, @@ -50,9 +62,9 @@ void bli_sgemm_opt_4x4( { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); - + dim_t k_iter = k / 4; - dim_t k_left = k % 4; + dim_t k_left = k % 4; __asm__ volatile ( @@ -62,10 +74,8 @@ __asm__ volatile " ldr x1,%[baddr] \n\t" // Load address of B. " ldr x2,%[caddr] \n\t" // Load address of C. " \n\t" -" mov x4,#1 \n\t" // Init loop counter (i=0). -" \n\t" -" ldr x16,%[a_next] \n\t" // Pointer to next block of A. -" ldr x17,%[b_next] \n\t" // Pointer to next pointer of B. +" ldr x3,%[a_next] \n\t" // Pointer to next block of A. +" ldr x4,%[b_next] \n\t" // Pointer to next pointer of B. " \n\t" " ldr x5,%[k_iter] \n\t" // Number of unrolled iterations (k_iter). 
" ldr x6,%[k_left] \n\t" // Number of remaining iterations (k_left). @@ -75,157 +85,367 @@ __asm__ volatile " \n\t" " ldr x9,%[cs_c] \n\t" // Load cs_c. " lsl x10,x9,#2 \n\t" // cs_c * sizeof(float) -- AUX. -" lsl x11,x9,#3 \n\t" // 2 * cs_c * sizeof(float) -- AUX. -" lsl x12,x9,#4 \n\t" // 3 * cs_c * sizeof(float) -- AUX. " \n\t" " ldr x13,%[rs_c] \n\t" // Load rs_c. " lsl x14,x13,#2 \n\t" // rs_c * sizeof(float). -" \n\t" -" ldp q0,q1,[x0,0] \n\t" // Preload columns a,a+1 into two quads. -" ldp q4,q5,[x1,0] \n\t" // Preload rows b,b+1 into two quads. " \n\t" -" prfm pldl1keep,[x2,0] \n\t" // Prefetch c. -" prfm pldl1keep,[x2,x10] \n\t" // Prefetch c. -" prfm pldl1keep,[x2,x11] \n\t" // Prefetch c. -" prfm pldl1keep,[x2,x12] \n\t" // Prefetch c. +" add x16,x2,x10 \n\t" //Load address Column 1 of C +" add x17,x16,x10 \n\t" //Load address Column 2 of C +" add x18,x17,x10 \n\t" //Load address Column 3 of C +" add x19,x18,x10 \n\t" //Load address Column 4 of C +" add x20,x19,x10 \n\t" //Load address Column 5 of C +" add x21,x20,x10 \n\t" //Load address Column 6 of C +" add x22,x21,x10 \n\t" //Load address Column 7 of C +" add x23,x22,x10 \n\t" //Load address Column 8 of C +" add x24,x23,x10 \n\t" //Load address Column 9 of C +" add x25,x24,x10 \n\t" //Load address Column 10 of C +" add x26,x25,x10 \n\t" //Load address Column 11 of C " \n\t" -" \n\t" // Vectors for result columns. -" movi v8.4s,#0 \n\t" // Vector for result column 0. -" movi v9.4s,#0 \n\t" // Vector for result column 1. -" movi v10.4s,#0 \n\t" // Vector for result column 2. -" movi v11.4s,#0 \n\t" // Vector for result column 3. +" ldr q0, [x0] \n\t" +" ldr q1, [x0, #16] \n\t" // Load a " \n\t" -" \n\t" // Replicating accum. vectors for unrolling. -" movi v12.4s,#0 \n\t" // Vector 1 for accummulating column 0. -" movi v13.4s,#0 \n\t" // Vector 1 for accummulating column 1. -" movi v14.4s,#0 \n\t" // Vector 1 for accummulating column 2. -" movi v15.4s,#0 \n\t" // Vector 1 for accummulating column 3. 
+" ldr q2, [x1] \n\t" // Load b +" ldr q3, [x1, #16] \n\t" +" ldr q4, [x1, #32] \n\t" " \n\t" -" movi v16.4s,#0 \n\t" // Vector 2 for accummulating column 0. -" movi v17.4s,#0 \n\t" // Vector 2 for accummulating column 1. -" movi v18.4s,#0 \n\t" // Vector 2 for accummulating column 2. -" movi v19.4s,#0 \n\t" // Vector 2 for accummulating column 3. +" prfm pldl1keep,[x2] \n\t" // Prefetch c. +" prfm pldl1keep,[x16] \n\t" // Prefetch c. +" prfm pldl1keep,[x17] \n\t" // Prefetch c. +" prfm pldl1keep,[x18] \n\t" // Prefetch c. +" prfm pldl1keep,[x19] \n\t" // Prefetch c. +" prfm pldl1keep,[x20] \n\t" // Prefetch c. +" prfm pldl1keep,[x21] \n\t" // Prefetch c. +" prfm pldl1keep,[x22] \n\t" // Prefetch c. +" prfm pldl1keep,[x23] \n\t" // Prefetch c. +" prfm pldl1keep,[x24] \n\t" // Prefetch c. +" prfm pldl1keep,[x25] \n\t" // Prefetch c. +" prfm pldl1keep,[x26] \n\t" // Prefetch c. " \n\t" -" movi v20.4s,#0 \n\t" // Vector 3 for accummulating column 0. -" movi v21.4s,#0 \n\t" // Vector 3 for accummulating column 1. -" movi v22.4s,#0 \n\t" // Vector 3 for accummulating column 2. -" movi v23.4s,#0 \n\t" // Vector 3 for accummulating column 3. +" dup v8.4s, wzr \n\t" // Vector for accummulating column 0 +" prfm PLDL1KEEP, [x1, #192] \n\t" +" dup v9.4s, wzr \n\t" // Vector for accummulating column 0 +" prfm PLDL1KEEP, [x1, #256] \n\t" +" dup v10.4s, wzr \n\t" // Vector for accummulating column 1 +" prfm PLDL1KEEP, [x1, #320] \n\t" +" dup v11.4s, wzr \n\t" // Vector for accummulating column 1 +" dup v12.4s, wzr \n\t" // Vector for accummulating column 2 +" dup v13.4s, wzr \n\t" // Vector for accummulating column 2 " \n\t" -" movi v24.4s,#0 \n\t" // Vector 4 for accummulating column 0. -" movi v25.4s,#0 \n\t" // Vector 4 for accummulating column 1. -" movi v26.4s,#0 \n\t" // Vector 4 for accummulating column 2. -" movi v27.4s,#0 \n\t" // Vector 4 for accummulating column 3. 
+" dup v14.4s, wzr \n\t" // Vector for accummulating column 3 +" prfm PLDL1KEEP, [x0, #128] \n\t" +" dup v15.4s, wzr \n\t" // Vector for accummulating column 3 +" prfm PLDL1KEEP, [x0, #192] \n\t" +" dup v16.4s, wzr \n\t" // Vector for accummulating column 4 +" dup v17.4s, wzr \n\t" // Vector for accummulating column 4 +" dup v18.4s, wzr \n\t" // Vector for accummulating column 5 +" dup v19.4s, wzr \n\t" // Vector for accummulating column 5 " \n\t" -" ld1r {v31.4s},[x8] \n\t" // Load beta into quad. +" dup v20.4s, wzr \n\t" // Vector for accummulating column 6 +" dup v21.4s, wzr \n\t" // Vector for accummulating column 6 +" dup v22.4s, wzr \n\t" // Vector for accummulating column 7 +" dup v23.4s, wzr \n\t" // Vector for accummulating column 7 +" dup v24.4s, wzr \n\t" // Vector for accummulating column 8 +" dup v25.4s, wzr \n\t" // Vector for accummulating column 8 +" \n\t" +" dup v26.4s, wzr \n\t" // Vector for accummulating column 9 +" dup v27.4s, wzr \n\t" // Vector for accummulating column 9 +" dup v28.4s, wzr \n\t" // Vector for accummulating column 10 +" dup v29.4s, wzr \n\t" // Vector for accummulating column 10 +" dup v30.4s, wzr \n\t" // Vector for accummulating column 11 +" dup v31.4s, wzr \n\t" // Vector for accummulating column 11 " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. " beq .SCONSIDERKLEFT \n\t" " \n\t" +"add x0, x0, #32 \n\t" //update address of A +"add x1, x1, #48 \n\t" //update address of B +" \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. " beq .SLASTITER \n\t" // (as loop is do-while-like). " \n\t" " .SLOOPKITER: \n\t" // Body of the k_iter loop. " \n\t" -" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch. -" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch. +" ldr q5, [x0] \n\t" +" fmla v8.4s, v0.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s, v1.4s,v2.s[0] \n\t" // Accummulate. +" ldr q6, [x0, #16] \n\t" +" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. 
+" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1] \n\t" " \n\t" -" fmla v12.4s,v0.4s,v4.s[0] \n\t" // Accummulate. -" fmla v13.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. +" prfm PLDL1KEEP, [x1, #336] \n\t" +" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. +" prfm PLDL1KEEP, [x1, #400] \n\t" +" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. +" prfm PLDL1KEEP, [x1, #464] \n\t" +" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" ldp q6,q7,[x1,32] \n\t" // Load rows b+2,b+3 into quads. +" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #16] \n\t" " \n\t" -" fmla v14.4s,v0.4s,v4.s[2] \n\t" // Accummulate. -" fmla v15.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #32] \n\t" +" \n\t" //End It 1 " \n\t" -" ldp q2,q3,[x0,32] \n\t" // Load columns a+2,a+3 into quads. +" ldr q0, [x0, #32] \n\t" +" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. +" ldr q1, [x0, #48] \n\t" +" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. 
+" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1, #48] \n\t" " \n\t" -" fmla v16.4s,v1.4s,v5.s[0] \n\t" // Accummulate. -" fmla v17.4s,v1.4s,v5.s[1] \n\t" // Accummulate. +" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. +" prfm PLDL1KEEP, [x0, #224] \n\t" +" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. +" prfm PLDL1KEEP, [x0, #288] \n\t" +" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" fmla v18.4s,v1.4s,v5.s[2] \n\t" // Accummulate. -" fmla v19.4s,v1.4s,v5.s[3] \n\t" // Accummulate. +" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #64] \n\t" " \n\t" -" add x0,x0,64 \n\t" // Update a_ptr. -" add x1,x1,64 \n\t" // Update b_ptr. +" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #80] \n\t" +" \n\t" //End It 2 " \n\t" -" fmla v20.4s,v2.4s,v6.s[0] \n\t" // Accummulate. -" fmla v21.4s,v2.4s,v6.s[1] \n\t" // Accummulate. +" ldr q5, [x0, #64] \n\t" +" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. +" ldr q6, [x0, #80] \n\t" +" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. 
+" ldr q2, [x1, #96] \n\t" " \n\t" -" ldp q0,q1,[x0] \n\t" // Load columns a,a+1 into quads (next iteration). +" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" fmla v22.4s,v2.4s,v6.s[2] \n\t" // Accummulate. -" fmla v23.4s,v2.4s,v6.s[3] \n\t" // Accummulate. +" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #112] \n\t" " \n\t" -" ldp q4,q5,[x1] \n\t" // Load rows b,b+1 into quads (next iteration). +" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #128] \n\t" +" \n\t" //End It 3 " \n\t" -" fmla v24.4s,v3.4s,v7.s[0] \n\t" // Accummulate. -" fmla v25.4s,v3.4s,v7.s[1] \n\t" // Accummulate. +" ldr q0, [x0, #96] \n\t" +" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. +" ldr q1, [x0, #112] \n\t" +" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1, #144] \n\t" " \n\t" -" prfm pldl1keep,[x0,#64] \n\t" // Prefetch. -" prfm pldl1keep,[x1,#64] \n\t" // Prefetch. +" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. 
+" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" fmla v26.4s,v3.4s,v7.s[2] \n\t" // Accummulate. -" fmla v27.4s,v3.4s,v7.s[3] \n\t" // Accummulate. +" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #160] \n\t" " \n\t" +" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #176] \n\t" +" add x1, x1, #192 \n\t" +" add x0, x0, #128 \n\t" +" \n\t" //End It 4 " sub x5,x5,1 \n\t" // i-=1. " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. " bne .SLOOPKITER \n\t" " \n\t" -//" prfm pldl1keep,[x0,#1024] \n\t" -//" prfm pldl1keep,[x1,#1024] \n\t" -" \n\t" " .SLASTITER: \n\t" // Last iteration of k_iter loop. " \n\t" -" fmla v12.4s,v0.4s,v4.s[0] \n\t" // Accummulate. -" fmla v13.4s,v0.4s,v4.s[1] \n\t" // Accummulate. " \n\t" -" ldp q6,q7,[x1,32] \n\t" // Load rows b+2,b+3 into quads. +" ldr q5, [x0] \n\t" +" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. +" ldr q6, [x0, #16] \n\t" +" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1] \n\t" " \n\t" -" fmla v14.4s,v0.4s,v4.s[2] \n\t" // Accummulate. -" fmla v15.4s,v0.4s,v4.s[3] \n\t" // Accummulate. 
+" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" ldp q2,q3,[x0,32] \n\t" // Load columns a+2,a+3 into quads. +" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #16] \n\t" " \n\t" -" fmla v16.4s,v1.4s,v5.s[0] \n\t" // Accummulate. -" fmla v17.4s,v1.4s,v5.s[1] \n\t" // Accummulate. +" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #32] \n\t" +" \n\t" //End It 1 " \n\t" -" ld1r {v30.4s},[x7] \n\t" // Load alpha. +" ldr q0, [x0, #32] \n\t" +" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. +" ldr q1, [x0, #48] \n\t" +" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1, #48] \n\t" " \n\t" -" fmla v18.4s,v1.4s,v5.s[2] \n\t" // Accummulate. -" fmla v19.4s,v1.4s,v5.s[3] \n\t" // Accummulate. +" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. 
+" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" fmla v20.4s,v2.4s,v6.s[0] \n\t" // Accummulate. -" fmla v21.4s,v2.4s,v6.s[1] \n\t" // Accummulate. +" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #64] \n\t" " \n\t" -" fmla v22.4s,v2.4s,v6.s[2] \n\t" // Accummulate. -" fmla v23.4s,v2.4s,v6.s[3] \n\t" // Accummulate. +" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #80] \n\t" +" \n\t" //End It 2 " \n\t" -" fmla v24.4s,v3.4s,v7.s[0] \n\t" // Accummulate. -" fmla v25.4s,v3.4s,v7.s[1] \n\t" // Accummulate. +" ldr q5, [x0, #64] \n\t" +" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. +" ldr q6, [x0, #80] \n\t" +" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1, #96] \n\t" " \n\t" -" fmla v26.4s,v3.4s,v7.s[2] \n\t" // Accummulate. -" fmla v27.4s,v3.4s,v7.s[3] \n\t" // Accummulate. +" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. 
" \n\t" -//" ld1 {v8.4s},[x2],x10 \n\t" // Load c into quad and increment by cs_c -//" ld1 {v9.4s},[x2],x10 \n\t" // Load c+4 into quad and increment by cs_c -//" ld1 {v10.4s},[x2],x10 \n\t" // Load c+8 into quad and increment by cs_c -//" ld1 {v11.4s},[x2],x10 \n\t" // Load c+16 into quad and increment by cs_c +" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #112] \n\t" " \n\t" -" fadd v12.4s,v12.4s,v16.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v13.4s,v13.4s,v17.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v14.4s,v14.4s,v18.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v15.4s,v15.4s,v19.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v12.4s,v12.4s,v20.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v13.4s,v13.4s,v21.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v14.4s,v14.4s,v22.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v15.4s,v15.4s,v23.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v12.4s,v12.4s,v24.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v13.4s,v13.4s,v25.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v14.4s,v14.4s,v26.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v15.4s,v15.4s,v27.4s \n\t" // Final accummulate of temporal accum. vectors. +" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #128] \n\t" +" \n\t" //End It 3 " \n\t" -" add x0,x0,64 \n\t" // Update a_ptr. -" add x1,x1,64 \n\t" // Update b_ptr. +" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. 
+" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. +" \n\t" +" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. +" \n\t" +" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. +" \n\t" +" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. +" add x1, x1, #144 \n\t" +" add x0, x0, #96 \n\t" +" \n\t" //End It 4 " \n\t" " .SCONSIDERKLEFT: \n\t" " cmp x6,0 \n\t" // If k_left == 0, we are done. @@ -233,165 +453,595 @@ __asm__ volatile " \n\t" " .SLOOPKLEFT: \n\t" // Body of the left iterations " \n\t" -" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch. -" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch. +" ldr q0, [x0],#16 \n\t" +" ldr q1, [x0],#16 \n\t" // Load a " \n\t" -" ldr q0,[x0] \n\t" // Load a into quad (next iteration). -" ldr q4,[x1] \n\t" // Load b into quad (next iteration). -" \n\t" -" add x0,x0,16 \n\t" // Update a_ptr. -" add x1,x1,16 \n\t" // Update b_ptr. +" ldr q2, [x1],#16 \n\t" // Load b +" ldr q3, [x1],#16 \n\t" +" ldr q4, [x1],#16 \n\t" " \n\t" " sub x6,x6,1 \n\t" // i = i-1. " \n\t" -" fmla v12.4s,v0.4s,v4.s[0] \n\t" // Accummulate. 
-" fmla v13.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. +" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. " \n\t" -" fmla v14.4s,v0.4s,v4.s[2] \n\t" // Accummulate. -" fmla v15.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. +" \n\t" +" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. " \n\t" " cmp x6,0 \n\t" // Iterate again. " bne .SLOOPKLEFT \n\t" // if i!=0. " \n\t" -" ld1r {v30.4s},[x7] \n\t" // Load alpha. -" \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" \n\t" " .SPOSTACCUM: \n\t" +" \n\t" +" ld1r {v6.4s},[x7] \n\t" // Load alpha. +" ld1r {v7.4s},[x8] \n\t" // Load beta +" \n\t" " cmp x13,#1 \n\t" // If rs_c != 1 (column-major) " bne .SGENSTORED \n\t" " \n\t" -" \n\t" " .SCOLSTORED: \n\t" // C is column-major. " \n\t" -" fcmp s31,#0.0 \n\t" -" beq .BETAZEROCOLSTORED \n\t" // Taking care of the beta==0 case. 
+" dup v0.4s, wzr \n\t" +" dup v1.4s, wzr \n\t" +" dup v2.4s, wzr \n\t" +" dup v3.4s, wzr \n\t" +" dup v4.4s, wzr \n\t" +" dup v5.4s, wzr \n\t" " \n\t" -" \n\t" // If beta!=0, then we can read from C. -" ld1 {v8.4s},[x2],x10 \n\t" // Load c into quad and increment by cs_c. -" ld1 {v9.4s},[x2],x10 \n\t" // Load c+4 into quad and increment by cs_c. -" ld1 {v10.4s},[x2],x10 \n\t" // Load c+8 into quad and increment by cs_c. -" ld1 {v11.4s},[x2],x10 \n\t" // Load c+16 into quad and increment by cs_c. +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. " \n\t" -" prfm pldl1keep,[x16,0] \n\t" // Prefetch. -" prfm pldl1keep,[x17,0] \n\t" // Prefetch. +" ldr q0, [x2] \n\t" //Load column 0 of C +" ldr q1, [x2, #16] \n\t" +" ldr q2, [x16] \n\t" //Load column 1 of C +" ldr q3, [x16, #16] \n\t" +" ldr q4, [x17] \n\t" //Load column 2 of C +" ldr q5, [x17, #16] \n\t" " \n\t" +" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta +" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta +" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta +" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta +" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta +" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" fmul v8.4s,v8.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v9.4s,v9.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v10.4s,v10.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v11.4s,v11.4s,v31.s[0] \n\t" // Scale by beta. +" .SBETAZEROCOLSTOREDS1: \n\t" " \n\t" -" .BETAZEROCOLSTORED: \n\t" // If beta==0, we won't read from C (nor scale). +" fmla v0.4s,v8.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v1.4s,v9.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v2.4s,v10.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v3.4s,v11.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. 
+" str q0, [x2] \n\t" //Store column 0 of C +" str q1, [x2, #16] \n\t" +" str q2, [x16] \n\t" //Store column 1 of C +" str q3, [x16, #16] \n\t" +" str q4, [x17] \n\t" //Store column 2 of C +" str q5, [x17, #16] \n\t" " \n\t" -" fmla v8.4s,v12.4s,v30.s[0] \n\t" // Scale by alpha -" fmla v9.4s,v13.4s,v30.s[0] \n\t" // Scale by alpha -" fmla v10.4s,v14.4s,v30.s[0] \n\t" // Scale by alpha -" fmla v11.4s,v15.4s,v30.s[0] \n\t" // Scale by alpha +" dup v8.4s, wzr \n\t" +" dup v9.4s, wzr \n\t" +" dup v10.4s, wzr \n\t" +" dup v11.4s, wzr \n\t" +" dup v12.4s, wzr \n\t" +" dup v13.4s, wzr \n\t" +" \n\t" +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr q8, [x18] \n\t" //Load column 3 of C +" ldr q9, [x18, #16] \n\t" +" ldr q10, [x19] \n\t" //Load column 4 of C +" ldr q11, [x19, #16] \n\t" +" ldr q12, [x20] \n\t" //Load column 5 of C +" ldr q13, [x20, #16] \n\t" +" \n\t" +" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta +" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta +" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta +" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta +" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta +" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROCOLSTOREDS2: \n\t" +" \n\t" +" fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v10.4s,v16.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v11.4s,v17.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" str q8, [x18] \n\t" //Store column 3 of C +" str q9, [x18, #16] \n\t" +" str q10, [x19] \n\t" //Store column 4 of C +" str q11, [x19, #16] \n\t" +" str q12, [x20] \n\t" //Store column 5 of C +" str q13, [x20, #16] \n\t" +" \n\t" +" dup v0.4s, wzr \n\t" +" dup v1.4s, wzr \n\t" +" dup v2.4s, wzr \n\t" +" dup v3.4s, wzr \n\t" +" dup v4.4s, wzr \n\t" +" dup v5.4s, 
wzr \n\t" +" \n\t" +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr q0, [x21] \n\t" //Load column 6 of C +" ldr q1, [x21, #16] \n\t" +" ldr q2, [x22] \n\t" //Load column 7 of C +" ldr q3, [x22, #16] \n\t" +" ldr q4, [x23] \n\t" //Load column 8 of C +" ldr q5, [x23, #16] \n\t" +" \n\t" +" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta +" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta +" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta +" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta +" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta +" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROCOLSTOREDS3: \n\t" +" \n\t" +" fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v2.4s,v22.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v3.4s,v23.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" str q0, [x21] \n\t" //Store column 6 of C +" str q1, [x21, #16] \n\t" +" str q2, [x22] \n\t" //Store column 7 of C +" str q3, [x22, #16] \n\t" +" str q4, [x23] \n\t" //Store column 8 of C +" str q5, [x23, #16] \n\t" +" \n\t" +" dup v8.4s, wzr \n\t" +" dup v9.4s, wzr \n\t" +" dup v10.4s, wzr \n\t" +" dup v11.4s, wzr \n\t" +" dup v12.4s, wzr \n\t" +" dup v13.4s, wzr \n\t" +" \n\t" +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr q8, [x24] \n\t" //Load column 9 of C +" ldr q9, [x24, #16] \n\t" +" ldr q10, [x25] \n\t" //Load column 10 of C +" ldr q11, [x25, #16] \n\t" +" ldr q12, [x26] \n\t" //Load column 11 of C +" ldr q13, [x26, #16] \n\t" +" \n\t" +" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta +" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta +" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta +" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta +" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta +" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROCOLSTOREDS4: \n\t" +" \n\t" +" prfm pldl2keep,[x3] \n\t" +" prfm pldl2keep,[x4] \n\t" +" \n\t" +" fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v10.4s,v28.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v11.4s,v29.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" str q8, [x24] \n\t" //Store column 9 of C +" str q9, [x24, #16] \n\t" +" str q10, [x25] \n\t" //Store column 10 of C +" str q11, [x25, #16] \n\t" +" str q12, [x26] \n\t" //Store column 11 of C +" str q13, [x26, #16] \n\t" " \n\t" -" st1 {v8.4s},[x2],x10 \n\t" // Store quad into c and increment by cs_c -" st1 {v9.4s},[x2],x10 \n\t" // Store quad into c+4 and increment by cs_c -" st1 {v10.4s},[x2],x10 \n\t" // Store quad into c+8 and increment by cs_c -" st1 {v11.4s},[x2],x10 \n\t" // Store quad into c+16 and increment by cs_c " \n\t" " b .SEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump). " \n\t" " \n\t" " .SGENSTORED: \n\t" // C is general-stride stored. " \n\t" -" fcmp s31,#0.0 \n\t" -" beq .BETAZEROGENSTORED \n\t" " \n\t" -" \n\t" // If beta!=0, then we can read from C. -" \n\t" // TODO: this was done fast. Rearrange to remove so many address reloads. -" ldr x2,%[caddr] \n\t" // Load address of C. 
+" dup v0.4s, wzr \n\t" +" dup v1.4s, wzr \n\t" +" dup v2.4s, wzr \n\t" +" dup v3.4s, wzr \n\t" +" dup v4.4s, wzr \n\t" +" dup v5.4s, wzr \n\t" " \n\t" -" ld1 {v8.s}[0],[x2],x14 \n\t" // Load c00 into quad and increment by rs_c. -" ld1 {v8.s}[1],[x2],x14 \n\t" // Load c01 into quad and increment by rs_c. -" ld1 {v8.s}[2],[x2],x14 \n\t" // Load c02 into quad and increment by rs_c. -" ld1 {v8.s}[3],[x2],x14 \n\t" // Load c03 into quad and increment by rs_c. +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x2 \n\t" " \n\t" -" ld1 {v9.s}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v9.s}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v9.s}[2],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v9.s}[3],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v0.s}[0],[x27],x14 \n\t" // Load c00 into quad and increment by rs_c. +" ld1 {v0.s}[1],[x27],x14 \n\t" // Load c01 into quad and increment by rs_c. +" ld1 {v0.s}[2],[x27],x14 \n\t" // Load c02 into quad and increment by rs_c. +" ld1 {v0.s}[3],[x27],x14 \n\t" // Load c03 into quad and increment by rs_c. +" ld1 {v1.s}[0],[x27],x14 \n\t" // Load c04 into quad and increment by rs_c. +" ld1 {v1.s}[1],[x27],x14 \n\t" // Load c05 into quad and increment by rs_c. +" ld1 {v1.s}[2],[x27],x14 \n\t" // Load c06 into quad and increment by rs_c. +" ld1 {v1.s}[3],[x27],x14 \n\t" // Load c07 into quad and increment by rs_c. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x16 \n\t" " \n\t" -" ld1 {v10.s}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v10.s}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v10.s}[2],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. 
-" ld1 {v10.s}[3],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v2.s}[0],[x27],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v2.s}[1],[x27],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v2.s}[2],[x27],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v2.s}[3],[x27],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v3.s}[0],[x27],x14 \n\t" // Load c14 into quad and increment by rs_c. +" ld1 {v3.s}[1],[x27],x14 \n\t" // Load c15 into quad and increment by rs_c. +" ld1 {v3.s}[2],[x27],x14 \n\t" // Load c16 into quad and increment by rs_c. +" ld1 {v3.s}[3],[x27],x14 \n\t" // Load c17 into quad and increment by rs_c. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x17 \n\t" " \n\t" -" ld1 {v11.s}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v11.s}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v11.s}[2],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v11.s}[3],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v4.s}[0],[x27],x14 \n\t" // Load c20 into quad and increment by rs_c. +" ld1 {v4.s}[1],[x27],x14 \n\t" // Load c21 into quad and increment by rs_c. +" ld1 {v4.s}[2],[x27],x14 \n\t" // Load c22 into quad and increment by rs_c. +" ld1 {v4.s}[3],[x27],x14 \n\t" // Load c23 into quad and increment by rs_c. +" ld1 {v5.s}[0],[x27],x14 \n\t" // Load c24 into quad and increment by rs_c. +" ld1 {v5.s}[1],[x27],x14 \n\t" // Load c25 into quad and increment by rs_c. +" ld1 {v5.s}[2],[x27],x14 \n\t" // Load c26 into quad and increment by rs_c. +" ld1 {v5.s}[3],[x27],x14 \n\t" // Load c27 into quad and increment by rs_c. 
" \n\t" +" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta +" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta +" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta +" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta +" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta +" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" prfm pldl1keep,[x16,0] \n\t" // Prefetch. -" prfm pldl1keep,[x17,0] \n\t" // Prefetch. +" .SBETAZEROGENSTOREDS1: \n\t" " \n\t" -" fmul v8.4s,v8.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v9.4s,v9.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v10.4s,v10.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v11.4s,v11.4s,v31.s[0] \n\t" // Scale by beta. +" fmla v0.4s, v8.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v1.4s, v9.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v2.4s,v10.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v3.4s,v11.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" .BETAZEROGENSTORED: \n\t" // If beta==0, we cannot read from C (nor scale). +" mov x27, x2 \n\t" " \n\t" -" fmla v8.4s,v12.4s,v30.s[0] \n\t" // Scale by alpha. -" fmla v9.4s,v13.4s,v30.s[0] \n\t" // Scale by alpha. -" fmla v10.4s,v14.4s,v30.s[0] \n\t" // Scale by alpha. -" fmla v11.4s,v15.4s,v30.s[0] \n\t" // Scale by alpha. +" st1 {v0.s}[0],[x27],x14 \n\t" // Store c00 into quad and increment by rs_c. +" st1 {v0.s}[1],[x27],x14 \n\t" // Store c01 into quad and increment by rs_c. +" st1 {v0.s}[2],[x27],x14 \n\t" // Store c02 into quad and increment by rs_c. +" st1 {v0.s}[3],[x27],x14 \n\t" // Store c03 into quad and increment by rs_c. +" st1 {v1.s}[0],[x27],x14 \n\t" // Store c04 into quad and increment by rs_c. +" st1 {v1.s}[1],[x27],x14 \n\t" // Store c05 into quad and increment by rs_c. +" st1 {v1.s}[2],[x27],x14 \n\t" // Store c06 into quad and increment by rs_c. +" st1 {v1.s}[3],[x27],x14 \n\t" // Store c07 into quad and increment by rs_c. 
" \n\t" +" mov x27, x16 \n\t" " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. +" st1 {v2.s}[0],[x27],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v2.s}[1],[x27],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v2.s}[2],[x27],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v2.s}[3],[x27],x14 \n\t" // Store c13 into quad and increment by rs_c. +" st1 {v3.s}[0],[x27],x14 \n\t" // Store c14 into quad and increment by rs_c. +" st1 {v3.s}[1],[x27],x14 \n\t" // Store c15 into quad and increment by rs_c. +" st1 {v3.s}[2],[x27],x14 \n\t" // Store c16 into quad and increment by rs_c. +" st1 {v3.s}[3],[x27],x14 \n\t" // Store c17 into quad and increment by rs_c. " \n\t" -" st1 {v8.s}[0],[x2],x14 \n\t" // Store c00 into quad and increment by rs_c. -" st1 {v8.s}[1],[x2],x14 \n\t" // Store c01 into quad and increment by rs_c. -" st1 {v8.s}[2],[x2],x14 \n\t" // Store c02 into quad and increment by rs_c. -" st1 {v8.s}[3],[x2],x14 \n\t" // Store c03 into quad and increment by rs_c. +" mov x27, x17 \n\t" " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. +" st1 {v4.s}[0],[x27],x14 \n\t" // Store c20 into quad and increment by rs_c. +" st1 {v4.s}[1],[x27],x14 \n\t" // Store c21 into quad and increment by rs_c. +" st1 {v4.s}[2],[x27],x14 \n\t" // Store c22 into quad and increment by rs_c. +" st1 {v4.s}[3],[x27],x14 \n\t" // Store c23 into quad and increment by rs_c. +" st1 {v5.s}[0],[x27],x14 \n\t" // Store c24 into quad and increment by rs_c. +" st1 {v5.s}[1],[x27],x14 \n\t" // Store c25 into quad and increment by rs_c. +" st1 {v5.s}[2],[x27],x14 \n\t" // Store c26 into quad and increment by rs_c. +" st1 {v5.s}[3],[x27],x14 \n\t" // Store c27 into quad and increment by rs_c. " \n\t" -" st1 {v9.s}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v9.s}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. 
-" st1 {v9.s}[2],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v9.s}[3],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" dup v8.4s, wzr \n\t" +" dup v9.4s, wzr \n\t" +" dup v10.4s, wzr \n\t" +" dup v11.4s, wzr \n\t" +" dup v12.4s, wzr \n\t" +" dup v13.4s, wzr \n\t" " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. " \n\t" -" st1 {v10.s}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v10.s}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v10.s}[2],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v10.s}[3],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" mov x27, x18 \n\t" " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" ld1 {v8.s}[0],[x27],x14 \n\t" // Load c30 into quad and increment by rs_c. +" ld1 {v8.s}[1],[x27],x14 \n\t" // Load c31 into quad and increment by rs_c. +" ld1 {v8.s}[2],[x27],x14 \n\t" // Load c32 into quad and increment by rs_c. +" ld1 {v8.s}[3],[x27],x14 \n\t" // Load c33 into quad and increment by rs_c. +" ld1 {v9.s}[0],[x27],x14 \n\t" // Load c34 into quad and increment by rs_c. +" ld1 {v9.s}[1],[x27],x14 \n\t" // Load c35 into quad and increment by rs_c. +" ld1 {v9.s}[2],[x27],x14 \n\t" // Load c36 into quad and increment by rs_c. +" ld1 {v9.s}[3],[x27],x14 \n\t" // Load c37 into quad and increment by rs_c. " \n\t" -" st1 {v11.s}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v11.s}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v11.s}[2],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v11.s}[3],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. 
+" mov x27, x19 \n\t" " \n\t" +" ld1 {v10.s}[0],[x27],x14 \n\t" // Load c40 into quad and increment by rs_c. +" ld1 {v10.s}[1],[x27],x14 \n\t" // Load c41 into quad and increment by rs_c. +" ld1 {v10.s}[2],[x27],x14 \n\t" // Load c42 into quad and increment by rs_c. +" ld1 {v10.s}[3],[x27],x14 \n\t" // Load c43 into quad and increment by rs_c. +" ld1 {v11.s}[0],[x27],x14 \n\t" // Load c44 into quad and increment by rs_c. +" ld1 {v11.s}[1],[x27],x14 \n\t" // Load c45 into quad and increment by rs_c. +" ld1 {v11.s}[2],[x27],x14 \n\t" // Load c46 into quad and increment by rs_c. +" ld1 {v11.s}[3],[x27],x14 \n\t" // Load c47 into quad and increment by rs_c. " \n\t" +" mov x27, x20 \n\t" +" \n\t" +" ld1 {v12.s}[0],[x27],x14 \n\t" // Load c50 into quad and increment by rs_c. +" ld1 {v12.s}[1],[x27],x14 \n\t" // Load c51 into quad and increment by rs_c. +" ld1 {v12.s}[2],[x27],x14 \n\t" // Load c52 into quad and increment by rs_c. +" ld1 {v12.s}[3],[x27],x14 \n\t" // Load c53 into quad and increment by rs_c. +" ld1 {v13.s}[0],[x27],x14 \n\t" // Load c54 into quad and increment by rs_c. +" ld1 {v13.s}[1],[x27],x14 \n\t" // Load c55 into quad and increment by rs_c. +" ld1 {v13.s}[2],[x27],x14 \n\t" // Load c56 into quad and increment by rs_c. +" ld1 {v13.s}[3],[x27],x14 \n\t" // Load c57 into quad and increment by rs_c. 
+" \n\t" +" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta +" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta +" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta +" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta +" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta +" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROGENSTOREDS2: \n\t" +" \n\t" +" fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v10.4s,v16.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v11.4s,v17.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x18 \n\t" +" \n\t" +" st1 {v8.s}[0],[x27],x14 \n\t" // Store c30 into quad and increment by rs_c. +" st1 {v8.s}[1],[x27],x14 \n\t" // Store c31 into quad and increment by rs_c. +" st1 {v8.s}[2],[x27],x14 \n\t" // Store c32 into quad and increment by rs_c. +" st1 {v8.s}[3],[x27],x14 \n\t" // Store c33 into quad and increment by rs_c. +" st1 {v9.s}[0],[x27],x14 \n\t" // Store c34 into quad and increment by rs_c. +" st1 {v9.s}[1],[x27],x14 \n\t" // Store c35 into quad and increment by rs_c. +" st1 {v9.s}[2],[x27],x14 \n\t" // Store c36 into quad and increment by rs_c. +" st1 {v9.s}[3],[x27],x14 \n\t" // Store c37 into quad and increment by rs_c. +" \n\t" +" mov x27, x19 \n\t" +" \n\t" +" st1 {v10.s}[0],[x27],x14 \n\t" // Store c40 into quad and increment by rs_c. +" st1 {v10.s}[1],[x27],x14 \n\t" // Store c41 into quad and increment by rs_c. +" st1 {v10.s}[2],[x27],x14 \n\t" // Store c42 into quad and increment by rs_c. +" st1 {v10.s}[3],[x27],x14 \n\t" // Store c43 into quad and increment by rs_c. +" st1 {v11.s}[0],[x27],x14 \n\t" // Store c44 into quad and increment by rs_c. +" st1 {v11.s}[1],[x27],x14 \n\t" // Store c45 into quad and increment by rs_c. +" st1 {v11.s}[2],[x27],x14 \n\t" // Store c46 into quad and increment by rs_c. 
+" st1 {v11.s}[3],[x27],x14 \n\t" // Store c47 into quad and increment by rs_c. +" \n\t" +" mov x27, x20 \n\t" +" \n\t" +" st1 {v12.s}[0],[x27],x14 \n\t" // Store c50 into quad and increment by rs_c. +" st1 {v12.s}[1],[x27],x14 \n\t" // Store c51 into quad and increment by rs_c. +" st1 {v12.s}[2],[x27],x14 \n\t" // Store c52 into quad and increment by rs_c. +" st1 {v12.s}[3],[x27],x14 \n\t" // Store c53 into quad and increment by rs_c. +" st1 {v13.s}[0],[x27],x14 \n\t" // Store c54 into quad and increment by rs_c. +" st1 {v13.s}[1],[x27],x14 \n\t" // Store c55 into quad and increment by rs_c. +" st1 {v13.s}[2],[x27],x14 \n\t" // Store c56 into quad and increment by rs_c. +" st1 {v13.s}[3],[x27],x14 \n\t" // Store c57 into quad and increment by rs_c. +" \n\t" +" dup v0.4s, wzr \n\t" +" dup v1.4s, wzr \n\t" +" dup v2.4s, wzr \n\t" +" dup v3.4s, wzr \n\t" +" dup v4.4s, wzr \n\t" +" dup v5.4s, wzr \n\t" +" \n\t" +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. +" \n\t" +" mov x27, x21 \n\t" +" \n\t" +" ld1 {v0.s}[0],[x27],x14 \n\t" // Load c60 into quad and increment by rs_c. +" ld1 {v0.s}[1],[x27],x14 \n\t" // Load c61 into quad and increment by rs_c. +" ld1 {v0.s}[2],[x27],x14 \n\t" // Load c62 into quad and increment by rs_c. +" ld1 {v0.s}[3],[x27],x14 \n\t" // Load c63 into quad and increment by rs_c. +" ld1 {v1.s}[0],[x27],x14 \n\t" // Load c64 into quad and increment by rs_c. +" ld1 {v1.s}[1],[x27],x14 \n\t" // Load c65 into quad and increment by rs_c. +" ld1 {v1.s}[2],[x27],x14 \n\t" // Load c66 into quad and increment by rs_c. +" ld1 {v1.s}[3],[x27],x14 \n\t" // Load c67 into quad and increment by rs_c. +" \n\t" +" mov x27, x22 \n\t" +" \n\t" +" ld1 {v2.s}[0],[x27],x14 \n\t" // Load c70 into quad and increment by rs_c. +" ld1 {v2.s}[1],[x27],x14 \n\t" // Load c71 into quad and increment by rs_c. +" ld1 {v2.s}[2],[x27],x14 \n\t" // Load c72 into quad and increment by rs_c. 
+" ld1 {v2.s}[3],[x27],x14 \n\t" // Load c73 into quad and increment by rs_c. +" ld1 {v3.s}[0],[x27],x14 \n\t" // Load c74 into quad and increment by rs_c. +" ld1 {v3.s}[1],[x27],x14 \n\t" // Load c75 into quad and increment by rs_c. +" ld1 {v3.s}[2],[x27],x14 \n\t" // Load c76 into quad and increment by rs_c. +" ld1 {v3.s}[3],[x27],x14 \n\t" // Load c77 into quad and increment by rs_c. +" \n\t" +" mov x27, x23 \n\t" +" \n\t" +" ld1 {v4.s}[0],[x27],x14 \n\t" // Load c80 into quad and increment by rs_c. +" ld1 {v4.s}[1],[x27],x14 \n\t" // Load c81 into quad and increment by rs_c. +" ld1 {v4.s}[2],[x27],x14 \n\t" // Load c82 into quad and increment by rs_c. +" ld1 {v4.s}[3],[x27],x14 \n\t" // Load c83 into quad and increment by rs_c. +" ld1 {v5.s}[0],[x27],x14 \n\t" // Load c84 into quad and increment by rs_c. +" ld1 {v5.s}[1],[x27],x14 \n\t" // Load c85 into quad and increment by rs_c. +" ld1 {v5.s}[2],[x27],x14 \n\t" // Load c86 into quad and increment by rs_c. +" ld1 {v5.s}[3],[x27],x14 \n\t" // Load c87 into quad and increment by rs_c. +" \n\t" +" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta +" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta +" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta +" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta +" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta +" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROGENSTOREDS3: \n\t" +" \n\t" +" fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v2.4s,v22.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v3.4s,v23.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x21 \n\t" +" \n\t" +" st1 {v0.s}[0],[x27],x14 \n\t" // Store c60 into quad and increment by rs_c. +" st1 {v0.s}[1],[x27],x14 \n\t" // Store c61 into quad and increment by rs_c. 
+" st1 {v0.s}[2],[x27],x14 \n\t" // Store c62 into quad and increment by rs_c. +" st1 {v0.s}[3],[x27],x14 \n\t" // Store c63 into quad and increment by rs_c. +" st1 {v1.s}[0],[x27],x14 \n\t" // Store c64 into quad and increment by rs_c. +" st1 {v1.s}[1],[x27],x14 \n\t" // Store c65 into quad and increment by rs_c. +" st1 {v1.s}[2],[x27],x14 \n\t" // Store c66 into quad and increment by rs_c. +" st1 {v1.s}[3],[x27],x14 \n\t" // Store c67 into quad and increment by rs_c. +" \n\t" +" mov x27, x22 \n\t" +" \n\t" +" st1 {v2.s}[0],[x27],x14 \n\t" // Store c70 into quad and increment by rs_c. +" st1 {v2.s}[1],[x27],x14 \n\t" // Store c71 into quad and increment by rs_c. +" st1 {v2.s}[2],[x27],x14 \n\t" // Store c72 into quad and increment by rs_c. +" st1 {v2.s}[3],[x27],x14 \n\t" // Store c73 into quad and increment by rs_c. +" st1 {v3.s}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. +" st1 {v3.s}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. +" st1 {v3.s}[2],[x27],x14 \n\t" // Store c76 into quad and increment by rs_c. +" st1 {v3.s}[3],[x27],x14 \n\t" // Store c77 into quad and increment by rs_c. +" \n\t" +" mov x27, x23 \n\t" +" \n\t" +" st1 {v4.s}[0],[x27],x14 \n\t" // Store c80 into quad and increment by rs_c. +" st1 {v4.s}[1],[x27],x14 \n\t" // Store c81 into quad and increment by rs_c. +" st1 {v4.s}[2],[x27],x14 \n\t" // Store c82 into quad and increment by rs_c. +" st1 {v4.s}[3],[x27],x14 \n\t" // Store c83 into quad and increment by rs_c. +" st1 {v5.s}[0],[x27],x14 \n\t" // Store c84 into quad and increment by rs_c. +" st1 {v5.s}[1],[x27],x14 \n\t" // Store c85 into quad and increment by rs_c. +" st1 {v5.s}[2],[x27],x14 \n\t" // Store c86 into quad and increment by rs_c. +" st1 {v5.s}[3],[x27],x14 \n\t" // Store c87 into quad and increment by rs_c. 
+" \n\t" +" dup v8.4s, wzr \n\t" +" dup v9.4s, wzr \n\t" +" dup v10.4s, wzr \n\t" +" dup v11.4s, wzr \n\t" +" dup v12.4s, wzr \n\t" +" dup v13.4s, wzr \n\t" +" \n\t" +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. +" \n\t" +" mov x27, x24 \n\t" +" \n\t" +" ld1 {v8.s}[0],[x27],x14 \n\t" // Load c90 into quad and increment by rs_c. +" ld1 {v8.s}[1],[x27],x14 \n\t" // Load c91 into quad and increment by rs_c. +" ld1 {v8.s}[2],[x27],x14 \n\t" // Load c92 into quad and increment by rs_c. +" ld1 {v8.s}[3],[x27],x14 \n\t" // Load c93 into quad and increment by rs_c. +" ld1 {v9.s}[0],[x27],x14 \n\t" // Load c94 into quad and increment by rs_c. +" ld1 {v9.s}[1],[x27],x14 \n\t" // Load c95 into quad and increment by rs_c. +" ld1 {v9.s}[2],[x27],x14 \n\t" // Load c96 into quad and increment by rs_c. +" ld1 {v9.s}[3],[x27],x14 \n\t" // Load c97 into quad and increment by rs_c. +" \n\t" +" mov x27, x25 \n\t" +" \n\t" +" ld1 {v10.s}[0],[x27],x14 \n\t" // Load c100 into quad and increment by rs_c. +" ld1 {v10.s}[1],[x27],x14 \n\t" // Load c101 into quad and increment by rs_c. +" ld1 {v10.s}[2],[x27],x14 \n\t" // Load c102 into quad and increment by rs_c. +" ld1 {v10.s}[3],[x27],x14 \n\t" // Load c103 into quad and increment by rs_c. +" ld1 {v11.s}[0],[x27],x14 \n\t" // Load c104 into quad and increment by rs_c. +" ld1 {v11.s}[1],[x27],x14 \n\t" // Load c105 into quad and increment by rs_c. +" ld1 {v11.s}[2],[x27],x14 \n\t" // Load c106 into quad and increment by rs_c. +" ld1 {v11.s}[3],[x27],x14 \n\t" // Load c107 into quad and increment by rs_c. +" \n\t" +" mov x27, x26 \n\t" +" \n\t" +" ld1 {v12.s}[0],[x27],x14 \n\t" // Load c110 into quad and increment by rs_c. +" ld1 {v12.s}[1],[x27],x14 \n\t" // Load c111 into quad and increment by rs_c. +" ld1 {v12.s}[2],[x27],x14 \n\t" // Load c112 into quad and increment by rs_c. +" ld1 {v12.s}[3],[x27],x14 \n\t" // Load c113 into quad and increment by rs_c. 
+" ld1 {v13.s}[0],[x27],x14 \n\t" // Load c114 into quad and increment by rs_c. +" ld1 {v13.s}[1],[x27],x14 \n\t" // Load c115 into quad and increment by rs_c. +" ld1 {v13.s}[2],[x27],x14 \n\t" // Load c116 into quad and increment by rs_c. +" ld1 {v13.s}[3],[x27],x14 \n\t" // Load c117 into quad and increment by rs_c. +" \n\t" +" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta +" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta +" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta +" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta +" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta +" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROGENSTOREDS4: \n\t" +" \n\t" +" prfm pldl2keep,[x3] \n\t" +" prfm pldl2keep,[x4] \n\t" +" \n\t" +" fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v10.4s,v28.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v11.4s,v29.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x24 \n\t" +" \n\t" +" st1 {v8.s}[0],[x27],x14 \n\t" // Store c90 into quad and increment by rs_c. +" st1 {v8.s}[1],[x27],x14 \n\t" // Store c91 into quad and increment by rs_c. +" st1 {v8.s}[2],[x27],x14 \n\t" // Store c92 into quad and increment by rs_c. +" st1 {v8.s}[3],[x27],x14 \n\t" // Store c93 into quad and increment by rs_c. +" st1 {v9.s}[0],[x27],x14 \n\t" // Store c94 into quad and increment by rs_c. +" st1 {v9.s}[1],[x27],x14 \n\t" // Store c95 into quad and increment by rs_c. +" st1 {v9.s}[2],[x27],x14 \n\t" // Store c96 into quad and increment by rs_c. +" st1 {v9.s}[3],[x27],x14 \n\t" // Store c97 into quad and increment by rs_c. +" \n\t" +" mov x27, x25 \n\t" +" \n\t" +" st1 {v10.s}[0],[x27],x14 \n\t" // Store c100 into quad and increment by rs_c. +" st1 {v10.s}[1],[x27],x14 \n\t" // Store c101 into quad and increment by rs_c. 
+" st1 {v10.s}[2],[x27],x14 \n\t" // Store c102 into quad and increment by rs_c. +" st1 {v10.s}[3],[x27],x14 \n\t" // Store c103 into quad and increment by rs_c. +" st1 {v11.s}[0],[x27],x14 \n\t" // Store c104 into quad and increment by rs_c. +" st1 {v11.s}[1],[x27],x14 \n\t" // Store c105 into quad and increment by rs_c. +" st1 {v11.s}[2],[x27],x14 \n\t" // Store c106 into quad and increment by rs_c. +" st1 {v11.s}[3],[x27],x14 \n\t" // Store c107 into quad and increment by rs_c. +" \n\t" +" mov x27, x26 \n\t" +" \n\t" +" st1 {v12.s}[0],[x27],x14 \n\t" // Store c110 into quad and increment by rs_c. +" st1 {v12.s}[1],[x27],x14 \n\t" // Store c111 into quad and increment by rs_c. +" st1 {v12.s}[2],[x27],x14 \n\t" // Store c112 into quad and increment by rs_c. +" st1 {v12.s}[3],[x27],x14 \n\t" // Store c113 into quad and increment by rs_c. +" st1 {v13.s}[0],[x27],x14 \n\t" // Store c114 into quad and increment by rs_c. +" st1 {v13.s}[1],[x27],x14 \n\t" // Store c115 into quad and increment by rs_c. +" st1 {v13.s}[2],[x27],x14 \n\t" // Store c116 into quad and increment by rs_c. +" st1 {v13.s}[3],[x27],x14 \n\t" // Store c117 into quad and increment by rs_c. " \n\t" " .SEND: \n\t" // Done! " \n\t" @@ -410,10 +1060,13 @@ __asm__ volatile [b_next] "m" (b_next), // 10 [k] "m" (k) // 11 :// Register clobber list - "x0", "x1", "x2", "x4", + "x0", "x1", "x2","x3","x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12", - "x13","x14","x20", + "x13","x14","x15", + "x16","x17","x18","x19", + "x20","x21","x22","x23", + "x24","x25","x26","x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11", @@ -421,17 +1074,32 @@ __asm__ volatile "v16","v17","v18","v19", "v20","v21","v22","v23", "v24","v25","v26","v27", - "v30","v31" + "v28","v29","v30","v31" ); } /* + o 4x4 Double precision micro-kernel NOT fully functional yet. + o Runnable on ARMv8, compiled with aarch64 GCC. + o Use it together with the armv8 BLIS configuration. + o Tested on Juno board.
Around 3 GFLOPS @ 1.1 GHz. + + December 2014. + + * UPDATE OCTOBER 2015: Now is fully functional. * Tested on Juno board. Around 5.6 GFLOPS, 2 A57 cores @ 1.1 GHz. * Tested on Juno board. Around 4 GFLOPS, 4 A53 cores @ 850 MHz. + + * UPDATE NOVEMBER 2015 + * Micro-kernel changed to 6x8 + * Tested on Juno Board. Around 4 GFLOPS, 1 x A57 core @ 1.1 GHz. + * Tested on Juno Board. Around 7.6 GFLOPS, 2 x A57 cores @ 1.1 GHz. + * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core @ 850 MHz. + * Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz. */ -void bli_dgemm_opt_4x4( +void bli_dgemm_opt_6x8( dim_t k, double* restrict alpha, double* restrict a, @@ -444,8 +1112,8 @@ void bli_dgemm_opt_4x4( void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 2; - dim_t k_left = k % 2; + dim_t k_iter = k / 4; + dim_t k_left = k % 4; __asm__ volatile ( @@ -454,10 +1122,8 @@ __asm__ volatile " ldr x1,%[baddr] \n\t" // Load address of B " ldr x2,%[caddr] \n\t" // Load address of C " \n\t" -" mov x4,#0 \n\t" // Init loop counter (i=0) -" \n\t" -" ldr x16,%[a_next] \n\t" // Move pointer -" ldr x17,%[b_next] \n\t" // Move pointer +" ldr x3,%[a_next] \n\t" // Move pointer +" ldr x4,%[b_next] \n\t" // Move pointer " \n\t" " ldr x5,%[k_iter] \n\t" // Init guard (k_iter) " ldr x6,%[k_left] \n\t" // Init guard (k_iter) @@ -467,123 +1133,414 @@ __asm__ volatile " \n\t" " ldr x9,%[cs_c] \n\t" // Load cs_c " lsl x10,x9,#3 \n\t" // cs_c * sizeof(double) -" lsl x11,x9,#4 \n\t" // 2 * cs_c * sizeof(double) -- AUX. -" lsl x12,x9,#5 \n\t" // 3 * cs_c * sizeof(double) -- AUX. " \n\t" " ldr x13,%[rs_c] \n\t" // Load rs_c. " lsl x14,x13,#3 \n\t" // rs_c * sizeof(double). " \n\t" -" prfm pldl1keep,[x2,0] \n\t" // Prefetch c. -" prfm pldl1keep,[x2,x10] \n\t" // Prefetch c. -" prfm pldl1keep,[x2,x11] \n\t" // Prefetch c. -" prfm pldl1keep,[x2,x12] \n\t" // Prefetch c. 
+" add x20,x2,x10 \n\t" //Load address Column 1 of C +" add x21,x20,x10 \n\t" //Load address Column 2 of C +" add x22,x21,x10 \n\t" //Load address Column 3 of C +" add x23,x22,x10 \n\t" //Load address Column 4 of C +" add x24,x23,x10 \n\t" //Load address Column 5 of C +" add x25,x24,x10 \n\t" //Load address Column 6 of C +" add x26,x25,x10 \n\t" //Load address Column 7 of C " \n\t" -" movi v12.2d,#0 \n\t" // Vector for accummulating column 0 -" movi v13.2d,#0 \n\t" // Vector for accummulating column 0 -" movi v14.2d,#0 \n\t" // Vector for accummulating column 1 -" movi v15.2d,#0 \n\t" // Vector for accummulating column 1 -" movi v16.2d,#0 \n\t" // Vector for accummulating column 2 -" movi v17.2d,#0 \n\t" // Vector for accummulating column 2 -" movi v18.2d,#0 \n\t" // Vector for accummulating column 3 -" movi v19.2d,#0 \n\t" // Vector for accummulating column 3 +" prfm pldl1keep,[x2] \n\t" // Prefetch c. +" prfm pldl1keep,[x20] \n\t" // Prefetch c. +" prfm pldl1keep,[x21] \n\t" // Prefetch c. +" prfm pldl1keep,[x22] \n\t" // Prefetch c. +" prfm pldl1keep,[x23] \n\t" // Prefetch c. +" prfm pldl1keep,[x24] \n\t" // Prefetch c. +" prfm pldl1keep,[x25] \n\t" // Prefetch c. +" prfm pldl1keep,[x26] \n\t" // Prefetch c. 
" \n\t" -" movi v20.2d,#0 \n\t" // Vector for accummulating column 0 -" movi v21.2d,#0 \n\t" // Vector for accummulating column 0 -" movi v22.2d,#0 \n\t" // Vector for accummulating column 1 -" movi v23.2d,#0 \n\t" // Vector for accummulating column 1 -" movi v24.2d,#0 \n\t" // Vector for accummulating column 2 -" movi v25.2d,#0 \n\t" // Vector for accummulating column 2 -" movi v26.2d,#0 \n\t" // Vector for accummulating column 3 -" movi v27.2d,#0 \n\t" // Vector for accummulating column 3 +" ldr q0, [x0] \n\t" +" ldr q1, [x0, #16] \n\t" // Load a +" ldr q2, [x0, #32] \n\t" " \n\t" -" ld1r {v31.2d},[x8] \n\t" // Load beta +" ldr q3, [x1] \n\t" // Load b +" ldr q4, [x1, #16] \n\t" +" ldr q5, [x1, #32] \n\t" +" ldr q6, [x1, #48] \n\t" +" \n\t" +" dup v8.2d, xzr \n\t" // Vector for accummulating column 0 +" prfm PLDL1KEEP, [x1, #256] \n\t" +" dup v9.2d, xzr \n\t" // Vector for accummulating column 0 +" prfm PLDL1KEEP, [x1, #320] \n\t" +" dup v10.2d, xzr \n\t" // Vector for accummulating column 0 +" prfm PLDL1KEEP, [x1, #384] \n\t" +" dup v11.2d, xzr \n\t" // Vector for accummulating column 1 +" prfm PLDL1KEEP, [x1, #448] \n\t" +" dup v12.2d, xzr \n\t" // Vector for accummulating column 1 +" dup v13.2d, xzr \n\t" // Vector for accummulating column 1 +" \n\t" +" dup v14.2d, xzr \n\t" // Vector for accummulating column 2 +" prfm PLDL1KEEP, [x0, #192] \n\t" +" dup v15.2d, xzr \n\t" // Vector for accummulating column 2 +" prfm PLDL1KEEP, [x0, #256] \n\t" +" dup v16.2d, xzr \n\t" // Vector for accummulating column 2 +" prfm PLDL1KEEP, [x0, #320] \n\t" +" dup v17.2d, xzr \n\t" // Vector for accummulating column 3 +" dup v18.2d, xzr \n\t" // Vector for accummulating column 3 +" dup v19.2d, xzr \n\t" // Vector for accummulating column 3 +" \n\t" +" dup v20.2d, xzr \n\t" // Vector for accummulating column 4 +" dup v21.2d, xzr \n\t" // Vector for accummulating column 4 +" dup v22.2d, xzr \n\t" // Vector for accummulating column 4 +" dup v23.2d, xzr \n\t" // Vector for 
accummulating column 5 +" dup v24.2d, xzr \n\t" // Vector for accummulating column 5 +" dup v25.2d, xzr \n\t" // Vector for accummulating column 5 +" \n\t" +" dup v26.2d, xzr \n\t" // Vector for accummulating column 6 +" dup v27.2d, xzr \n\t" // Vector for accummulating column 6 +" dup v28.2d, xzr \n\t" // Vector for accummulating column 6 +" dup v29.2d, xzr \n\t" // Vector for accummulating column 7 +" dup v30.2d, xzr \n\t" // Vector for accummulating column 7 +" dup v31.2d, xzr \n\t" // Vector for accummulating column 7 " \n\t" " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. " beq .DCONSIDERKLEFT \n\t" " \n\t" -" ldp q0,q1,[x0],32 \n\t" // Load a -" ldp q4,q5,[x1],32 \n\t" // Load b +"add x0, x0, #48 \n\t" //update address of A +"add x1, x1, #64 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. " beq .DLASTITER \n\t" // (as loop is do-while-like). -" \n\t" " \n\t" " DLOOP: \n\t" // Body " \n\t" -" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch. -" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch. 
+" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x1, #448] \n\t" //512-64=448 +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x1, #512] \n\t" +" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x1, #576] \n\t" " \n\t" -" ldp q6,q7,[x1],32 \n\t" // Load b+4 into quad +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate " \n\t" -" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate +" ldr q3, [x1] \n\t" " \n\t" -" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate +" ldr q7, [x0, #32] \n\t" " \n\t" -" ldp q2,q3,[x0],32 \n\t" // Load a+4 into quad +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate +" ldr q4, [x1, #16] \n\t" " \n\t" -" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #32] \n\t" " \n\t" -" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0] \n\t" " \n\t" -" ldp q0,q1,[x0],32 \n\t" // Load a into quad +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #16] \n\t" " \n\t" -" fmla 
v12.2d,v2.2d,v6.d[0] \n\t" // Accummulate -" fmla v14.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #48] \n\t" +" \n\t" // End it 1 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x1, #640] \n\t" +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x0, #336] \n\t" +" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x0, #400] \n\t" " \n\t" -" fmla v13.2d,v3.2d,v6.d[0] \n\t" // Accummulate -" fmla v15.2d,v3.2d,v6.d[1] \n\t" // Accummulate +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate " \n\t" -" ldp q4,q5,[x1],32 \n\t" // Load b into quad +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate +" ldr q3, [x1, #64] \n\t" " \n\t" -" fmla v16.2d,v2.2d,v7.d[0] \n\t" // Accummulate -" fmla v18.2d,v2.2d,v7.d[1] \n\t" // Accummulate +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate +" ldr q2, [x0, #80] \n\t" " \n\t" -" fmla v17.2d,v3.2d,v7.d[0] \n\t" // Accummulate -" fmla v19.2d,v3.2d,v7.d[1] \n\t" // Accummulate +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate +" ldr q4, [x1, #80] \n\t" " \n\t" -" prfm pldl1keep,[x0,#64] \n\t" // Prefetch. -" prfm pldl1keep,[x1,#64] \n\t" // Prefetch. 
+" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #96] \n\t" +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0, #48] \n\t" +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #64] \n\t" +" \n\t" +" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #112] \n\t" +" \n\t" //End it 2 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x0, #464] \n\t" +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate +" \n\t" +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate +" \n\t" +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate +" ldr q3, [x1, #128] \n\t" +" \n\t" +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate +" ldr q7, [x0, #128] \n\t" +" \n\t" +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate +" ldr q4, [x1, #144] \n\t" +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #160] \n\t" +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0, #96] \n\t" +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #112] \n\t" +" \n\t" +" fmla 
v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #176] \n\t" +" \n\t" // End it 3 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate +" \n\t" +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate +" ldr q3, [x1, #192] \n\t" +" \n\t" +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate +" ldr q2, [x0, #176] \n\t" +" \n\t" +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate +" ldr q4, [x1, #208] \n\t" +" \n\t" +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #224] \n\t" +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0, #144] \n\t" +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #160] \n\t" +" \n\t" +" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #240] \n\t" +" \n\t" //End it 4 +" add x0, x0, #192 \n\t" +" add x1, x1, #256 \n\t" " \n\t" " sub x5,x5,1 \n\t" // i-=1 " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. 
-"bne DLOOP \n\t" +" bne DLOOP \n\t" " \n\t" ".DLASTITER: \n\t" " \n\t" -" ldp q6,q7,[x1],32 \n\t" // Load b+4 into quad +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate " \n\t" -" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate +" ldr q3, [x1] \n\t" " \n\t" -" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate +" ldr q7, [x0, #32] \n\t" " \n\t" -" ldp q2,q3,[x0],32 \n\t" // Load a+4 into quad +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate +" ldr q4, [x1, #16] \n\t" " \n\t" -" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate " \n\t" -" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #32] \n\t" " \n\t" -" ld1r {v30.2d},[x7] \n\t" // Load alpha. 
+" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0] \n\t" " \n\t" -" fmla v12.2d,v2.2d,v6.d[0] \n\t" // Accummulate -" fmla v14.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #16] \n\t" " \n\t" -" fmla v13.2d,v3.2d,v6.d[0] \n\t" // Accummulate -" fmla v15.2d,v3.2d,v6.d[1] \n\t" // Accummulate +" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #48] \n\t" +" \n\t" // End it 1 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate " \n\t" -" fmla v16.2d,v2.2d,v7.d[0] \n\t" // Accummulate -" fmla v18.2d,v2.2d,v7.d[1] \n\t" // Accummulate +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate +" ldr q3, [x1, #64] \n\t" " \n\t" -" fmla v17.2d,v3.2d,v7.d[0] \n\t" // Accummulate -" fmla v19.2d,v3.2d,v7.d[1] \n\t" // Accummulate +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate +" ldr q2, [x0, #80] \n\t" +" \n\t" +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate +" ldr q4, [x1, #80] \n\t" +" \n\t" +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #96] \n\t" +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0, #48] \n\t" 
+" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #64] \n\t" +" \n\t" +" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #112] \n\t" +" \n\t" //End it 2 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate +" \n\t" +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate +" ldr q3, [x1, #128] \n\t" +" \n\t" +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate +" ldr q7, [x0, #128] \n\t" +" \n\t" +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate +" ldr q4, [x1, #144] \n\t" +" \n\t" +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #160] \n\t" +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0, #96] \n\t" +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #112] \n\t" +" \n\t" +" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #176] \n\t" +" \n\t" // End it 3 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate +" \n\t" +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla 
v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate +" \n\t" +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate +" \n\t" +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate +" \n\t" +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" add x1, x1, #192 \n\t" +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate +" \n\t" //End it 4 +" add x0, x0, #144 \n\t" " \n\t" " .DCONSIDERKLEFT: \n\t" " cmp x6,0 \n\t" // If k_left == 0, we are done. @@ -591,182 +1548,488 @@ __asm__ volatile " \n\t" ".DLOOPKLEFT: \n\t" " \n\t" -" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch. -" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch. 
+" ldr q0, [x0],#16 \n\t" +" ldr q1, [x0],#16 \n\t" // Load a +" ldr q2, [x0],#16 \n\t" " \n\t" -" ldp q0,q1,[x0],32 \n\t" // Load a into quad -" ldp q4,q5,[x1],32 \n\t" // Load b into quad +" ldr q3, [x1],#16 \n\t" // Load b +" ldr q4, [x1],#16 \n\t" +" ldr q5, [x1],#16 \n\t" +" ldr q6, [x1],#16 \n\t" " \n\t" -//" sub x6,x6,1 \n\t" +" sub x6,x6,1 \n\t" " \n\t" -" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate " \n\t" -" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate " \n\t" -" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate " \n\t" -" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate " \n\t" -//" cmp x6,0 \n\t" // Iterate again. -//" bne .DLOOPKLEFT \n\t" // if i!=0. 
+" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" cmp x6,0 \n\t" // Iterate again. +" bne .DLOOPKLEFT \n\t" // if i!=0. " \n\t" " .DPOSTACCUM: \n\t" -" ld1r {v30.2d},[x7] \n\t" // Load alpha. +" \n\t" +" ld1r {v6.2d},[x7] \n\t" // Load alpha. +" ld1r {v7.2d},[x8] \n\t" // Load beta " \n\t" " cmp x13,#1 \n\t" // If rs_c != 1 (column-major) " bne .DGENSTORED \n\t" " \n\t" " .DCOLSTORED: \n\t" // C is column-major. -" fcmp d31,#0.0 \n\t" -" beq .DBETAZEROCOLSTORED \n\t" // Taking care of the beta==0 case. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. +" dup v0.2d, xzr \n\t" +" dup v1.2d, xzr \n\t" +" dup v2.2d, xzr \n\t" +" dup v3.2d, xzr \n\t" +" dup v4.2d, xzr \n\t" +" dup v5.2d, xzr \n\t" " \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. 
" \n\t" -" ldp q0,q1,[x2] \n\t" // Load c into quad and increment by cs_c -" add x2,x2,x10 \n\t" -" ldp q2,q3,[x2] \n\t" // Load c into quad and increment by cs_c -" add x2,x2,x10 \n\t" -" ldp q4,q5,[x2] \n\t" // Load c into quad and increment by cs_c -" add x2,x2,x10 \n\t" -" ldp q6,q7,[x2] \n\t" // Load c into quad and increment by cs_c +" ldr q0, [x2] \n\t" //Load column 0 of C +" ldr q1, [x2, #16] \n\t" +" ldr q2, [x2, #32] \n\t" " \n\t" +" ldr q3, [x20] \n\t" //Load column 1 of C +" ldr q4, [x20, #16] \n\t" +" ldr q5, [x20, #32] \n\t" " \n\t" -" fmul v20.2d,v0.2d,v31.d[0] \n\t" // Scale by beta -" fmul v21.2d,v1.2d,v31.d[0] \n\t" // Scale by beta -" fmul v22.2d,v2.2d,v31.d[0] \n\t" // Scale by beta -" fmul v23.2d,v3.2d,v31.d[0] \n\t" // Scale by beta -" fmul v24.2d,v4.2d,v31.d[0] \n\t" // Scale by beta -" fmul v25.2d,v5.2d,v31.d[0] \n\t" // Scale by beta -" fmul v26.2d,v6.2d,v31.d[0] \n\t" // Scale by beta -" fmul v27.2d,v7.2d,v31.d[0] \n\t" // Scale by beta +" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta +" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta +" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta +" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta +" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta +" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" prfm pldl2keep,[x16] \n\t" -" prfm pldl2keep,[x17] \n\t" +" .DBETAZEROCOLSTOREDS1: \n\t" " \n\t" -" .DBETAZEROCOLSTORED: \n\t" // If beta==0, we won't read from C (nor scale). 
+" fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v2.2d,v10.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v3.2d,v11.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C +" str q0, [x2] \n\t" //Store column 0 of C +" str q1, [x2, #16] \n\t" +" str q2, [x2, #32] \n\t" " \n\t" -" fmla v20.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v21.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v22.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v23.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v24.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v25.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v26.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v27.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha +" str q3, [x20] \n\t" //Store column 1 of C +" str q4, [x20, #16] \n\t" +" str q5, [x20, #32] \n\t" " \n\t" -" stp q20,q21,[x2] \n\t" // Store quad into c and increment by cs_c -" add x2,x2,x10 \n\t" -" stp q22,q23,[x2] \n\t" // Store quad into c+4 and increment by cs_c -" add x2,x2,x10 \n\t" -" stp q24,q25,[x2] \n\t" // Store quad into c+8 and increment by cs_c -" add x2,x2,x10 \n\t" -" stp q26,q27,[x2] \n\t" // Store quad into c+16 and increment by cs_c +" dup v8.2d, xzr \n\t" +" dup v9.2d, xzr \n\t" +" dup v10.2d, xzr \n\t" +" dup v11.2d, xzr \n\t" +" dup v12.2d, xzr \n\t" +" dup v13.2d, xzr \n\t" " \n\t" -" b .DEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump). +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr q8, [x21] \n\t" //Load column 2 of C +" ldr q9, [x21, #16] \n\t" +" ldr q10, [x21, #32] \n\t" +" \n\t" +" ldr q11, [x22] \n\t" //Load column 3 of C +" ldr q12, [x22, #16] \n\t" +" ldr q13, [x22, #32] \n\t" +" \n\t" +" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta +" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta +" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta +" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta +" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta +" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS2: \n\t" +" \n\t" +" fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v10.2d,v16.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v11.2d,v17.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" str q8, [x21] \n\t" //Store column 2 of C +" str q9, [x21, #16] \n\t" +" str q10, [x21, #32] \n\t" +" \n\t" +" str q11, [x22] \n\t" //Store column 3 of C +" str q12, [x22, #16] \n\t" +" str q13, [x22, #32] \n\t" +" \n\t" +" dup v0.2d, xzr \n\t" +" dup v1.2d, xzr \n\t" +" dup v2.2d, xzr \n\t" +" dup v3.2d, xzr \n\t" +" dup v4.2d, xzr \n\t" +" dup v5.2d, xzr \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr q0, [x23] \n\t" //Load column 4 of C +" ldr q1, [x23, #16] \n\t" +" ldr q2, [x23, #32] \n\t" +" \n\t" +" ldr q3, [x24] \n\t" //Load column 5 of C +" ldr q4, [x24, #16] \n\t" +" ldr q5, [x24, #32] \n\t" +" \n\t" +" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta +" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta +" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta +" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta +" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta +" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS3: \n\t" +" \n\t" +" fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v2.2d,v22.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v3.2d,v23.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" str q0, [x23] \n\t" //Store column 4 of C +" str q1, [x23, #16] \n\t" +" str q2, [x23, #32] \n\t" +" \n\t" +" str q3, [x24] \n\t" //Store column 5 of C +" str q4, [x24, #16] \n\t" +" str q5, [x24, #32] \n\t" +" \n\t" +" dup v8.2d, xzr \n\t" +" dup v9.2d, xzr \n\t" +" dup v10.2d, xzr \n\t" +" dup v11.2d, xzr \n\t" +" dup v12.2d, xzr \n\t" +" dup v13.2d, xzr \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr q8, [x25] \n\t" //Load column 6 of C +" ldr q9, [x25, #16] \n\t" +" ldr q10, [x25, #32] \n\t" +" \n\t" +" ldr q11, [x26] \n\t" //Load column 7 of C +" ldr q12, [x26, #16] \n\t" +" ldr q13, [x26, #32] \n\t" +" \n\t" +" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta +" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta +" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta +" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta +" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta +" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS4: \n\t" +" \n\t" +" prfm pldl2keep,[x3] \n\t" +" prfm pldl2keep,[x4] \n\t" +" \n\t" +" fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v10.2d,v28.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v11.2d,v29.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" str q8, [x25] \n\t" //Store column 6 of C +" str q9, [x25, #16] \n\t" +" str q10, [x25, #32] \n\t" +" \n\t" +" str q11, [x26] \n\t" //Store column 7 of C +" str q12, [x26, #16] \n\t" +" str q13, [x26, #32] \n\t" +" \n\t" +" b .DEND \n\t" " \n\t" " .DGENSTORED: \n\t" // C is general-stride stored. " \n\t" -" fcmp d31,#0.0 \n\t" -" beq .DBETAZEROGENSTORED \n\t" -" \n\t" // If beta!=0, then we can read from C. -" \n\t" // TODO: this was done fast. Rearrange to remove so many address reloads. -" ldr x2,%[caddr] \n\t" // Load address of C. +" dup v0.2d, xzr \n\t" +" dup v1.2d, xzr \n\t" +" dup v2.2d, xzr \n\t" +" dup v3.2d, xzr \n\t" +" dup v4.2d, xzr \n\t" +" dup v5.2d, xzr \n\t" " \n\t" -" ld1 {v0.d}[0],[x2],x14 \n\t" // Load c00 into quad and increment by rs_c. -" ld1 {v0.d}[1],[x2],x14 \n\t" // Load c01 into quad and increment by rs_c. -" ld1 {v1.d}[0],[x2],x14 \n\t" // Load c02 into quad and increment by rs_c. 
-" ld1 {v1.d}[1],[x2],x14 \n\t" // Load c03 into quad and increment by rs_c. +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x2 \n\t" +" \n\t" // Load address of C. +" ld1 {v0.d}[0],[x27],x14 \n\t" // Load c00 into quad and increment by rs_c. +" ld1 {v0.d}[1],[x27],x14 \n\t" // Load c01 into quad and increment by rs_c. +" ld1 {v1.d}[0],[x27],x14 \n\t" // Load c02 into quad and increment by rs_c. +" ld1 {v1.d}[1],[x27],x14 \n\t" // Load c03 into quad and increment by rs_c. +" ld1 {v2.d}[0],[x27],x14 \n\t" // Load c04 into quad and increment by rs_c. +" ld1 {v2.d}[1],[x27],x14 \n\t" // Load c05 into quad and increment by rs_c. " \n\t" -" ld1 {v2.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v2.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v3.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v3.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" mov x27, x20 \n\t" // Load address of C. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" ld1 {v3.d}[0],[x27],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v3.d}[1],[x27],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v4.d}[0],[x27],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v4.d}[1],[x27],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v5.d}[0],[x27],x14 \n\t" // Load c14 into quad and increment by rs_c. +" ld1 {v5.d}[1],[x27],x14 \n\t" // Load c15 into quad and increment by rs_c. " \n\t" -" ld1 {v4.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v4.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v5.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. 
-" ld1 {v5.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta +" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta +" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta +" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta +" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta +" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" .DBETAZEROGENSTOREDS1: \n\t" " \n\t" -" ld1 {v6.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v6.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v7.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v7.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v2.2d,v10.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v3.2d,v11.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" -" prfm pldl1keep,[x16,0] \n\t" // Prefetch. -" prfm pldl1keep,[x17,0] \n\t" // Prefetch. +" mov x27, x2 \n\t" // Load address of C. " \n\t" -" fmul v20.2d,v0.2d,v31.d[0] \n\t" // Scale by beta -" fmul v21.2d,v1.2d,v31.d[0] \n\t" // Scale by beta -" fmul v22.2d,v2.2d,v31.d[0] \n\t" // Scale by beta -" fmul v23.2d,v3.2d,v31.d[0] \n\t" // Scale by beta -" fmul v24.2d,v4.2d,v31.d[0] \n\t" // Scale by beta -" fmul v25.2d,v5.2d,v31.d[0] \n\t" // Scale by beta -" fmul v26.2d,v6.2d,v31.d[0] \n\t" // Scale by beta -" fmul v27.2d,v7.2d,v31.d[0] \n\t" // Scale by beta +" st1 {v0.d}[0],[x27],x14 \n\t" // Store c00 into quad and increment by rs_c. +" st1 {v0.d}[1],[x27],x14 \n\t" // Store c01 into quad and increment by rs_c. 
+" st1 {v1.d}[0],[x27],x14 \n\t" // Store c02 into quad and increment by rs_c. +" st1 {v1.d}[1],[x27],x14 \n\t" // Store c03 into quad and increment by rs_c. +" st1 {v2.d}[0],[x27],x14 \n\t" // Store c04 into quad and increment by rs_c. +" st1 {v2.d}[1],[x27],x14 \n\t" // Store c05 into quad and increment by rs_c. " \n\t" -" .DBETAZEROGENSTORED: \n\t" // If beta==0, we cannot read from C (nor scale). +" mov x27, x20 \n\t" // Load address of C. " \n\t" -" fmla v20.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v21.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v22.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v23.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v24.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v25.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v26.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v27.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha +" st1 {v3.d}[0],[x27],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v3.d}[1],[x27],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v4.d}[0],[x27],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v4.d}[1],[x27],x14 \n\t" // Store c13 into quad and increment by rs_c. +" st1 {v5.d}[0],[x27],x14 \n\t" // Store c14 into quad and increment by rs_c. +" st1 {v5.d}[1],[x27],x14 \n\t" // Store c15 into quad and increment by rs_c. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. +" dup v8.2d, xzr \n\t" +" dup v9.2d, xzr \n\t" +" dup v10.2d, xzr \n\t" +" dup v11.2d, xzr \n\t" +" dup v12.2d, xzr \n\t" +" dup v13.2d, xzr \n\t" " \n\t" -" st1 {v20.d}[0],[x2],x14 \n\t" // Store c00 into quad and increment by rs_c. -" st1 {v20.d}[1],[x2],x14 \n\t" // Store c01 into quad and increment by rs_c. -" st1 {v21.d}[0],[x2],x14 \n\t" // Store c02 into quad and increment by rs_c. -" st1 {v21.d}[1],[x2],x14 \n\t" // Store c03 into quad and increment by rs_c. +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. 
" \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x21 \n\t" // Load address of C. " \n\t" -" st1 {v22.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v22.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v23.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v23.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" ld1 {v8.d}[0], [x27],x14 \n\t" // Load c20 into quad and increment by rs_c. +" ld1 {v8.d}[1], [x27],x14 \n\t" // Load c21 into quad and increment by rs_c. +" ld1 {v9.d}[0], [x27],x14 \n\t" // Load c22 into quad and increment by rs_c. +" ld1 {v9.d}[1], [x27],x14 \n\t" // Load c23 into quad and increment by rs_c. +" ld1 {v10.d}[0],[x27],x14 \n\t" // Load c24 into quad and increment by rs_c. +" ld1 {v10.d}[1],[x27],x14 \n\t" // Load c25 into quad and increment by rs_c. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x22 \n\t" // Load address of C. " \n\t" -" st1 {v24.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v24.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v25.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v25.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" ld1 {v11.d}[0],[x27],x14 \n\t" // Load c30 into quad and increment by rs_c. +" ld1 {v11.d}[1],[x27],x14 \n\t" // Load c31 into quad and increment by rs_c. +" ld1 {v12.d}[0],[x27],x14 \n\t" // Load c32 into quad and increment by rs_c. +" ld1 {v12.d}[1],[x27],x14 \n\t" // Load c33 into quad and increment by rs_c. +" ld1 {v13.d}[0],[x27],x14 \n\t" // Load c34 into quad and increment by rs_c. +" ld1 {v13.d}[1],[x27],x14 \n\t" // Load c35 into quad and increment by rs_c. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. 
-" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta +" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta +" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta +" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta +" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta +" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" st1 {v26.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v26.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v27.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v27.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" .DBETAZEROGENSTOREDS2: \n\t" +" \n\t" +" fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v10.2d,v16.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v11.2d,v17.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x21 \n\t" // Load address of C. +" \n\t" +" st1 {v8.d}[0], [x27],x14 \n\t" // Store c20 into quad and increment by rs_c. +" st1 {v8.d}[1], [x27],x14 \n\t" // Store c21 into quad and increment by rs_c. +" st1 {v9.d}[0], [x27],x14 \n\t" // Store c22 into quad and increment by rs_c. +" st1 {v9.d}[1], [x27],x14 \n\t" // Store c23 into quad and increment by rs_c. +" st1 {v10.d}[0],[x27],x14 \n\t" // Store c24 into quad and increment by rs_c. +" st1 {v10.d}[1],[x27],x14 \n\t" // Store c25 into quad and increment by rs_c. +" \n\t" +" mov x27, x22 \n\t" // Load address of C. +" \n\t" +" st1 {v11.d}[0],[x27],x14 \n\t" // Store c30 into quad and increment by rs_c. +" st1 {v11.d}[1],[x27],x14 \n\t" // Store c31 into quad and increment by rs_c. +" st1 {v12.d}[0],[x27],x14 \n\t" // Store c32 into quad and increment by rs_c. 
+" st1 {v12.d}[1],[x27],x14 \n\t" // Store c33 into quad and increment by rs_c. +" st1 {v13.d}[0],[x27],x14 \n\t" // Store c34 into quad and increment by rs_c. +" st1 {v13.d}[1],[x27],x14 \n\t" // Store c35 into quad and increment by rs_c. +" \n\t" +" dup v0.2d, xzr \n\t" +" dup v1.2d, xzr \n\t" +" dup v2.2d, xzr \n\t" +" dup v3.2d, xzr \n\t" +" dup v4.2d, xzr \n\t" +" dup v5.2d, xzr \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. +" \n\t" +" mov x27, x23 \n\t" // Load address of C. +" \n\t" +" ld1 {v0.d}[0],[x27],x14 \n\t" // Load c40 into quad and increment by rs_c. +" ld1 {v0.d}[1],[x27],x14 \n\t" // Load c41 into quad and increment by rs_c. +" ld1 {v1.d}[0],[x27],x14 \n\t" // Load c42 into quad and increment by rs_c. +" ld1 {v1.d}[1],[x27],x14 \n\t" // Load c43 into quad and increment by rs_c. +" ld1 {v2.d}[0],[x27],x14 \n\t" // Load c44 into quad and increment by rs_c. +" ld1 {v2.d}[1],[x27],x14 \n\t" // Load c45 into quad and increment by rs_c. +" \n\t" +" mov x27, x24 \n\t" // Load address of C. +" \n\t" +" ld1 {v3.d}[0],[x27],x14 \n\t" // Load c50 into quad and increment by rs_c. +" ld1 {v3.d}[1],[x27],x14 \n\t" // Load c51 into quad and increment by rs_c. +" ld1 {v4.d}[0],[x27],x14 \n\t" // Load c52 into quad and increment by rs_c. +" ld1 {v4.d}[1],[x27],x14 \n\t" // Load c53 into quad and increment by rs_c. +" ld1 {v5.d}[0],[x27],x14 \n\t" // Load c54 into quad and increment by rs_c. +" ld1 {v5.d}[1],[x27],x14 \n\t" // Load c55 into quad and increment by rs_c. 
+" \n\t" +" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta +" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta +" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta +" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta +" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta +" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS3: \n\t" +" \n\t" +" fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v2.2d,v22.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v3.2d,v23.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x23 \n\t" // Load address of C. +" \n\t" +" st1 {v0.d}[0],[x27],x14 \n\t" // Store c40 into quad and increment by rs_c. +" st1 {v0.d}[1],[x27],x14 \n\t" // Store c41 into quad and increment by rs_c. +" st1 {v1.d}[0],[x27],x14 \n\t" // Store c42 into quad and increment by rs_c. +" st1 {v1.d}[1],[x27],x14 \n\t" // Store c43 into quad and increment by rs_c. +" st1 {v2.d}[0],[x27],x14 \n\t" // Store c44 into quad and increment by rs_c. +" st1 {v2.d}[1],[x27],x14 \n\t" // Store c45 into quad and increment by rs_c. +" \n\t" +" mov x27, x24 \n\t" // Load address of C. +" \n\t" +" st1 {v3.d}[0],[x27],x14 \n\t" // Store c50 into quad and increment by rs_c. +" st1 {v3.d}[1],[x27],x14 \n\t" // Store c51 into quad and increment by rs_c. +" st1 {v4.d}[0],[x27],x14 \n\t" // Store c52 into quad and increment by rs_c. +" st1 {v4.d}[1],[x27],x14 \n\t" // Store c53 into quad and increment by rs_c. +" st1 {v5.d}[0],[x27],x14 \n\t" // Store c54 into quad and increment by rs_c. +" st1 {v5.d}[1],[x27],x14 \n\t" // Store c55 into quad and increment by rs_c. 
+" \n\t" +" dup v8.2d, xzr \n\t" +" dup v9.2d, xzr \n\t" +" dup v10.2d, xzr \n\t" +" dup v11.2d, xzr \n\t" +" dup v12.2d, xzr \n\t" +" dup v13.2d, xzr \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. +" \n\t" +" mov x27, x25 \n\t" +" \n\t" +" ld1 {v8.d}[0], [x27],x14 \n\t" // Load c60 into quad and increment by rs_c. +" ld1 {v8.d}[1], [x27],x14 \n\t" // Load c61 into quad and increment by rs_c. +" ld1 {v9.d}[0], [x27],x14 \n\t" // Load c62 into quad and increment by rs_c. +" ld1 {v9.d}[1], [x27],x14 \n\t" // Load c63 into quad and increment by rs_c. +" ld1 {v10.d}[0],[x27],x14 \n\t" // Load c64 into quad and increment by rs_c. +" ld1 {v10.d}[1],[x27],x14 \n\t" // Load c65 into quad and increment by rs_c. +" \n\t" +" mov x27, x26 \n\t" // Load address of C. +" \n\t" +" ld1 {v11.d}[0],[x27],x14 \n\t" // Load c70 into quad and increment by rs_c. +" ld1 {v11.d}[1],[x27],x14 \n\t" // Load c71 into quad and increment by rs_c. +" ld1 {v12.d}[0],[x27],x14 \n\t" // Load c72 into quad and increment by rs_c. +" ld1 {v12.d}[1],[x27],x14 \n\t" // Load c73 into quad and increment by rs_c. +" ld1 {v13.d}[0],[x27],x14 \n\t" // Load c74 into quad and increment by rs_c. +" ld1 {v13.d}[1],[x27],x14 \n\t" // Load c75 into quad and increment by rs_c. 
+" \n\t" +" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta +" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta +" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta +" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta +" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta +" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS4: \n\t" +" \n\t" +" prfm pldl2keep,[x3] \n\t" +" prfm pldl2keep,[x4] \n\t" +" \n\t" +" fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v10.2d,v28.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v11.2d,v29.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x25 \n\t" // Load address of C. +" \n\t" +" st1 {v8.d}[0], [x27],x14 \n\t" // Store c60 into quad and increment by rs_c. +" st1 {v8.d}[1], [x27],x14 \n\t" // Store c61 into quad and increment by rs_c. +" st1 {v9.d}[0], [x27],x14 \n\t" // Store c62 into quad and increment by rs_c. +" st1 {v9.d}[1], [x27],x14 \n\t" // Store c63 into quad and increment by rs_c. +" st1 {v10.d}[0],[x27],x14 \n\t" // Store c64 into quad and increment by rs_c. +" st1 {v10.d}[1],[x27],x14 \n\t" // Store c65 into quad and increment by rs_c. +" \n\t" +" mov x27, x26 \n\t" // Load address of C. +" \n\t" +" st1 {v11.d}[0],[x27],x14 \n\t" // Store c70 into quad and increment by rs_c. +" st1 {v11.d}[1],[x27],x14 \n\t" // Store c71 into quad and increment by rs_c. +" st1 {v12.d}[0],[x27],x14 \n\t" // Store c72 into quad and increment by rs_c. +" st1 {v12.d}[1],[x27],x14 \n\t" // Store c73 into quad and increment by rs_c. +" st1 {v13.d}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. +" st1 {v13.d}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. " \n\t" " .DEND: \n\t" // Done! 
" \n\t" @@ -784,10 +2047,12 @@ __asm__ volatile [a_next] "m" (a_next), // 8 [b_next] "m" (b_next) // 9 :// Register clobber list - "x0","x1","x2", + "x0","x1","x2","x3", "x4","x5","x6", "x7","x8","x9", "x10","x11","x12","x13","x14","x16","x17", + "x20","x21","x22","x23","x24","x25","x26", + "x27", "v0","v1","v2", "v3","v4","v5", "v6","v7","v8", @@ -796,7 +2061,7 @@ __asm__ volatile "v15","v16","v17","v18","v19", "v20","v21","v22","v23", "v24","v25","v26","v27", - "v30","v31" + "v28","v29","v30","v31" ); From 2bd036f1f9ce1ee0864365557f66d9415dd42de3 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 25 Mar 2016 12:16:49 -0500 Subject: [PATCH 04/10] Fix configuration issue where instruction set flags are not specified for debug builds. --- config/armv7a/make_defs.mk | 23 +++++++++++++++++++---- config/armv8a/make_defs.mk | 24 +++++++++++++++++++----- config/bulldozer/make_defs.mk | 21 ++++++++++++++++++--- config/carrizo/make_defs.mk | 21 ++++++++++++++++++--- config/cortex-a15/make_defs.mk | 7 ++----- config/cortex-a9/make_defs.mk | 7 ++----- config/dunnington/make_defs.mk | 7 ++----- config/haswell/make_defs.mk | 7 ++----- config/loongson3a/make_defs.mk | 7 ++----- config/mic/make_defs.mk | 3 --- config/piledriver/make_defs.mk | 7 ++----- config/power7/make_defs.mk | 3 --- config/reference/make_defs.mk | 3 --- config/sandybridge/make_defs.mk | 7 ++----- config/template/make_defs.mk | 3 --- configure | 3 ++- 16 files changed, 90 insertions(+), 63 deletions(-) diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index a28ad0878..ff1d84b0d 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -76,17 +76,32 @@ GIT_LOG := $(GIT) log --decorate # # --- Determine the C compiler and related flags --- +ifeq ($(CC),) CC := gcc +CC_VENDOR := gcc +endif +ifneq ($(CC_VENDOR),gcc) +$(error gcc is required for this configuration.) +endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g +CMISCFLAGS := -std=c99 -mfloat-abi=hard CPICFLAGS := -fPIC -CDBGFLAGS := #-g CWARNFLAGS := -Wall -COPTFLAGS := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O3 +endif + +CVECFLAGS := -mfpu=vfpv3 -marm -march=armv7-a CKOPTFLAGS := $(COPTFLAGS) -CVECFLAGS := #-msse3 # -mfpmath=sse # Aggregate all of the flags into multiple groups: one for standard # compilation, and one for each of the supported "special" compilation diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index 63c03c6a0..3cec43c92 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -76,18 +76,32 @@ GIT_LOG := $(GIT) log --decorate # # --- Determine the C compiler and related flags --- +ifeq ($(CC),) CC := gcc - +CC_VENDOR := gcc +endif +ifneq ($(CC_VENDOR),gcc) +$(error gcc is required for this configuration.) +endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
CPPROCFLAGS := -D_GNU_SOURCE -CMISCFLAGS := -std=c99 -march=armv8-a+fp+simd -ftree-vectorize -O3 -fopenmp -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53 +CMISCFLAGS := -std=c99 -fopenmp CPICFLAGS := -fPIC -CDBGFLAGS := -g #-g3 -gdwarf-2 CWARNFLAGS := -Wall -COPTFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53 + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O3 -ftree-vectorize -mtune=cortex-a57.cortex-a53 +endif + +CVECFLAGS := -march=armv8-a+fp+simd -mcpu=cortex-a57.cortex-a53 CKOPTFLAGS := $(COPTFLAGS) -CVECFLAGS := #-march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8 # Aggregate all of the flags into multiple groups: one for standard # compilation, and one for each of the supported "special" compilation diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 13e306a02..772ac1c53 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -76,17 +76,32 @@ GIT_LOG := $(GIT) log --decorate # # --- Determine the C compiler and related flags --- +ifeq ($(CC),) CC := gcc +CC_VENDOR := gcc +endif +ifneq ($(CC_VENDOR),gcc) +$(error gcc is required for this configuration.) +endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L CMISCFLAGS := -std=c99 -fopenmp CPICFLAGS := -fPIC -CDBGFLAGS := -g CWARNFLAGS := -Wall -COPTFLAGS := -O0 -malign-double -funroll-all-loops -CKOPTFLAGS := $(COPTFLAGS) + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O2 -malign-double -funroll-all-loops +endif + CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse +CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard # compilation, and one for each of the supported "special" compilation diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index 5f5303ade..08a5baaaf 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -76,17 +76,32 @@ GIT_LOG := $(GIT) log --decorate # # --- Determine the C compiler and related flags --- +ifeq ($(CC),) CC := gcc +CC_VENDOR := gcc +endif +ifneq ($(CC_VENDOR),gcc) +$(error gcc is required for this configuration.) +endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L CMISCFLAGS := -std=c99 -fopenmp CPICFLAGS := -fPIC -CDBGFLAGS := #-g CWARNFLAGS := -Wall -COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O2 -fomit-frame-pointer +endif + +CVECFLAGS := -mavx -mfma -march=native -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -CVECFLAGS := -mavx -mfma -march=native # Aggregate all of the flags into multiple groups: one for standard # compilation, and one for each of the supported "special" compilation diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index 078c063b7..e81c28f60 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -97,13 +97,10 @@ endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else -COPTFLAGS := -march=armv7-a -mfpu=neon -O2 -endif - -ifneq ($(DEBUG_TYPE),noopt) -CVECFLAGS := #-msse3 -march=native # -mfpmath=sse +COPTFLAGS := -O2 endif +CVECFLAGS := -march=armv7-a #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index 7dbc0aa77..e81c28f60 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -97,13 +97,10 @@ endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else -COPTFLAGS := -march=armv7-a -mfpu=neon -O2 -mfloat-abi=hard -endif - -ifneq ($(DEBUG_TYPE),noopt) -CVECFLAGS := #-msse3 -march=native # -mfpmath=sse +COPTFLAGS := -O2 endif +CVECFLAGS := -march=armv7-a #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index d065640f5..4d06567d0 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -97,13 +97,10 @@ endif ifeq 
($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else -COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer -endif - -ifneq ($(DEBUG_TYPE),noopt) -CVECFLAGS := -msse3 -march=native +COPTFLAGS := -O2 -fomit-frame-pointer endif +CVECFLAGS := -msse3 -march=native -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index a1865a98b..895746fc5 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -97,13 +97,10 @@ endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else -COPTFLAGS := -O3 -march=native -endif - -ifneq ($(DEBUG_TYPE),noopt) -CVECFLAGS := -mavx2 -mfma -mfpmath=sse #-msse3 -march=native # -mfpmath=sse +COPTFLAGS := -O3 endif +CVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index d3500a7c3..8296dcd92 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -97,13 +97,10 @@ endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else -COPTFLAGS := -O3 -march=loongson3a -mtune=loongson3a -endif - -ifneq ($(DEBUG_TYPE),noopt) -CVECFLAGS := #-msse3 -march=native # -mfpmath=sse +COPTFLAGS := -O3 -mtune=loongson3a endif +CVECFLAGS := -march=loongson3a #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 93e1efb9a..5e298269d 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -100,10 +100,7 @@ else COPTFLAGS := -O3 endif -ifneq ($(DEBUG_TYPE),noopt) CVECFLAGS := -endif - CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk 
index 89756bea1..08a5baaaf 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -97,13 +97,10 @@ endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else -COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer -endif - -ifneq ($(DEBUG_TYPE),noopt) -CVECFLAGS := -mavx -mfma -march=native +COPTFLAGS := -O2 -fomit-frame-pointer endif +CVECFLAGS := -mavx -mfma -march=native -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index 460f53d07..8beaa15fb 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -100,10 +100,7 @@ else COPTFLAGS := -O3 -mtune=power7 endif -ifneq ($(DEBUG_TYPE),noopt) CVECFLAGS := -mvsx -endif - CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index 509c4d9e5..b0ac0c62f 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -100,10 +100,7 @@ else COPTFLAGS := -O2 endif -ifneq ($(DEBUG_TYPE),noopt) CVECFLAGS := #-msse3 -march=native # -mfpmath=sse -endif - CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index c1fd57176..4b96d93f7 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -97,13 +97,10 @@ endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else -COPTFLAGS := -O3 -march=native -endif - -ifneq ($(DEBUG_TYPE),noopt) -CVECFLAGS := -mavx -mfpmath=sse #-msse3 -march=native # -mfpmath=sse +COPTFLAGS := -O3 endif +CVECFLAGS := -mavx -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index 
509c4d9e5..b0ac0c62f 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -100,10 +100,7 @@ else COPTFLAGS := -O2 endif -ifneq ($(DEBUG_TYPE),noopt) CVECFLAGS := #-msse3 -march=native # -mfpmath=sse -endif - CKOPTFLAGS := $(COPTFLAGS) # Aggregate all of the flags into multiple groups: one for standard diff --git a/configure b/configure index 4528a826e..63303434e 100755 --- a/configure +++ b/configure @@ -206,6 +206,7 @@ main() ;; enable-debug) debug_flag=1 + debug_type=noopt ;; enable-debug=*) debug_flag=1 @@ -327,7 +328,7 @@ main() # Check if the debug flag was specified. if [ -n "${debug_flag}" ]; then - if [ ${debug_type} = 'opt' ]; then + if [ "x${debug_type}" = "xopt" ]; then echo "${script_name}: enabling debug symbols with optimizations." else debug_type='noopt' From 9452bdb3afbf2d7f898134a091d7790817e7be9c Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 25 Mar 2016 14:59:50 -0500 Subject: [PATCH 05/10] Add options for verbose make output and static/shared linking to configure. --- build/config.mk.in | 4 ++++ configure | 43 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/build/config.mk.in b/build/config.mk.in index db63e517f..2af506740 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -57,6 +57,10 @@ CC_VENDOR := @cc_vendor@ # may install to a temporary location. INSTALL_PREFIX := $(DESTDIR)@install_prefix@ +# Variables corresponding to other configure-time options. +BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := @enable_verbose@ +BLIS_ENABLE_STATIC_BUILD := @enable_static@ +BLIS_ENABLE_DYNAMIC_BUILD := @enable_shared@ # end of ifndef CONFIG_MK_INCLUDED conditional block endif diff --git a/configure b/configure index 4528a826e..95466771e 100755 --- a/configure +++ b/configure @@ -73,6 +73,21 @@ print_usage() echo " kept in the framework, otherwise optimization is" echo " turned off." 
echo " " + echo " --enable-verbose-make, --disable-verbose-make" + echo " " + echo " Enable (disabled by default) verbose compilation" + echo " output during make." + echo " " + echo " --disable-static, --enable-static" + echo " " + echo " Disable (enabled by default) building BLIS as a static" + echo " library. May be combined with --enable-shared." + echo " " + echo " --enable-shared, --disable-shared" + echo " " + echo " Enable (disabled by default) building BLIS as a shared" + echo " library. May be combined with --enable-static." + echo " " echo " -q, --quiet Suppress informational output. By default, configure" echo " is verbose. (NOTE: -q is not yet implemented)" echo " " @@ -85,7 +100,7 @@ print_usage() echo " Environment variables may also be specified as command line" echo " options, e.g.:" echo " " - echo " ./configure CC=gcc sandybridge" + echo " ./configure [options] CC=gcc sandybridge" echo " " echo " Note that not all compilers are compatible with a given" echo " configuration." @@ -166,6 +181,11 @@ main() # Option variables. quiet_flag='' + + # Additional flags. + enable_verbose='yes' + enable_static='yes' + enable_shared='no' # The path to the auto-detection script. 
auto_detect_sh="${build_dirpath}/auto-detect/auto-detect.sh" @@ -211,6 +231,24 @@ main() debug_flag=1 debug_type=${OPTARG#*=} ;; + enable-verbose-make) + enable_verbose='yes' + ;; + disable-verbose-make) + enable_verbose='no' + ;; + enable-static) + enable_static='yes' + ;; + disable-static) + enable_static='no' + ;; + enable-shared) + enable_shared='yes' + ;; + disable-shared) + enable_shared='no' + ;; *) print_usage ;; @@ -375,6 +413,9 @@ main() | sed "s/@cc_vendor@/${cc_vendor}/g" \ | sed "s/@debug_type@/${debug_type}/g" \ | sed "s/@install_prefix@/${install_prefix_esc}/g" \ + | sed "s/@enable_verbose@/${enable_verbose}/g" \ + | sed "s/@enable_static@/${enable_static}/g" \ + | sed "s/@enable_shared@/${enable_shared}/g" \ > "${config_mk_out_path}" From 76099f20be1b49ac960f7e3c5a8296bbf4e1782d Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 25 Mar 2016 17:22:58 -0500 Subject: [PATCH 06/10] Add threading option to configure. --- Makefile | 71 +++++++++++++++++++++++++++++++ build/config.mk.in | 17 +++++--- config/armv7a/make_defs.mk | 40 ------------------ config/armv8a/make_defs.mk | 44 +------------------ config/bgq/make_defs.mk | 41 +----------------- config/bulldozer/make_defs.mk | 42 +----------------- config/carrizo/bli_config.h | 3 -- config/carrizo/make_defs.mk | 44 +------------------ config/cortex-a15/make_defs.mk | 40 ------------------ config/cortex-a9/make_defs.mk | 40 ------------------ config/dunnington/make_defs.mk | 42 +----------------- config/emscripten/make_defs.mk | 41 +----------------- config/haswell/bli_config.h | 5 --- config/haswell/make_defs.mk | 44 +------------------ config/loongson3a/make_defs.mk | 42 +----------------- config/mic/bli_config.h | 1 - config/mic/make_defs.mk | 44 +------------------ config/piledriver/bli_config.h | 3 -- config/piledriver/make_defs.mk | 44 +------------------ config/pnacl/make_defs.mk | 41 +----------------- config/power7/make_defs.mk | 42 +----------------- config/reference/make_defs.mk | 42 
+----------------- config/sandybridge/bli_config.h | 6 --- config/sandybridge/make_defs.mk | 44 +------------------ config/template/make_defs.mk | 42 +----------------- configure | 75 +++++++++++++++++++++++++++++++-- 26 files changed, 173 insertions(+), 767 deletions(-) diff --git a/Makefile b/Makefile index c27ecd8f0..2b11f40a9 100644 --- a/Makefile +++ b/Makefile @@ -138,6 +138,28 @@ BASE_LIB_PATH := ./$(LIB_DIR)/$(CONFIG_NAME) +# +# --- Utility program definitions ---------------------------------------------- +# + +SH := /bin/sh +MV := mv +MKDIR := mkdir -p +RM_F := rm -f +RM_RF := rm -rf +SYMLINK := ln -sf +FIND := find +GREP := grep +XARGS := xargs +RANLIB := ranlib +INSTALL := install -c + +# Used to refresh CHANGELOG. +GIT := git +GIT_LOG := $(GIT) log --decorate + + + # # --- Include makefile definitions file ---------------------------------------- # @@ -157,6 +179,55 @@ else MAKE_DEFS_MK_PRESENT := no endif +# Deal with threading flags and aggregate all of the flags into multiple groups: +# one for standard compilation, and one for each of the supported "special" +# compilation modes. + +ifeq ($(CC_VENDOR),gcc) +ifeq ($(THREADING_MODEL),auto) +THREADING_MODEL := omp +endif +ifeq ($(THREADING_MODEL),omp) +CTHREADFLAGS := -fopenmp -DBLIS_ENABLE_OPENMP +LDFLAGS += -fopenmp +endif +ifeq ($(THREADING_MODEL),pthreads) +CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS +LDFLAGS += -pthread +endif +endif + +ifeq ($(CC_VENDOR),icc) +ifeq ($(THREADING_MODEL),auto) +THREADING_MODEL := omp +endif +ifeq ($(THREADING_MODEL),omp) +CTHREADFLAGS := -openmp -DBLIS_ENABLE_OPENMP +LDFLAGS += -openmp +endif +ifeq ($(THREADING_MODEL),pthreads) +CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS +LDFLAGS += -pthread +endif +endif + +ifeq ($(CC_VENDOR),clang) +ifeq ($(THREADING_MODEL),auto) +THREADING_MODEL := pthreads +endif +ifeq ($(THREADING_MODEL),omp) +$(error OpenMP is not supported with Clang.) 
+endif +ifeq ($(THREADING_MODEL),pthreads) +CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS +LDFLAGS += -pthread +endif +endif + +CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CTHREADFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) +CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) + # diff --git a/build/config.mk.in b/build/config.mk.in index 2af506740..a043d7aa9 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -37,20 +37,23 @@ ifndef CONFIG_MK_INCLUDED CONFIG_MK_INCLUDED := yes # The name of the configuration sub-directory. -CONFIG_NAME := @config_name@ +CONFIG_NAME := @config_name@ -# The operating system name, which should be either 'Linux' or 'Darwin'. -OS_NAME := $(shell uname -s) +# The operating system name, which should be either 'Linux' or 'Darwin'. +OS_NAME := $(shell uname -s) # The directory path to the top level of the source distribution. -DIST_PATH := @dist_path@ +DIST_PATH := @dist_path@ # The level of debugging info to generate. -DEBUG_TYPE := @debug_type@ +DEBUG_TYPE := @debug_type@ # The C compiler. -CC := @CC@ -CC_VENDOR := @cc_vendor@ +CC := @CC@ +CC_VENDOR := @cc_vendor@ + +# The requested threading model. +THREADING_MODEL := @threading_model@ # The install prefix tell us where to install the libraries and header file # directory. Notice that we support the use of DESTDIR so that advanced users diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index ff1d84b0d..40b6c179a 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. 
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -103,13 +70,6 @@ endif CVECFLAGS := -mfpu=vfpv3 -marm -march=armv7-a CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index 3cec43c92..654a9ff92 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. 
-GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_GNU_SOURCE -CMISCFLAGS := -std=c99 -fopenmp +CMISCFLAGS := -std=c99 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := -march=armv8-a+fp+simd -mcpu=cortex-a57.cortex-a53 CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru @@ -117,7 +77,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared -LDFLAGS := -lm -fopenmp +LDFLAGS := -lm diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 050f353fb..0f405102b 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. 
-GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # # --- Determine the C compiler and related flags --- CC := /bgsys/drivers/ppcfloor/comm/gcc.legacy/bin/mpixlc_r +CC_VENDOR := IBM # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L \ @@ -89,13 +57,6 @@ COPTFLAGS := -O3 CKOPTFLAGS := $(COPTFLAGS) CVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunroll=yes -qnoipa -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 772ac1c53..78f47d908 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
# NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 -fopenmp +CMISCFLAGS := -std=c99 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/carrizo/bli_config.h b/config/carrizo/bli_config.h index b0da5de46..86a584112 100644 --- a/config/carrizo/bli_config.h +++ b/config/carrizo/bli_config.h @@ -36,9 +36,6 @@ #define BLIS_CONFIG_H -//#define BLIS_ENABLE_PTHREADS -#define BLIS_ENABLE_OPENMP - #define BLIS_SIMD_ALIGN_SIZE 16 diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index 08a5baaaf..4708a8f36 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. 
-GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 -fopenmp +CMISCFLAGS := -std=c99 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := -mavx -mfma -march=native -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru @@ -117,7 +77,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared -LDFLAGS := -lm -fopenmp +LDFLAGS := -lm diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index e81c28f60..6f584f14c 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. 
-GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -103,13 +70,6 @@ endif CVECFLAGS := -march=armv7-a #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index e81c28f60..6f584f14c 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -103,13 +70,6 @@ endif CVECFLAGS := -march=armv7-a #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. 
-CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index 4d06567d0..e67d45e85 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 # -fopenmp -pg +CMISCFLAGS := -std=c99 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := -msse3 -march=native -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. 
-CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index 55107d98a..45b210ab6 100644 --- a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := emranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # # --- Determine the C compiler and related flags --- CC := emcc +CC_VENDOR := emcc # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L @@ -88,13 +56,6 @@ COPTFLAGS := -O2 CKOPTFLAGS := -O3 CVECFLAGS := -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. 
-CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := emar ARFLAGS := cru diff --git a/config/haswell/bli_config.h b/config/haswell/bli_config.h index 5f66f6dae..89bba2b20 100644 --- a/config/haswell/bli_config.h +++ b/config/haswell/bli_config.h @@ -35,11 +35,6 @@ #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H -// Enable multithreading via POSIX threads. -//#define BLIS_ENABLE_PTHREADS - -// Enable multithreading via OpenMP. -#define BLIS_ENABLE_OPENMP diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 895746fc5..cbc11f37a 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 -m64 -fopenmp # -fopenmp -pg +CMISCFLAGS := -std=c99 -m64 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru @@ -117,7 +77,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared -LDFLAGS := -lm -fopenmp -lpthread +LDFLAGS := -lm diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index 8296dcd92..8bb13192c 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -mabi=64 -CMISCFLAGS := -std=c99 -fopenmp #-pg +CMISCFLAGS := -std=c99 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := -march=loongson3a #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/mic/bli_config.h b/config/mic/bli_config.h index a119a2dde..36b14cf4c 100644 --- a/config/mic/bli_config.h +++ b/config/mic/bli_config.h @@ -39,7 +39,6 @@ #define BLIS_TREE_BARRIER #define BLIS_TREE_BARRIER_ARITY 4 -#define BLIS_ENABLE_OPENMP #define BLIS_SIMD_ALIGN_SIZE 32 diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 5e298269d..21af9e2e2 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
# NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp +CMISCFLAGS := -mmic -fasm-blocks -std=c99 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru @@ -117,7 +77,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared -LDFLAGS := -mmic -lm -openmp +LDFLAGS := -mmic -lm diff --git a/config/piledriver/bli_config.h b/config/piledriver/bli_config.h index dce91516d..38708a0b2 100644 --- a/config/piledriver/bli_config.h +++ b/config/piledriver/bli_config.h @@ -36,9 +36,6 @@ #define BLIS_CONFIG_H -//#define BLIS_ENABLE_PTHREADS - -#define BLIS_ENABLE_OPENMP #define BLIS_SIMD_ALIGN_SIZE 16 diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index 08a5baaaf..4708a8f36 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. 
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 -fopenmp +CMISCFLAGS := -std=c99 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := -mavx -mfma -march=native -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru @@ -117,7 +77,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared -LDFLAGS := -lm -fopenmp +LDFLAGS := -lm diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index e1fa986be..e957cf429 100644 --- a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. 
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := pnacl-ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # # --- Determine the C compiler and related flags --- CC := pnacl-clang +CC_VENDOR := pnacl-clang # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L @@ -88,13 +56,6 @@ COPTFLAGS := -O3 CKOPTFLAGS := $(COPTFLAGS) -ffast-math CVECFLAGS := -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := pnacl-ar ARFLAGS := rcs diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index 8beaa15fb..d03857a44 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. 
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 -m64 -mcpu=power7 #-fopenmp -pg +CMISCFLAGS := -std=c99 -m64 -mcpu=power7 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := -mvsx CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index b0ac0c62f..b17e3a0ba 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. 
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 # -fopenmp -pg +CMISCFLAGS := -std=c99 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. -CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/sandybridge/bli_config.h b/config/sandybridge/bli_config.h index 5f66f6dae..5b915c737 100644 --- a/config/sandybridge/bli_config.h +++ b/config/sandybridge/bli_config.h @@ -35,12 +35,6 @@ #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H -// Enable multithreading via POSIX threads. -//#define BLIS_ENABLE_PTHREADS - -// Enable multithreading via OpenMP. 
-#define BLIS_ENABLE_OPENMP - #endif diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index 4b96d93f7..c69387c7b 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 -m64 -fopenmp # -fopenmp -pg +CMISCFLAGS := -std=c99 -m64 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := -mavx -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. 
-CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru @@ -117,7 +77,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared -LDFLAGS := -lm -fopenmp -lpthread +LDFLAGS := -lm diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index b0ac0c62f..b17e3a0ba 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes -# -# --- Build definitions -------------------------------------------------------- -# - -# Variables corresponding to other configure-time options. -BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no -BLIS_ENABLE_STATIC_BUILD := yes -BLIS_ENABLE_DYNAMIC_BUILD := no - - - -# -# --- Utility program definitions ---------------------------------------------- -# - -SH := /bin/sh -MV := mv -MKDIR := mkdir -p -RM_F := rm -f -RM_RF := rm -rf -SYMLINK := ln -sf -FIND := find -GREP := grep -XARGS := xargs -RANLIB := ranlib -INSTALL := install -c - -# Used to refresh CHANGELOG. -GIT := git -GIT_LOG := $(GIT) log --decorate - - - # # --- Development tools definitions -------------------------------------------- # @@ -86,7 +53,7 @@ endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 # -fopenmp -pg +CMISCFLAGS := -std=c99 CPICFLAGS := -fPIC CWARNFLAGS := -Wall @@ -103,13 +70,6 @@ endif CVECFLAGS := #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) -# Aggregate all of the flags into multiple groups: one for standard -# compilation, and one for each of the supported "special" compilation -# modes. 
-CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) -CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) -CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) - # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/configure b/configure index 5eab11149..325120c32 100755 --- a/configure +++ b/configure @@ -88,6 +88,14 @@ print_usage() echo " Enable (disabled by default) building BLIS as a shared" echo " library. May be combined with --enable-static." echo " " + echo " -t MODEL, --enable-threading[=MODEL], --disable-threading" + echo " " + echo " Enable threading in the library, using threading model" + echo " MODEL={auto,omp,pthreads,no}. If MODEL=no or " + echo " --disable-threading is specified, threading will be" + echo " disabled. If MODEL=auto or is unspecified, a model" + echo " will be chosen automatically. The default is 'auto'." + echo " " echo " -q, --quiet Suppress informational output. By default, configure" echo " is verbose. (NOTE: -q is not yet implemented)" echo " " @@ -179,11 +187,14 @@ main() debug_type='' debug_flag='' + # The threading flag. + threading_model='auto' + # Option variables. quiet_flag='' # Additional flags. - enable_verbose='yes' + enable_verbose='no' enable_static='yes' enable_shared='no' @@ -210,7 +221,7 @@ main() # Process our command line options. - while getopts ":hp:d:q-:" opt; do + while getopts ":hp:d:t:q-:" opt; do case $opt in -) case "$OPTARG" in @@ -232,6 +243,9 @@ main() debug_flag=1 debug_type=${OPTARG#*=} ;; + disable-debug) + debug_flag=0 + ;; enable-verbose-make) enable_verbose='yes' ;; @@ -250,6 +264,15 @@ main() disable-shared) enable_shared='no' ;; + enable-threading) + threading_model='auto' + ;; + enable-threading=*) + threading_model=${OPTARG#*=} + ;; + disable-threading) + threading_model='no' + ;; *) print_usage ;; @@ -268,6 +291,9 @@ main() q) quiet_flag=1 ;; + t) + threading_model=$OPTARG + ;; \?) 
print_usage ;; @@ -376,6 +402,46 @@ main() debug_type='off' echo "${script_name}: debug symbols disabled." fi + + + # Check if the verbose make flag was specified. + if [ "x${enable_verbose}" = "xyes" ]; then + echo "${script_name}: enabling verbose make output, disable with 'make V=0'." + else + echo "${script_name}: disabling verbose make output, enable with 'make V=1'." + fi + + + # Check if the static lib flag was specified. + if [ "x${enable_static}" = "xyes" ]; then + echo "${script_name}: building BLIS as a static library." + fi + + # Check if the shared lib flag was specified. + if [ "x${enable_shared}" = "xyes" ]; then + echo "${script_name}: building BLIS as a shared library." + fi + + # Check if neither flag was specified. + if [ "x${enable_static}" = "xno" -a "x${enable_shared}" = "xno" ]; then + echo "Neither a shared nor static library build has been requested." + exit 1 + fi + + + # Check the threading model flag. + if [ "x${threading_model}" = "xauto" ]; then + echo "${script_name}: determining the threading model automatically." + elif [ "x${threading_model}" = "xomp" ]; then + echo "${script_name}: using OpenMP for threading." + elif [ "x${threading_model}" = "xpthreads" ]; then + echo "${script_name}: using Pthreads for threading." + elif [ "x${threading_model}" = "xno" ]; then + echo "${script_name}: threading is disabled." + else + echo "Unsupported threading model: ${threading_model}." + exit 1 + fi # Determine the compiler vendor if CC was specified. @@ -389,7 +455,7 @@ main() cc_vendor=`$CC -qversion 2>/dev/null | grep -o 'IBM'` fi if [ -z "$cc_vendor" ]; then - echo Unable to determine compiler vendor. + echo "Unable to determine compiler vendor." 
exit 1 fi cc_vendor=`echo $cc_vendor | { read first rest; echo $first; }` @@ -416,7 +482,8 @@ main() | sed "s/@install_prefix@/${install_prefix_esc}/g" \ | sed "s/@enable_verbose@/${enable_verbose}/g" \ | sed "s/@enable_static@/${enable_static}/g" \ - | sed "s/@enable_shared@/${enable_shared}/g" \ + | sed "s/@enable_dynamic@/${enable_shared}/g" \ + | sed "s/@threading_model@/${threading_model}/g" \ > "${config_mk_out_path}" From 8442d65c9ead0376fc5f2dfad62fd4862ab9b2b3 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 25 Mar 2016 20:06:48 -0500 Subject: [PATCH 07/10] Replace -march=native with specific architecture flags to support cross-compiling, and add icc support for Intel architectures. --- config/carrizo/make_defs.mk | 2 +- config/cortex-a15/make_defs.mk | 2 +- config/cortex-a9/make_defs.mk | 2 +- config/dunnington/make_defs.mk | 15 +++++++++++---- config/haswell/make_defs.mk | 15 +++++++++++---- config/loongson3a/make_defs.mk | 2 +- config/piledriver/make_defs.mk | 2 +- config/reference/make_defs.mk | 2 +- config/sandybridge/make_defs.mk | 15 +++++++++++---- config/template/make_defs.mk | 2 +- 10 files changed, 40 insertions(+), 19 deletions(-) diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index 4708a8f36..aaecb2d2c 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -67,7 +67,7 @@ else COPTFLAGS := -O2 -fomit-frame-pointer endif -CVECFLAGS := -mavx -mfma -march=native -mfpmath=sse +CVECFLAGS := -mavx -mfma -march=bdver4 -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index 6f584f14c..ec5360da4 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -67,7 +67,7 @@ else COPTFLAGS := -O2 endif -CVECFLAGS := -march=armv7-a #-msse3 -march=native # -mfpmath=sse +CVECFLAGS := -march=armv7-a CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related 
flags --- diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index 6f584f14c..ec5360da4 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -67,7 +67,7 @@ else COPTFLAGS := -O2 endif -CVECFLAGS := -march=armv7-a #-msse3 -march=native # -mfpmath=sse +CVECFLAGS := -march=armv7-a CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index e67d45e85..8448b723f 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -47,9 +47,7 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq ($(CC_VENDOR),gcc) -$(error gcc is required for this configuration.) -endif + # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L @@ -67,9 +65,18 @@ else COPTFLAGS := -O2 -fomit-frame-pointer endif -CVECFLAGS := -msse3 -march=native -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CVECFLAGS := -msse3 -march=nehalem -mfpmath=sse +else +ifeq ($(CC_VENDOR),icc) +CVECFLAGS := -xSSE4.2 +else +$(error gcc or icc is required for this configuration.) +endif +endif + # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index cbc11f37a..cb0fe5c11 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -47,9 +47,7 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq ($(CC_VENDOR),gcc) -$(error gcc is required for this configuration.) -endif + # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L @@ -67,9 +65,18 @@ else COPTFLAGS := -O3 endif -CVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell +else +ifeq ($(CC_VENDOR),icc) +CVECFLAGS := -xCORE-AVX2 +else +$(error gcc or icc is required for this configuration.) +endif +endif + # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index 8bb13192c..bb1248d37 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -67,7 +67,7 @@ else COPTFLAGS := -O3 -mtune=loongson3a endif -CVECFLAGS := -march=loongson3a #-msse3 -march=native # -mfpmath=sse +CVECFLAGS := -march=loongson3a CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index 4708a8f36..e241789dd 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -67,7 +67,7 @@ else COPTFLAGS := -O2 -fomit-frame-pointer endif -CVECFLAGS := -mavx -mfma -march=native -mfpmath=sse +CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index b17e3a0ba..736e5ee4d 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -67,7 +67,7 @@ else COPTFLAGS := -O2 endif -CVECFLAGS := #-msse3 -march=native # -mfpmath=sse +CVECFLAGS := CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index c69387c7b..9f6c4366a 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -47,9 +47,7 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq 
($(CC_VENDOR),gcc) -$(error gcc is required for this configuration.) -endif + # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L @@ -67,9 +65,18 @@ else COPTFLAGS := -O3 endif -CVECFLAGS := -mavx -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CVECFLAGS := -mavx -mfpmath=sse -march=sandybridge +else +ifeq ($(CC_VENDOR),icc) +CVECFLAGS := -xAVX +else +$(error gcc or icc is required for this configuration.) +endif +endif + # --- Determine the archiver and related flags --- AR := ar ARFLAGS := cru diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index b17e3a0ba..37de32882 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -67,7 +67,7 @@ else COPTFLAGS := -O2 endif -CVECFLAGS := #-msse3 -march=native # -mfpmath=sse +CVECFLAGS := #-msse3 -march=core2 # -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- From 469429ec34e5b1a172ce35596f9c7afdaacac131 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 25 Mar 2016 20:45:41 -0500 Subject: [PATCH 08/10] Fix LD_FLAGS -> LDFLAGS. 
--- Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 2b11f40a9..1f5ae5df7 100644 --- a/Makefile +++ b/Makefile @@ -189,11 +189,11 @@ THREADING_MODEL := omp endif ifeq ($(THREADING_MODEL),omp) CTHREADFLAGS := -fopenmp -DBLIS_ENABLE_OPENMP -LD_FLAGS += -fopenmp +LDFLAGS += -fopenmp endif ifeq ($(THREADING_MODEL),pthreads) CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS -LD_FLAGS += -pthread +LDFLAGS += -pthread endif endif @@ -203,11 +203,11 @@ THREADING_MODEL := omp endif ifeq ($(THREADING_MODEL),omp) CTHREADFLAGS := -openmp -DBLIS_ENABLE_OPENMP -LD_FLAGS += -openmp +LDFLAGS += -openmp endif ifeq ($(THREADING_MODEL),pthreads) CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS -LD_FLAGS += -pthread +LDFLAGS += -pthread endif endif @@ -220,7 +220,7 @@ $(error OpenMP is not supported with Clang.) endif ifeq ($(THREADING_MODEL),pthreads) CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS -LD_FLAGS += -pthread +LDFLAGS += -pthread endif endif From 0171ad58997b3a5a9b76301511dbe0751fffc940 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 28 Mar 2016 13:55:06 -0500 Subject: [PATCH 09/10] Add icc and clang support for Intel architectures, fixes #47. 2bd036f fixes #49 BTW. 
--- Makefile | 34 ++++++++++++++++++++++++++++++--- build/config.mk.in | 1 - config/dunnington/make_defs.mk | 8 ++++++-- config/haswell/make_defs.mk | 8 ++++++-- config/sandybridge/make_defs.mk | 8 ++++++-- configure | 19 ------------------ 6 files changed, 49 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index 1f5ae5df7..6a4cc637a 100644 --- a/Makefile +++ b/Makefile @@ -160,6 +160,29 @@ GIT_LOG := $(GIT) log --decorate +# +# --- Determine the compiler vendor -------------------------------------------- +# + +ifneq ($(CC),) + +VENDOR_STRING := $(shell $(CC) --version 2>/dev/null) +ifeq ($(VENDOR_STRING),) +VENDOR_STRING := $(shell $(CC) -qversion 2>/dev/null) +endif +ifeq ($(VENDOR_STRING),) +$(error Unable to determine compiler vendor.) +endif + +CC_VENDOR := $(firstword $(shell echo '$(VENDOR_STRING)' | grep -Eo 'icc|gcc|clang|emcc|pnacl|IBM')) +ifeq ($(CC_VENDOR),) +$(error Unable to determine compiler vendor.) +endif + +endif + + + # # --- Include makefile definitions file ---------------------------------------- # @@ -179,9 +202,11 @@ else MAKE_DEFS_MK_PRESENT := no endif -# Deal with threading flags and aggregate all of the flags into multiple groups: -# one for standard compilation, and one for each of the supported "special" -# compilation modes. + + +# +# --- Configuration-agnostic flags --------------------------------------------- +# ifeq ($(CC_VENDOR),gcc) ifeq ($(THREADING_MODEL),auto) @@ -224,6 +249,9 @@ LDFLAGS += -pthread endif endif +# Aggregate all of the flags into multiple groups: one for standard compilation, +# and one for each of the supported "special" compilation modes. 
+ CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CTHREADFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT) diff --git a/build/config.mk.in b/build/config.mk.in index a043d7aa9..8bdb427a0 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -50,7 +50,6 @@ DEBUG_TYPE := @debug_type@ # The C compiler. CC := @CC@ -CC_VENDOR := @cc_vendor@ # The requested threading model. THREADING_MODEL := @threading_model@ diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index 8448b723f..fed36506b 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -68,12 +68,16 @@ endif CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CVECFLAGS := -msse3 -march=nehalem -mfpmath=sse +CVECFLAGS := -msse3 -march=corei7 -mfpmath=sse else ifeq ($(CC_VENDOR),icc) CVECFLAGS := -xSSE4.2 else -$(error gcc or icc is required for this configuration.) +ifeq ($(CC_VENDOR),clang) +CVECFLAGS := -msse3 -mfpmath=sse -march=corei7 +else +$(error gcc, icc, or clang is required for this configuration.) +endif endif endif diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index cb0fe5c11..1640a40b9 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -68,12 +68,16 @@ endif CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell +CVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2 else ifeq ($(CC_VENDOR),icc) CVECFLAGS := -xCORE-AVX2 else -$(error gcc or icc is required for this configuration.) +ifeq ($(CC_VENDOR),clang) +CVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2 +else +$(error gcc, icc, or clang is required for this configuration.) 
+endif endif endif diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index 9f6c4366a..082a73f92 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -68,12 +68,16 @@ endif CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CVECFLAGS := -mavx -mfpmath=sse -march=sandybridge +CVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx else ifeq ($(CC_VENDOR),icc) CVECFLAGS := -xAVX else -$(error gcc or icc is required for this configuration.) +ifeq ($(CC_VENDOR),clang) +CVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx +else +$(error gcc, icc, or clang is required for this configuration.) +endif endif endif diff --git a/configure b/configure index 325120c32..20c4271ea 100755 --- a/configure +++ b/configure @@ -442,24 +442,6 @@ main() echo "Unsupported threading model: ${threading_model}." exit 1 fi - - - # Determine the compiler vendor if CC was specified. - if [ -n "$CC" ]; then - if $CC --version 2>/dev/null | grep -q 'pnacl-version'; then - cc_vendor='pnacl-clang' - else - cc_vendor=`$CC --version 2>/dev/null | grep -Eo 'icc|gcc|clang|emcc'` - fi - if [ -z "$cc_vendor" ]; then - cc_vendor=`$CC -qversion 2>/dev/null | grep -o 'IBM'` - fi - if [ -z "$cc_vendor" ]; then - echo "Unable to determine compiler vendor." - exit 1 - fi - cc_vendor=`echo $cc_vendor | { read first rest; echo $first; }` - fi # Insert escape characters into the paths used in the sed command below. @@ -477,7 +459,6 @@ main() | sed "s/@config_name@/${config_name}/g" \ | sed "s/@dist_path@/${dist_path_esc}/g" \ | sed "s/@CC@/${cc_esc}/g" \ - | sed "s/@cc_vendor@/${cc_vendor}/g" \ | sed "s/@debug_type@/${debug_type}/g" \ | sed "s/@install_prefix@/${install_prefix_esc}/g" \ | sed "s/@enable_verbose@/${enable_verbose}/g" \ From 1b09e343dfe5b48b4842e2cb96f41c8cc249bad0 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 29 Mar 2016 12:55:28 -0500 Subject: [PATCH 10/10] Updated gcc version from 4.8 to 4.9 in .travis.yml. 
--- .travis.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index bfe7412a5..71875d79c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,20 +15,19 @@ env: - RUN_TEST=0 BUILD_CONFIG="carrizo" install: -- if [ "$CC" = "gcc" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi +- if [ "$CC" = "gcc" ]; then export CC="gcc-4.9"; fi addons: apt: sources: - ubuntu-toolchain-r-test packages: - - gcc-4.8 - - g++-4.8 + - gcc-4.9 - clang script: - ./configure $BUILD_CONFIG - - make CC=gcc-4.8 + - make CC=gcc-4.9 - if [ $RUN_TEST -eq 1 ]; then make BLIS_ENABLE_TEST_OUTPUT=yes test; fi - - if [ $RUN_TEST -eq 1 ]; then ./build/check-test.sh ./output.testsuite; fi \ No newline at end of file + - if [ $RUN_TEST -eq 1 ]; then ./build/check-test.sh ./output.testsuite; fi