From 08666eaa20d8a31f2f92f944e5bfa7c1558c53e4 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 22 Jul 2016 11:07:34 -0500 Subject: [PATCH 1/7] Change -openmp to -fopenmp for icc. --- common.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common.mk b/common.mk index dccc450cf..458cdcc03 100644 --- a/common.mk +++ b/common.mk @@ -170,8 +170,8 @@ ifeq ($(THREADING_MODEL),auto) THREADING_MODEL := omp endif ifeq ($(THREADING_MODEL),omp) -CTHREADFLAGS := -openmp -LDFLAGS += -openmp +CTHREADFLAGS := -fopenmp +LDFLAGS += -fopenmp endif ifeq ($(THREADING_MODEL),pthreads) CTHREADFLAGS := -pthread From ee2c139df6ad53c6aec8a67ab23b3b1912e8d259 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 22 Jul 2016 12:06:03 -0500 Subject: [PATCH 2/7] Remove alignment restrictions on C in haswell kernel. --- kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 114 +++++++------------ 1 file changed, 41 insertions(+), 73 deletions(-) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index 1fb390a6a..daf625f0d 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -935,23 +935,6 @@ void bli_dgemm_asm_6x8 //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; " \n\t" " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*rs_c % 32 == 0, AND - " \n\t" // cs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // row-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*rs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 
1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -959,10 +942,8 @@ void bli_dgemm_asm_6x8 "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/row-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DROWSTORED \n\t" // jump to row storage case + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "jz .DROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" @@ -1050,63 +1031,51 @@ void bli_dgemm_asm_6x8 ".DROWSTORED: \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t" + "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t" + "vmovups %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm7, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t" + "vmovups %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t" + "vmovups %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm9, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t" + "vmovups 
%%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t" + "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t" + "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t" + "vmovups %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm13, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t" + "vmovups %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t" + "vmovups %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm15, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t" + "vmovups %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" @@ -1116,10 +1085,9 @@ void bli_dgemm_asm_6x8 " \n\t" " \n\t" ".DBETAZERO: \n\t" - " \n\t" // check if aligned/row-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DROWSTORBZ \n\t" // jump to row storage case + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. 
+ "jz .DROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" @@ -1195,38 +1163,38 @@ void bli_dgemm_asm_6x8 ".DROWSTORBZ: \n\t" " \n\t" " \n\t" - "vmovaps %%ymm4, (%%rcx) \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm5, (%%rdx) \n\t" + "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" - "vmovaps %%ymm6, (%%rcx) \n\t" + "vmovups %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm7, (%%rdx) \n\t" + "vmovups %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps %%ymm8, (%%rcx) \n\t" + "vmovups %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm9, (%%rdx) \n\t" + "vmovups %%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps %%ymm10, (%%rcx) \n\t" + "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm11, (%%rdx) \n\t" + "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps %%ymm12, (%%rcx) \n\t" + "vmovups %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm13, (%%rdx) \n\t" + "vmovups %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps %%ymm14, (%%rcx) \n\t" + "vmovups %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm15, (%%rdx) \n\t" + "vmovups %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" From e0d2fa0d835ab49366aeb790363bb2b571d36ed8 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 22 Jul 2016 12:56:51 -0500 Subject: [PATCH 3/7] Relax alignment restrictions for haswell sgemm. 
--- kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 114 +++++++------------ 1 file changed, 41 insertions(+), 73 deletions(-) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index daf625f0d..460a0e270 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -322,23 +322,6 @@ void bli_sgemm_asm_6x16 "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; " \n\t" " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 4*rs_c % 32 == 0, AND - " \n\t" // cs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // row-stored - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (4*rs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -346,10 +329,8 @@ void bli_sgemm_asm_6x16 "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/row-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .SROWSTORED \n\t" // jump to row storage case + "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. 
+ "jz .SROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" @@ -439,63 +420,51 @@ void bli_sgemm_asm_6x16 ".SROWSTORED: \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231ps (%%rcx), %%ymm3, %%ymm4 \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213ps %%ymm5, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231ps (%%rdx), %%ymm3, %%ymm5 \n\t" + "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213ps %%ymm6, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231ps (%%rcx), %%ymm3, %%ymm6 \n\t" + "vmovups %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213ps %%ymm7, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231ps (%%rdx), %%ymm3, %%ymm7 \n\t" + "vmovups %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213ps %%ymm8, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231ps (%%rcx), %%ymm3, %%ymm8 \n\t" + "vmovups %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213ps %%ymm9, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231ps (%%rdx), %%ymm3, %%ymm9 \n\t" + "vmovups %%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231ps (%%rcx), %%ymm3, %%ymm10 \n\t" + "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213ps %%ymm11, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231ps (%%rdx), %%ymm3, %%ymm11 \n\t" + "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213ps %%ymm12, %%ymm3, 
%%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231ps (%%rcx), %%ymm3, %%ymm12 \n\t" + "vmovups %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213ps %%ymm13, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231ps (%%rdx), %%ymm3, %%ymm13 \n\t" + "vmovups %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" + "vfmadd231ps (%%rcx), %%ymm3, %%ymm14 \n\t" + "vmovups %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213ps %%ymm15, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" + "vfmadd231ps (%%rdx), %%ymm3, %%ymm15 \n\t" + "vmovups %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" @@ -505,10 +474,9 @@ void bli_sgemm_asm_6x16 " \n\t" " \n\t" ".SBETAZERO: \n\t" - " \n\t" // check if aligned/row-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .SROWSTORBZ \n\t" // jump to row storage case + " \n\t" + "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. 
+ "jz .SROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" @@ -586,38 +554,38 @@ void bli_sgemm_asm_6x16 ".SROWSTORBZ: \n\t" " \n\t" " \n\t" - "vmovaps %%ymm4, (%%rcx) \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm5, (%%rdx) \n\t" + "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" - "vmovaps %%ymm6, (%%rcx) \n\t" + "vmovups %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm7, (%%rdx) \n\t" + "vmovups %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps %%ymm8, (%%rcx) \n\t" + "vmovups %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm9, (%%rdx) \n\t" + "vmovups %%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps %%ymm10, (%%rcx) \n\t" + "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm11, (%%rdx) \n\t" + "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps %%ymm12, (%%rcx) \n\t" + "vmovups %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm13, (%%rdx) \n\t" + "vmovups %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" - "vmovaps %%ymm14, (%%rcx) \n\t" + "vmovups %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm15, (%%rdx) \n\t" + "vmovups %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" From 08f1d6b6fa344275de0f675f69737145ccf6646a Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 22 Jul 2016 13:44:37 -0500 Subject: [PATCH 4/7] Use 64-bit intermediate variable for k for architectures that do 64-bit loads in case dim_t is 32-bit. 
--- kernels/armv8a/3/bli_gemm_opt_4x4.c | 8 ++++---- kernels/loongson3a/3/bli_gemm_opt_d4x4.c | 4 ++-- kernels/mic/3/bli_dgemm_opt_30x8.c | 8 +++++--- kernels/mic/3/bli_sgemm_opt_30x16.c | 8 +++++--- .../x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c | 4 ++-- kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 4 ++-- kernels/x86_64/penryn/3/bli_gemm_asm_d4x4.c | 8 ++++---- .../x86_64/penryn/3/bli_gemmtrsm_l_asm_d4x4.c | 4 ++-- .../x86_64/penryn/3/bli_gemmtrsm_u_asm_d4x4.c | 4 ++-- kernels/x86_64/piledriver/3/bli_gemm_asm_d8x3.c | 4 ++-- kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c | 16 ++++++++-------- 11 files changed, 38 insertions(+), 34 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_opt_4x4.c b/kernels/armv8a/3/bli_gemm_opt_4x4.c index e010d188f..6199e461c 100644 --- a/kernels/armv8a/3/bli_gemm_opt_4x4.c +++ b/kernels/armv8a/3/bli_gemm_opt_4x4.c @@ -63,8 +63,8 @@ void bli_sgemm_opt_8x12( void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile ( @@ -1112,8 +1112,8 @@ void bli_dgemm_opt_6x8( void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile ( diff --git a/kernels/loongson3a/3/bli_gemm_opt_d4x4.c b/kernels/loongson3a/3/bli_gemm_opt_d4x4.c index a7834c18c..cf4f8e01f 100644 --- a/kernels/loongson3a/3/bli_gemm_opt_d4x4.c +++ b/kernels/loongson3a/3/bli_gemm_opt_d4x4.c @@ -47,8 +47,8 @@ void bli_dgemm_opt_4x4 cntx_t* restrict cntx ) { - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile ( diff --git a/kernels/mic/3/bli_dgemm_opt_30x8.c b/kernels/mic/3/bli_dgemm_opt_30x8.c index 151f56b9a..ab7d9c752 100644 --- a/kernels/mic/3/bli_dgemm_opt_30x8.c +++ b/kernels/mic/3/bli_dgemm_opt_30x8.c @@ -271,6 +271,8 @@ 
void bli_dgemm_asm_30x8 int * offsetPtr = &offsets[0]; + uint64_t k64 = k; + #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif @@ -288,7 +290,7 @@ void bli_dgemm_asm_30x8 vpxord zmm0, zmm0, zmm0 vmovaps zmm1, zmm0 //clear out registers vmovaps zmm2, zmm0 - mov rsi, k //loop index + mov rsi, k64 //loop index vmovaps zmm3, zmm0 mov r11, rs_c //load row stride @@ -312,7 +314,7 @@ void bli_dgemm_asm_30x8 mov rcx, c //load address of c for prefetching vmovaps zmm13, zmm0 vmovaps zmm14, zmm0 - mov r8, k + mov r8, k64 vmovaps zmm15, zmm0 vmovaps zmm16, zmm0 @@ -381,7 +383,7 @@ void bli_dgemm_asm_30x8 //Alternate main loop, with no prefetching of C //Used when <= 40 iterations CONSIDER_UNDER_40: - mov rsi, k + mov rsi, k64 test rsi, rsi je POSTACCUM LOOP_UNDER_40: diff --git a/kernels/mic/3/bli_sgemm_opt_30x16.c b/kernels/mic/3/bli_sgemm_opt_30x16.c index f8eb972bc..8f030653c 100644 --- a/kernels/mic/3/bli_sgemm_opt_30x16.c +++ b/kernels/mic/3/bli_sgemm_opt_30x16.c @@ -271,6 +271,8 @@ void bli_sgemm_asm_30x16 int * offsetPtr = &offsets[0]; + uint64_t k64 = k; + #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif @@ -288,7 +290,7 @@ void bli_sgemm_asm_30x16 vpxord zmm0, zmm0, zmm0 vmovaps zmm1, zmm0 //clear out registers vmovaps zmm2, zmm0 - mov rsi, k //loop index + mov rsi, k64 //loop index vmovaps zmm3, zmm0 mov r11, rs_c //load row stride @@ -312,7 +314,7 @@ void bli_sgemm_asm_30x16 mov rcx, c //load address of c for prefetching vmovaps zmm13, zmm0 vmovaps zmm14, zmm0 - mov r8, k + mov r8, k64 vmovaps zmm15, zmm0 vmovaps zmm16, zmm0 @@ -381,7 +383,7 @@ void bli_sgemm_asm_30x16 //Alternate main loop, with no prefetching of C //Used when <= 40 iterations CONSIDER_UNDER_40: - mov rsi, k + mov rsi, k64 test rsi, rsi je POSTACCUM LOOP_UNDER_40: diff --git a/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c b/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c index dcfc41488..4aad807d2 100644 --- 
a/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c +++ b/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c @@ -97,8 +97,8 @@ void bli_sgemm_asm_8x8_fma4 cntx_t* restrict cntx ) { - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile ( diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index 460a0e270..bffde4c5e 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -80,8 +80,8 @@ void bli_sgemm_asm_6x16 //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile ( diff --git a/kernels/x86_64/penryn/3/bli_gemm_asm_d4x4.c b/kernels/x86_64/penryn/3/bli_gemm_asm_d4x4.c index 5eb0a2f3c..d08c1a4b1 100644 --- a/kernels/x86_64/penryn/3/bli_gemm_asm_d4x4.c +++ b/kernels/x86_64/penryn/3/bli_gemm_asm_d4x4.c @@ -49,8 +49,8 @@ void bli_sgemm_asm_8x4 //void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile ( @@ -851,8 +851,8 @@ void bli_dgemm_asm_4x4 void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile ( diff --git a/kernels/x86_64/penryn/3/bli_gemmtrsm_l_asm_d4x4.c b/kernels/x86_64/penryn/3/bli_gemmtrsm_l_asm_d4x4.c index 576f43400..f96d72325 100644 --- a/kernels/x86_64/penryn/3/bli_gemmtrsm_l_asm_d4x4.c +++ b/kernels/x86_64/penryn/3/bli_gemmtrsm_l_asm_d4x4.c @@ -66,8 +66,8 @@ void bli_dgemmtrsm_l_asm_4x4 { void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ 
volatile ( diff --git a/kernels/x86_64/penryn/3/bli_gemmtrsm_u_asm_d4x4.c b/kernels/x86_64/penryn/3/bli_gemmtrsm_u_asm_d4x4.c index cf0c5a11d..2cfaec0a3 100644 --- a/kernels/x86_64/penryn/3/bli_gemmtrsm_u_asm_d4x4.c +++ b/kernels/x86_64/penryn/3/bli_gemmtrsm_u_asm_d4x4.c @@ -66,8 +66,8 @@ void bli_dgemmtrsm_u_asm_4x4 { void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile diff --git a/kernels/x86_64/piledriver/3/bli_gemm_asm_d8x3.c b/kernels/x86_64/piledriver/3/bli_gemm_asm_d8x3.c index a4e2b9c58..9d0475191 100644 --- a/kernels/x86_64/piledriver/3/bli_gemm_asm_d8x3.c +++ b/kernels/x86_64/piledriver/3/bli_gemm_asm_d8x3.c @@ -52,8 +52,8 @@ void bli_sgemm_asm_16x3 void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 8; - dim_t k_left = k % 8; + uint64_t k_iter = k / 8; + uint64_t k_left = k % 8; __asm__ volatile ( diff --git a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c index 5189403b8..0b017fbcd 100644 --- a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c +++ b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c @@ -52,8 +52,8 @@ void bli_sgemm_asm_8x8 //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile ( @@ -1052,8 +1052,8 @@ void bli_dgemm_asm_8x4 //void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile ( @@ -1739,8 +1739,8 @@ void bli_cgemm_asm_8x4 //void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile 
( @@ -2715,8 +2715,8 @@ void bli_zgemm_asm_4x4 //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile ( From 707a2b7faca137cca7cab7b11a12c44ddaf7ad53 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 22 Jul 2016 13:49:44 -0500 Subject: [PATCH 5/7] Somehow forgot the most important microkernel. --- kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index bffde4c5e..bc06c819b 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -661,8 +661,8 @@ void bli_dgemm_asm_6x8 //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); - dim_t k_iter = k / 4; - dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; __asm__ volatile ( From a017062fdf763037da9d971a028bb07d47aa1c8a Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 22 Jul 2016 17:02:59 -0500 Subject: [PATCH 6/7] Integrated "memory broker" (membrk_t) abstraction. Details: - Integrated a patch originally authored and submitted by Ricardo Magana of HP Enterprise. The changeset inserts use of a new object type, membrk_t, (memory broker) that allows multiple sets of memory pools on, for example, separate NUMA nodes, each of which has a separate memory space. - Added membrk field to cntx_t and defined corresponding accessor macros. - Added membrk field to mem_t object and defined corresponding accessor macros. 
- Created new bli_membrk.c file, which contains the new memory broker API, including: bli_membrk_init(), bli_membrk_finalize() bli_membrk_acquire_[mv](), bli_membrk_release(), bli_membrk_init_pools(), bli_membrk_reinit_pools(), bli_membrk_finalize_pools(), bli_membrk_pool_size() - In bli_mem.c, changed function calls to bli_mem_init_pools() -> bli_membrk_init() bli_mem_reinit_pools() -> bli_membrk_reinit() bli_mem_finalize_pools() -> bli_membrk_finalize() - In bli_packv_init.c, bli_packm_init.c, changed function calls to: bli_mem_acquire_[mv]() -> bli_membrk_acquire_[mv]() bli_mem_release() -> bli_membrk_release() - Added bli_mutex.c and related files to frame/thread. These files define abstract mutexes (locks) and corresponding APIs for pthreads, openmp, or single-threaded execution. This new API is employed within functions such as bli_membrk_acquire_[mv]() and bli_membrk_release(). --- frame/1/packv/bli_packv_init.c | 34 +- frame/1m/packm/bli_packm_cntx.c | 4 + frame/1m/packm/bli_packm_init.c | 53 +-- frame/base/bli_cntx.h | 40 +- frame/base/bli_mem.c | 495 +----------------------- frame/base/bli_mem.h | 39 +- frame/base/bli_membrk.c | 578 +++++++++++++++++++++++++++++ frame/base/bli_membrk.h | 169 +++++++++ frame/include/bli_mem_macro_defs.h | 15 +- frame/include/bli_obj_macro_defs.h | 5 +- frame/include/bli_type_defs.h | 21 +- frame/include/blis.h | 2 + frame/thread/bli_mutex.h | 49 +++ frame/thread/bli_mutex_openmp.h | 72 ++++ frame/thread/bli_mutex_pthreads.h | 72 ++++ frame/thread/bli_mutex_single.h | 65 ++++ frame/thread/bli_thread.h | 4 + 17 files changed, 1156 insertions(+), 561 deletions(-) create mode 100644 frame/base/bli_membrk.c create mode 100644 frame/base/bli_membrk.h create mode 100644 frame/thread/bli_mutex.h create mode 100644 frame/thread/bli_mutex_openmp.h create mode 100644 frame/thread/bli_mutex_pthreads.h create mode 100644 frame/thread/bli_mutex_single.h diff --git a/frame/1/packv/bli_packv_init.c b/frame/1/packv/bli_packv_init.c index 
5d8a10b98..c43931272 100644 --- a/frame/1/packv/bli_packv_init.c +++ b/frame/1/packv/bli_packv_init.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -43,6 +44,7 @@ void bli_packv_init ) { // The purpose of packm_init() is to initialize an object P so that + // a source object A can be packed into P via one of the packv // implementations. This initialization includes acquiring a suitable // block of memory from the memory allocator, if such a block of memory @@ -132,15 +134,17 @@ void bli_packv_init_pack cntx_t* cntx ) { - num_t dt = bli_obj_datatype( *c ); - dim_t dim_c = bli_obj_vector_dim( *c ); - dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx ); + num_t dt = bli_obj_datatype( *c ); + dim_t dim_c = bli_obj_vector_dim( *c ); + dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx ); - mem_t* mem_p; - dim_t m_p_pad; - siz_t size_p; - inc_t rs_p, cs_p; - void* buf; + membrk_t* membrk = bli_cntx_membrk( cntx ); + + mem_t* mem_p; + dim_t m_p_pad; + siz_t size_p; + inc_t rs_p, cs_p; + void* buf; // We begin by copying the basic fields of c. @@ -170,8 +174,9 @@ void bli_packv_init_pack { // If the mem_t object of p has not yet been allocated, then acquire // a memory block suitable for a vector. - bli_mem_acquire_v( size_p, - mem_p ); + bli_membrk_acquire_v( membrk, + size_p, + mem_p ); } else { @@ -179,10 +184,11 @@ void bli_packv_init_pack // re-acquire the memory so there is sufficient space. 
if ( bli_mem_size( mem_p ) < size_p ) { - bli_mem_release( mem_p ); + bli_membrk_release( mem_p ); - bli_mem_acquire_v( size_p, - mem_p ); + bli_membrk_acquire_v( membrk, + size_p, + mem_p ); } } @@ -218,7 +224,7 @@ void bli_packv_release ) { if ( !bli_cntl_is_noop( cntl ) ) - bli_obj_release_pack( p ); + bli_obj_release_pack( p ); } diff --git a/frame/1m/packm/bli_packm_cntx.c b/frame/1m/packm/bli_packm_cntx.c index 787531f41..d42abfd62 100644 --- a/frame/1m/packm/bli_packm_cntx.c +++ b/frame/1m/packm/bli_packm_cntx.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -49,6 +50,9 @@ void bli_packm_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx ); bli_gks_cntx_set_l1v_ker( BLIS_SCAL2V_KER, cntx ); bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx ); + + // Initialize the context with the global membrk object. + bli_cntx_set_membrk( bli_mem_global_membrk(), cntx ); } void bli_packm_cntx_finalize( cntx_t* cntx ) diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index cb6f28fe2..c33a0410e 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -202,23 +203,25 @@ void bli_packm_init_pack( invdiag_t invert_diag, obj_t* p, cntx_t* cntx ) { - num_t dt = bli_obj_datatype( *c ); - trans_t transc = bli_obj_onlytrans_status( *c ); - dim_t m_c = bli_obj_length( *c ); - dim_t n_c = bli_obj_width( *c ); - dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ); - dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ); - dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx ); - dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx ); + num_t dt = bli_obj_datatype( *c ); + trans_t transc = bli_obj_onlytrans_status( *c ); + dim_t m_c = bli_obj_length( *c ); + dim_t n_c = bli_obj_width( *c ); + dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ); + dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ); + dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx ); + dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx ); - mem_t* mem_p; - dim_t m_p, n_p; - dim_t m_p_pad, n_p_pad; - siz_t size_p; - siz_t elem_size_p; - inc_t rs_p, cs_p; - inc_t is_p; - void* buf; + membrk_t* membrk = bli_cntx_get_membrk( cntx ); + + mem_t* mem_p; + dim_t m_p, n_p; + dim_t m_p_pad, n_p_pad; + siz_t size_p; + siz_t elem_size_p; + inc_t rs_p, cs_p; + inc_t is_p; + void* buf; // We begin by copying the basic fields of c. We do NOT copy the @@ -549,9 +552,10 @@ void bli_packm_init_pack( invdiag_t invert_diag, { // If the mem_t object of p has not yet been allocated, then acquire // a memory block of type pack_buf_type. 
- bli_mem_acquire_m( size_p, - pack_buf_type, - mem_p ); + bli_membrk_acquire_m( membrk, + size_p, + pack_buf_type, + mem_p ); } else { @@ -562,10 +566,11 @@ void bli_packm_init_pack( invdiag_t invert_diag, // pack_buf_type value. if ( bli_mem_size( mem_p ) < size_p ) { - bli_mem_release( mem_p ); - bli_mem_acquire_m( size_p, - pack_buf_type, - mem_p ); + bli_membrk_release( mem_p ); + bli_membrk_acquire_m( membrk, + size_p, + pack_buf_type, + mem_p ); } } @@ -582,7 +587,7 @@ void bli_packm_release( obj_t* p, packm_t* cntl ) { if ( !bli_cntl_is_noop( cntl ) ) - bli_obj_release_pack( p ); + bli_obj_release_pack( p ); } diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 5635ddc88..337d233b3 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -57,6 +58,7 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + membrk_t* membrk; } cntx_t; */ @@ -116,66 +118,75 @@ typedef struct cntx_s \ ( (cntx)->schema_c ) +#define bli_cntx_membrk( cntx ) \ +\ + ( (cntx)->membrk ) + // cntx_t modification (fields only) #define bli_cntx_set_blkszs_buf( _blkszs, cntx_p ) \ { \ - (cntx_p)->blkszs = _blkszs; \ + (cntx_p)->blkszs = _blkszs; \ } #define bli_cntx_set_bmults_buf( _bmults, cntx_p ) \ { \ - (cntx_p)->bmults = _bmults; \ + (cntx_p)->bmults = _bmults; \ } #define bli_cntx_set_l3_vir_ukrs_buf( _l3_vir_ukrs, cntx_p ) \ { \ - (cntx_p)->l3_vir_ukrs = _l3_vir_ukrs; \ + (cntx_p)->l3_vir_ukrs = _l3_vir_ukrs; \ } #define bli_cntx_set_l3_nat_ukrs_buf( _l3_nat_ukrs, cntx_p ) \ { \ - (cntx_p)->l3_nat_ukrs = _l3_nat_ukrs; \ + (cntx_p)->l3_nat_ukrs = _l3_nat_ukrs; \ } #define bli_cntx_set_l3_nat_ukrs_prefs_buf( _l3_nat_ukrs_prefs, cntx_p ) \ { \ - (cntx_p)->l3_nat_ukrs_prefs = 
_l3_nat_ukrs_prefs; \ + (cntx_p)->l3_nat_ukrs_prefs = _l3_nat_ukrs_prefs; \ } #define bli_cntx_set_l1f_kers_buf( _l1f_kers, cntx_p ) \ { \ - (cntx_p)->l1f_kers = _l1f_kers; \ + (cntx_p)->l1f_kers = _l1f_kers; \ } #define bli_cntx_set_l1v_kers_buf( _l1v_kers, cntx_p ) \ { \ - (cntx_p)->l1v_kers = _l1v_kers; \ + (cntx_p)->l1v_kers = _l1v_kers; \ } #define bli_cntx_set_packm_ukrs( _packm_ukrs, cntx_p ) \ { \ - (cntx_p)->packm_ukrs = _packm_ukrs; \ + (cntx_p)->packm_ukrs = _packm_ukrs; \ } #define bli_cntx_set_method( _method, cntx_p ) \ { \ - (cntx_p)->method = _method; \ + (cntx_p)->method = _method; \ } #define bli_cntx_set_schema_a( _schema_a, cntx_p ) \ { \ - (cntx_p)->schema_a = _schema_a; \ + (cntx_p)->schema_a = _schema_a; \ } #define bli_cntx_set_schema_b( _schema_b, cntx_p ) \ { \ - (cntx_p)->schema_b = _schema_b; \ + (cntx_p)->schema_b = _schema_b; \ } #define bli_cntx_set_schema_c( _schema_c, cntx_p ) \ { \ - (cntx_p)->schema_c = _schema_c; \ + (cntx_p)->schema_c = _schema_c; \ +} + +#define bli_cntx_set_membrk( _membrk, cntx_p ) \ +{ \ + (cntx_p)->membrk = _membrk; \ } // cntx_t query (complex) @@ -264,6 +275,11 @@ typedef struct cntx_s \ bli_cntx_schema_b( cntx ) +#define bli_cntx_get_membrk( cntx ) \ +\ + bli_cntx_membrk( cntx ) + + // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_mem.c b/frame/base/bli_mem.c index 25530b1ed..83b936aae 100644 --- a/frame/base/bli_mem.c +++ b/frame/base/bli_mem.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,207 +39,15 @@ pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; #endif -// Declare one memory pool structure for each block size/shape we want to -// be able to allocate. 
-static pool_t pools[3]; - - +static membrk_t global_membrk; // ----------------------------------------------------------------------------- -void bli_mem_acquire_m( siz_t req_size, - packbuf_t buf_type, - mem_t* mem ) +membrk_t* bli_mem_global_membrk( void ) { - pool_t* pool; - pblk_t* pblk; - dim_t pi; - siz_t block_size; - - // Make sure the API is initialized. - bli_mem_init(); - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) - { - // For general-use buffer requests, such as those used by level-2 - // operations, dynamically allocating memory is sufficient. - void* buf_sys = bli_malloc_pool( req_size ); - - // Initialize the mem_t object with: - // - the address of the memory block, - // - the buffer type (a packbuf_t value), and - // - the size of the requested region. - // NOTE: We do not initialize the pool field since this block did not - // come from a memory pool. - bli_mem_set_buffer( buf_sys, mem ); - bli_mem_set_buf_sys( buf_sys, mem ); - bli_mem_set_buf_type( buf_type, mem ); - bli_mem_set_size( req_size, mem ); - } - else - { - // This branch handles cases where the memory block needs to come - // from an internal memory pool, in which blocks are allocated once - // and then recycled. - - // Map the requested packed buffer type to a zero-based index, which - // we then use to select the corresponding memory pool. - pi = bli_packbuf_index( buf_type ); - pool = &pools[ pi ]; - - // Unconditionally perform error checking on the memory pool. - { - err_t e_val; - - // Make sure that the requested matrix size fits inside of a block - // of the corresponding pool. If it does not, the pool was somehow - // initialized improperly. - e_val = bli_check_requested_block_size_for_pool( req_size, pool ); - bli_check_error_code( e_val ); - } - - // Extract the address of the pblk_t struct within the mem_t. 
- pblk = bli_mem_pblk( mem ); - -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - // BEGIN CRITICAL SECTION - { - - // Checkout a block from the pool. If the pool is exhausted, - // either because it is still empty or because all blocks have - // been checked out already, additional blocks will be allocated - // automatically, as-needed. Note that the addresses are stored - // directly into the mem_t struct since pblk is the address of - // the struct's pblk_t field. - bli_pool_checkout_block( pblk, pool ); - - // Query the size of the blocks in the pool so we can store it in - // the mem_t object. At this point, it is guaranteed to be at - // least as large as req_size. (NOTE: We must perform the query - // within the critical section to ensure that the pool hasn't - // changed, as unlikely as that would be.) - block_size = bli_pool_block_size( pool ); - - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - - // Initialize the mem_t object with: - // - the buffer type (a packbuf_t value), - // - the address of the memory pool to which it belongs, and - // - the size of the contiguous memory block (NOT the size of the - // requested region). - // The actual addresses (system and aligned) are already stored in - // the mem_t struct's pblk_t field - bli_mem_set_buf_type( buf_type, mem ); - bli_mem_set_pool( pool, mem ); - bli_mem_set_size( block_size, mem ); - } + return &global_membrk; } - -void bli_mem_release( mem_t* mem ) -{ - packbuf_t buf_type; - pool_t* pool; - pblk_t* pblk; - siz_t block_size_cur; - siz_t block_size_prev; - - // Make sure the API is initialized. - bli_mem_init(); - - // Extract the buffer type so we know what kind of memory was allocated. 
- buf_type = bli_mem_buf_type( mem ); - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) - { - void* buf_sys = bli_mem_buf_sys( mem ); - - // For general-use buffers, we dynamically allocate memory, and so - // here we need to free. - bli_free_pool( buf_sys ); - } - else - { - // Extract the address of the pool from which the memory was - // allocated. - pool = bli_mem_pool( mem ); - - // Extract the address of the pblk_t struct within the mem_t struct. - pblk = bli_mem_pblk( mem ); - - // Query the size of the blocks that were in the pool at the time - // the pblk_t was checked out. (This is used below, in the critical - // section.) - block_size_prev = bli_mem_size( mem ); - -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - - // BEGIN CRITICAL SECTION - { - - // Query the size of the blocks currently in the pool. - block_size_cur = bli_pool_block_size( pool ); - - // If the block size of the pool has changed since the pblk_t - // was checked out, then we need to free the pblk_t rather - // than check it back in. Why? Because the pool's block size - // has (most likely) increased to meet changing needs (example: - // larger cache blocksizes). Thus, the current pblk_t's smaller - // allocated size is of no use anymore. - if ( block_size_cur != block_size_prev ) - { - // Free the pblk_t using the appropriate function in the - // pool API. - bli_pool_free_block( pblk ); - } - else - { - // Check the block back into the pool. - bli_pool_checkin_block( pblk, pool ); - } - - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - } - - // Clear the mem_t object so that it appears unallocated. This clears: - // - the pblk_t struct's fields (ie: the buffer addresses) - // - the pool field - // - the size field - // NOTE: We do not clear the buf_type field since there is no - // "uninitialized" value for packbuf_t. 
- bli_mem_clear( mem ); -} - - -void bli_mem_acquire_v( siz_t req_size, - mem_t* mem ) -{ - bli_mem_acquire_m( req_size, - BLIS_BUFFER_FOR_GEN_USE, - mem ); -} - - siz_t bli_mem_pool_size( packbuf_t buf_type ) { siz_t r_val; @@ -251,15 +60,15 @@ siz_t bli_mem_pool_size( packbuf_t buf_type ) } else { - dim_t index; + dim_t pool_index; pool_t* pool; // Acquire the pointer to the pool corresponding to the buf_type // provided. - index = bli_packbuf_index( buf_type ); - pool = &(pools[index]); + pool_index = bli_packbuf_index( buf_type ); + pool = bli_membrk_pool( pool_index, &global_membrk ); - // Compute the pool "size" as the product of the block size + // Compute the pool "size" as the product of the block size // and the number of blocks in the pool. r_val = bli_pool_block_size( pool ) * bli_pool_num_blocks( pool ); @@ -300,8 +109,8 @@ void bli_mem_init( void ) // critical section. if ( bli_mem_is_init == FALSE ) { - // Initialize the memory pools. - bli_mem_init_pools( &cntx ); + // Initialize the global membrk_t object and its memory pools. + bli_membrk_init( &cntx, &global_membrk ); // After initialization, mark the API as initialized. bli_mem_is_init = TRUE; @@ -332,16 +141,16 @@ void bli_mem_reinit( cntx_t* cntx ) // initialized (unlikely), we emulate the body of bli_mem_init(). if ( bli_mem_is_init == FALSE ) { - // Initialize the memory pools. - bli_mem_init_pools( cntx ); + // Initialize the global membrk_t object and its memory pools. + bli_membrk_init( cntx, &global_membrk ); // After initialization, mark the API as initialized. bli_mem_is_init = TRUE; } else { - // Reinitialize the memory pools. - bli_mem_reinit_pools( cntx ); + // Reinitialize the global membrk_t object's memory pools. + bli_membrk_reinit_pools( cntx, &global_membrk ); } } // END CRITICAL SECTION @@ -373,8 +182,8 @@ void bli_mem_finalize( void ) // critical section. if ( bli_mem_is_init == TRUE ) { - // Finalize the memory pools. 
- bli_mem_finalize_pools(); + // Finalize the global membrk_t object and its memory pools. + bli_membrk_finalize( &global_membrk ); // After finalization, mark the API as uninitialized. bli_mem_is_init = FALSE; @@ -392,275 +201,3 @@ bool_t bli_mem_is_initialized( void ) return bli_mem_is_init; } -// ----------------------------------------------------------------------------- - -void bli_mem_init_pools( cntx_t* cntx ) -{ - // Map each of the packbuf_t values to an index starting at zero. - const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); - const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); - const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); - - const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; - - // Alias the pool addresses to convenient identifiers. - pool_t* pool_a = &pools[ index_a ]; - pool_t* pool_b = &pools[ index_b ]; - pool_t* pool_c = &pools[ index_c ]; - - // Start with empty pools. - const dim_t num_blocks_a = 0; - const dim_t num_blocks_b = 0; - const dim_t num_blocks_c = 0; - - siz_t block_size_a = 0; - siz_t block_size_b = 0; - siz_t block_size_c = 0; - - // Determine the block size for each memory pool. - bli_mem_compute_pool_block_sizes( &block_size_a, - &block_size_b, - &block_size_c, - cntx ); - - // Initialize the memory pools for A, B, and C. - bli_pool_init( num_blocks_a, block_size_a, align_size, pool_a ); - bli_pool_init( num_blocks_b, block_size_b, align_size, pool_b ); - bli_pool_init( num_blocks_c, block_size_c, align_size, pool_c ); -} - -void bli_mem_reinit_pools( cntx_t* cntx ) -{ - // Map each of the packbuf_t values to an index starting at zero. - const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); - const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); - const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); - - const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; - - // Alias the pool addresses to convenient identifiers. 
- pool_t* pool_a = &pools[ index_a ]; - pool_t* pool_b = &pools[ index_b ]; - pool_t* pool_c = &pools[ index_c ]; - - // Query the number of blocks currently allocated in each pool. - const dim_t num_blocks_a = bli_pool_num_blocks( pool_a ); - const dim_t num_blocks_b = bli_pool_num_blocks( pool_b ); - const dim_t num_blocks_c = bli_pool_num_blocks( pool_c ); - - siz_t block_size_a_new = 0; - siz_t block_size_b_new = 0; - siz_t block_size_c_new = 0; - - // Determine the context-implied block size needed for each pool. - bli_mem_compute_pool_block_sizes( &block_size_a_new, - &block_size_b_new, - &block_size_c_new, - cntx ); - - // Reinitialize the pool, but only if one of the parameters has - // changed in such a way that reinitialization would be required. - // In this case, the align_size is constant, as is num_blocks, so - // what this actually boils down to is that reinitialization of a - // pool occurs only if the block size for that pool has increased. - bli_pool_reinit_if( num_blocks_a, block_size_a_new, align_size, pool_a ); - bli_pool_reinit_if( num_blocks_b, block_size_b_new, align_size, pool_b ); - bli_pool_reinit_if( num_blocks_c, block_size_c_new, align_size, pool_c ); -} - -void bli_mem_finalize_pools( void ) -{ - // Map each of the packbuf_t values to an index starting at zero. - dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); - dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); - dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); - - // Alias the pool addresses to convenient identifiers. - pool_t* pool_a = &pools[ index_a ]; - pool_t* pool_b = &pools[ index_b ]; - pool_t* pool_c = &pools[ index_c ]; - - // Finalize the memory pools for A, B, and C. 
- bli_pool_finalize( pool_a ); - bli_pool_finalize( pool_b ); - bli_pool_finalize( pool_c ); -} - -// ----------------------------------------------------------------------------- - -void bli_mem_compute_pool_block_sizes( siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ) -{ - const ind_t im = bli_cntx_get_ind_method( cntx ); - - siz_t bs_cand_a = 0; - siz_t bs_cand_b = 0; - siz_t bs_cand_c = 0; - - num_t dt; - - // Compute pool block sizes for each datatype and find the maximum - // size for each pool. This is done so that new pools do not need - // to be allocated if the user switches datatypes. - for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) - { - siz_t bs_dt_a; - siz_t bs_dt_b; - siz_t bs_dt_c; - - // Avoid considering induced methods for real datatypes. - if ( bli_is_real( dt ) && im != BLIS_NAT ) continue; - - bli_mem_compute_pool_block_sizes_dt( dt, - &bs_dt_a, - &bs_dt_b, - &bs_dt_c, - cntx ); - - bs_cand_a = bli_max( bs_dt_a, bs_cand_a ); - bs_cand_b = bli_max( bs_dt_b, bs_cand_b ); - bs_cand_c = bli_max( bs_dt_c, bs_cand_c ); - } - - // Save the results. - *bs_a = bs_cand_a; - *bs_b = bs_cand_b; - *bs_c = bs_cand_c; -} - -// ----------------------------------------------------------------------------- - -void bli_mem_compute_pool_block_sizes_dt( num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ) -{ - siz_t size_dt = bli_datatype_size( dt ); - - blksz_t* mr; - blksz_t* nr; - - blksz_t* mc; - blksz_t* kc; - blksz_t* nc; - - dim_t mr_dt; - dim_t nr_dt; - dim_t max_mnr_dt; - - dim_t mc_max_dt; - dim_t kc_max_dt; - dim_t nc_max_dt; - - dim_t packmr_dt; - dim_t packnr_dt; - dim_t max_packmnr_dt; - - dim_t scale_num_dt; - dim_t scale_den_dt; - - dim_t pool_mc_dt, left_mc_dt; - dim_t pool_nc_dt, left_nc_dt; - dim_t pool_kc_dt; - - // - // Find the larger of the two register blocksizes. - // - - // Query the mr and nr blksz_t objects for the given method of - // execution. 
- mr = bli_cntx_get_blksz( BLIS_MR, cntx ); - nr = bli_cntx_get_blksz( BLIS_NR, cntx ); - - // Extract the mr and nr values specific to the current datatype. - mr_dt = bli_blksz_get_def( dt, mr ); - nr_dt = bli_blksz_get_def( dt, nr ); - - // Find the maximum of mr and nr. - max_mnr_dt = bli_max( mr_dt, nr_dt ); - - // - // Define local maximum cache blocksizes. - // - - // Query the mc, kc, and nc blksz_t objects for native execution. - mc = bli_cntx_get_blksz( BLIS_MC, cntx ); - kc = bli_cntx_get_blksz( BLIS_KC, cntx ); - nc = bli_cntx_get_blksz( BLIS_NC, cntx ); - - // Extract the maximum mc, kc, and nc values specific to the current - // datatype. - mc_max_dt = bli_blksz_get_max( dt, mc ); - kc_max_dt = bli_blksz_get_max( dt, kc ); - nc_max_dt = bli_blksz_get_max( dt, nc ); - - // Add max(mr,nr) to kc to make room for the nudging of kc at - // runtime to be a multiple of mr or nr for triangular operations - // trmm, trmm3, and trsm. - kc_max_dt += max_mnr_dt; - - // - // Compute scaling factors. - // - - // Compute integer scaling factors (numerator and denominator) used - // to account for situations when the packing register blocksizes are - // larger than the regular register blocksizes. - - // In order to compute the scaling factors, we first have to determine - // whether ( packmr / mr ) is greater than ( packnr / nr ). This is - // needed ONLY because the amount of space allocated for a block of A - // and a panel of B needs to be such that MR and NR can be swapped (ie: - // A is packed with NR and B is packed with MR). This transformation is - // needed for right-side trsm when inducing an algorithm that (a) has - // favorable access patterns for column-stored C and (b) allows the - // macro-kernel to reuse the existing left-side fused gemmtrsm micro- - // kernels. 
We avoid integer division by cross-multiplying: - // - // ( packmr / mr ) >= ( packnr / nr ) - // ( packmr / mr ) * nr >= packnr - // packmr * nr >= packnr * mr - // - // So, if packmr * nr >= packnr * mr, then we will use packmr and mr as - // our scaling factors. Otherwise, we'll use packnr and nr. - - packmr_dt = bli_blksz_get_max( dt, mr ); - packnr_dt = bli_blksz_get_max( dt, nr ); - - if ( packmr_dt * nr_dt >= - packnr_dt * mr_dt ) { scale_num_dt = packmr_dt; - scale_den_dt = mr_dt; } - else { scale_num_dt = packnr_dt; - scale_den_dt = nr_dt; } - - // - // Compute pool block dimensions. - // - - pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt; - left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt; - - pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt; - left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt; - - pool_kc_dt = ( kc_max_dt ); - - if ( left_mc_dt > 0 ) pool_mc_dt += 1; - if ( left_nc_dt > 0 ) pool_nc_dt += 1; - - // - // Compute pool block sizes - // - - // We add an extra micro-panel of space to the block sizes for A and B - // just to be sure any pre-loading performed by the micro-kernel does - // not cause a segmentation fault. - max_packmnr_dt = bli_max( packmr_dt, packnr_dt ); - - *bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; - *bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; - *bs_c = ( pool_mc_dt ) * pool_nc_dt * size_dt; -} diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h index 8d6d71501..9ef741934 100644 --- a/frame/base/bli_mem.h +++ b/frame/base/bli_mem.h @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,37 +33,21 @@ */ +#ifndef BLIS_MEM_H +#define BLIS_MEM_H + +// ----------------------------------------------------------------------------- + +membrk_t* bli_mem_global_membrk( void ); +siz_t bli_mem_pool_size( packbuf_t buf_type ); + +// ----------------------------------------------------------------------------- + void bli_mem_init( void ); void bli_mem_reinit( cntx_t* cntx ); void bli_mem_finalize( void ); bool_t bli_mem_is_initialized( void ); -// ----------------------------------------------------------------------------- -void bli_mem_acquire_m( siz_t req_size, - packbuf_t buf_type, - mem_t* mem ); - -void bli_mem_acquire_v( siz_t req_size, - mem_t* mem ); - -void bli_mem_release( mem_t* mem ); - -siz_t bli_mem_pool_size( packbuf_t buf_type ); - -// ----------------------------------------------------------------------------- - -void bli_mem_init_pools( cntx_t* cntx ); -void bli_mem_reinit_pools( cntx_t* cntx ); -void bli_mem_finalize_pools( void ); - -void bli_mem_compute_pool_block_sizes( siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ); -void bli_mem_compute_pool_block_sizes_dt( num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ); +#endif diff --git a/frame/base/bli_membrk.c b/frame/base/bli_membrk.c new file mode 100644 index 000000000..33a998de1 --- /dev/null +++ b/frame/base/bli_membrk.c @@ -0,0 +1,578 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_membrk_init + ( + cntx_t* cntx, + membrk_t* membrk + ) +{ + bli_mutex_init( bli_membrk_mutex( membrk ) ); + bli_membrk_init_pools( cntx, membrk ); + bli_membrk_set_malloc_fp( bli_malloc_pool, membrk ); +} + +void bli_membrk_finalize + ( + membrk_t* membrk + ) +{ + bli_membrk_set_malloc_fp( NULL, membrk ); + bli_membrk_finalize_pools( membrk ); + bli_mutex_finalize( bli_membrk_mutex( membrk ) ); +} + +void bli_membrk_acquire_m + ( + membrk_t* membrk, + siz_t req_size, + packbuf_t buf_type, + mem_t* mem + ) +{ + pool_t* pool; + pblk_t* pblk; + dim_t pi; + siz_t block_size; + + // Make sure the API is initialized. + //assert( membrk ); //?? + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + // For general-use buffer requests, such as those used by level-2 + // operations, dynamically allocating memory is sufficient. + // Note that we use the malloc()-style memory allocation function + // that is stored in the membrk_t object. + void* buf_sys = bli_membrk_malloc( req_size, membrk ); + + // Initialize the mem_t object with: + // - the address of the memory block, + // - the buffer type (a packbuf_t value), + // - the size of the requested region, + // - the membrk_t from which the mem_t entry was acquired. + // NOTE: We do not initialize the pool field since this block did not + // come from a memory pool. + bli_mem_set_buffer( buf_sys, mem ); + bli_mem_set_buf_sys( buf_sys, mem ); + bli_mem_set_buf_type( buf_type, mem ); + bli_mem_set_size( req_size, mem ); + bli_mem_set_membrk( membrk, mem ); + } + else + { + // This branch handles cases where the memory block needs to come + // from an internal memory pool, in which blocks are allocated once + // and then recycled. + + // Map the requested packed buffer type to a zero-based index, which + // we then use to select the corresponding memory pool. 
+ pi = bli_packbuf_index( buf_type ); + pool = bli_membrk_pool( pi, membrk ); + + // Unconditionally perform error checking on the memory pool. + { + err_t e_val; + + // Make sure that the requested matrix size fits inside of a block + // of the corresponding pool. If it does not, the pool was somehow + // initialized improperly. + e_val = bli_check_requested_block_size_for_pool( req_size, pool ); + bli_check_error_code( e_val ); + } + + // Extract the address of the pblk_t struct within the mem_t. + pblk = bli_mem_pblk( mem ); + + // BEGIN CRITICAL SECTION + bli_membrk_lock( membrk ); + { + + // Checkout a block from the pool. If the pool is exhausted, + // either because it is still empty or because all blocks have + // been checked out already, additional blocks will be allocated + // automatically, as-needed. Note that the addresses are stored + // directly into the mem_t struct since pblk is the address of + // the struct's pblk_t field. + bli_pool_checkout_block( pblk, pool ); + + // Query the size of the blocks in the pool so we can store it in + // the mem_t object. At this point, it is guaranteed to be at + // least as large as req_size. (NOTE: We must perform the query + // within the critical section to ensure that the pool hasn't + // changed, as unlikely as that would be.) + block_size = bli_pool_block_size( pool ); + + } + bli_membrk_unlock( membrk ); + // END CRITICAL SECTION + + // Initialize the mem_t object with: + // - the buffer type (a packbuf_t value), + // - the address of the memory pool to which it belongs, + // - the size of the contiguous memory block (NOT the size of the + // requested region), + // - the membrk_t from which the mem_t entry was acquired. 
+ // The actual addresses (system and aligned) are already stored in + // the mem_t struct's pblk_t field + bli_mem_set_buf_type( buf_type, mem ); + bli_mem_set_pool( pool, mem ); + bli_mem_set_size( block_size, mem ); + bli_mem_set_membrk( membrk, mem ); + } +} + + +void bli_membrk_release + ( + mem_t* mem + ) +{ + packbuf_t buf_type; + pool_t* pool; + pblk_t* pblk; + siz_t block_size_cur; + siz_t block_size_prev; + membrk_t* membrk; + + // Extract the membrk_t address from the mem_t object. + membrk = bli_mem_membrk( mem ); + + // Extract the buffer type so we know what kind of memory was allocated. + buf_type = bli_mem_buf_type( mem ); + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + void* buf_sys = bli_mem_buf_sys( mem ); + + // For general-use buffers, we dynamically allocate memory, and so + // here we need to free. + // Note that we use the free()-style memory release function that + // is stored in the membrk_t object. + bli_membrk_free( buf_sys, membrk ); + } + else + { + // Extract the address of the pool from which the memory was + // allocated. + pool = bli_mem_pool( mem ); + + // Extract the address of the pblk_t struct within the mem_t struct. + pblk = bli_mem_pblk( mem ); + + // Query the size of the blocks that were in the pool at the time + // the pblk_t was checked out. (This is used below, in the critical + // section.) + block_size_prev = bli_mem_size( mem ); + + // BEGIN CRITICAL SECTION + bli_membrk_lock( membrk ); + { + + // Query the size of the blocks currently in the pool. + block_size_cur = bli_pool_block_size( pool ); + + // If the block size of the pool has changed since the pblk_t + // was checked out, then we need to free the pblk_t rather + // than check it back in. Why? Because the pool's block size + // has (most likely) increased to meet changing needs (example: + // larger cache blocksizes). Thus, the current pblk_t's smaller + // allocated size is of no use anymore. 
+ if ( block_size_cur != block_size_prev ) + { + // Free the pblk_t using the appropriate function in the + // pool API. + bli_pool_free_block( pblk ); + } + else + { + // Check the block back into the pool. + bli_pool_checkin_block( pblk, pool ); + } + + } + bli_membrk_unlock( membrk ); + // END CRITICAL SECTION + } + + // Clear the mem_t object so that it appears unallocated. This clears: + // - the pblk_t struct's fields (ie: the buffer addresses) + // - the pool field + // - the size field + // - the membrk field + // NOTE: We do not clear the buf_type field since there is no + // "uninitialized" value for packbuf_t. + bli_mem_clear( mem ); +} + + +void bli_membrk_acquire_v + ( + membrk_t* membrk, + siz_t req_size, + mem_t* mem + ) +{ + bli_membrk_acquire_m( membrk, + req_size, + BLIS_BUFFER_FOR_GEN_USE, + mem ); +} + + +siz_t bli_membrk_pool_size + ( + membrk_t* membrk, + packbuf_t buf_type + ) +{ + siz_t r_val; + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + // We don't (yet) track the amount of general-purpose + // memory that is currently allocated. + r_val = 0; + } + else + { + dim_t pool_index; + pool_t* pool; + + // Acquire the pointer to the pool corresponding to the buf_type + // provided. + pool_index = bli_packbuf_index( buf_type ); + pool = bli_membrk_pool( pool_index, membrk ); + + // Compute the pool "size" as the product of the block size + // and the number of blocks in the pool. + r_val = bli_pool_block_size( pool ) * + bli_pool_num_blocks( pool ); + } + + return r_val; +} + +// ----------------------------------------------------------------------------- + +void bli_membrk_init_pools + ( + cntx_t* cntx, + membrk_t* membrk + ) +{ + // Map each of the packbuf_t values to an index starting at zero. 
+	const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
+	const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
+	const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
+
+	const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
+
+	// Alias the pool addresses to convenient identifiers.
+	pool_t* pool_a = bli_membrk_pool( index_a, membrk );
+	pool_t* pool_b = bli_membrk_pool( index_b, membrk );
+	pool_t* pool_c = bli_membrk_pool( index_c, membrk );
+
+	// Start with empty pools.
+	const dim_t num_blocks_a = 0;
+	const dim_t num_blocks_b = 0;
+	const dim_t num_blocks_c = 0;
+
+	siz_t block_size_a = 0;
+	siz_t block_size_b = 0;
+	siz_t block_size_c = 0;
+
+	// Determine the block size for each memory pool.
+	bli_membrk_compute_pool_block_sizes( &block_size_a,
+	                                     &block_size_b,
+	                                     &block_size_c,
+	                                     cntx );
+
+	// Initialize the memory pools for A, B, and C.
+	bli_pool_init( num_blocks_a, block_size_a, align_size, pool_a );
+	bli_pool_init( num_blocks_b, block_size_b, align_size, pool_b );
+	bli_pool_init( num_blocks_c, block_size_c, align_size, pool_c );
+}
+
+void bli_membrk_reinit_pools
+     (
+       cntx_t*   cntx,
+       membrk_t* membrk
+     )
+{
+	// Map each of the packbuf_t values to an index starting at zero.
+	const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
+	const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
+	const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
+
+	const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
+
+	// Alias the pool addresses to convenient identifiers.
+	pool_t* pool_a = bli_membrk_pool( index_a, membrk );
+	pool_t* pool_b = bli_membrk_pool( index_b, membrk );
+	pool_t* pool_c = bli_membrk_pool( index_c, membrk );
+
+	// Query the number of blocks currently allocated in each pool.
+	const dim_t num_blocks_a = bli_pool_num_blocks( pool_a );
+	const dim_t num_blocks_b = bli_pool_num_blocks( pool_b );
+	const dim_t num_blocks_c = bli_pool_num_blocks( pool_c );
+
+	siz_t block_size_a_new = 0;
+	siz_t block_size_b_new = 0;
+	siz_t block_size_c_new = 0;
+
+	// Determine the context-implied block size needed for each pool.
+	bli_membrk_compute_pool_block_sizes( &block_size_a_new,
+	                                     &block_size_b_new,
+	                                     &block_size_c_new,
+	                                     cntx );
+
+	// Reinitialize each pool, but only if one of the parameters has
+	// changed in such a way that reinitialization would be required.
+	// In this case, the align_size is constant, as is num_blocks, so
+	// what this actually boils down to is that reinitialization of a
+	// pool occurs only if the block size for that pool has increased.
+	bli_pool_reinit_if( num_blocks_a, block_size_a_new, align_size, pool_a );
+	bli_pool_reinit_if( num_blocks_b, block_size_b_new, align_size, pool_b );
+	bli_pool_reinit_if( num_blocks_c, block_size_c_new, align_size, pool_c );
+}
+
+void bli_membrk_finalize_pools
+     (
+       membrk_t* membrk
+     )
+{
+	// Map each of the packbuf_t values to an index starting at zero.
+	dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
+	dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
+	dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
+
+	// Alias the pool addresses to convenient identifiers.
+	pool_t* pool_a = bli_membrk_pool( index_a, membrk );
+	pool_t* pool_b = bli_membrk_pool( index_b, membrk );
+	pool_t* pool_c = bli_membrk_pool( index_c, membrk );
+
+	// Finalize the memory pools for A, B, and C.
+	bli_pool_finalize( pool_a );
+	bli_pool_finalize( pool_b );
+	bli_pool_finalize( pool_c );
+}
+
+// -----------------------------------------------------------------------------
+// Store in *bs_a, *bs_b, *bs_c the per-pool block size maximized over all datatypes.
+void bli_membrk_compute_pool_block_sizes
+     (
+       siz_t*  bs_a,
+       siz_t*  bs_b,
+       siz_t*  bs_c,
+       cntx_t* cntx
+     )
+{
+	const ind_t im = bli_cntx_get_ind_method( cntx );
+
+	siz_t bs_cand_a = 0;
+	siz_t bs_cand_b = 0;
+	siz_t bs_cand_c = 0;
+
+	num_t dt;
+
+	// Compute pool block sizes for each datatype and find the maximum
+	// size for each pool. This is done so that new pools do not need
+	// to be allocated if the user switches datatypes.
+	for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
+	{
+		siz_t bs_dt_a;
+		siz_t bs_dt_b;
+		siz_t bs_dt_c;
+
+		// Skip real datatypes when the context's method is not BLIS_NAT.
+		if ( bli_is_real( dt ) && im != BLIS_NAT ) continue;
+
+		bli_membrk_compute_pool_block_sizes_dt( dt,
+		                                        &bs_dt_a,
+		                                        &bs_dt_b,
+		                                        &bs_dt_c,
+		                                        cntx );
+
+		bs_cand_a = bli_max( bs_dt_a, bs_cand_a );
+		bs_cand_b = bli_max( bs_dt_b, bs_cand_b );
+		bs_cand_c = bli_max( bs_dt_c, bs_cand_c );
+	}
+
+	// Save the results.
+	*bs_a = bs_cand_a;
+	*bs_b = bs_cand_b;
+	*bs_c = bs_cand_c;
+}
+
+// -----------------------------------------------------------------------------
+// Store in *bs_a, *bs_b, *bs_c the pool block sizes implied by cntx for datatype dt.
+void bli_membrk_compute_pool_block_sizes_dt
+     (
+       num_t   dt,
+       siz_t*  bs_a,
+       siz_t*  bs_b,
+       siz_t*  bs_c,
+       cntx_t* cntx
+     )
+{
+	siz_t    size_dt = bli_datatype_size( dt );
+
+	blksz_t* mr;
+	blksz_t* nr;
+
+	blksz_t* mc;
+	blksz_t* kc;
+	blksz_t* nc;
+
+	dim_t    mr_dt;
+	dim_t    nr_dt;
+	dim_t    max_mnr_dt;
+
+	dim_t    mc_max_dt;
+	dim_t    kc_max_dt;
+	dim_t    nc_max_dt;
+
+	dim_t    packmr_dt;
+	dim_t    packnr_dt;
+	dim_t    max_packmnr_dt;
+
+	dim_t    scale_num_dt;
+	dim_t    scale_den_dt;
+
+	dim_t    pool_mc_dt, left_mc_dt;
+	dim_t    pool_nc_dt, left_nc_dt;
+	dim_t    pool_kc_dt;
+
+	//
+	// Find the larger of the two register blocksizes.
+	//
+
+	// Query the mr and nr blksz_t objects for the given method of
+	// execution.
+	mr = bli_cntx_get_blksz( BLIS_MR, cntx );
+	nr = bli_cntx_get_blksz( BLIS_NR, cntx );
+
+	// Extract the mr and nr values specific to the current datatype.
+	mr_dt = bli_blksz_get_def( dt, mr );
+	nr_dt = bli_blksz_get_def( dt, nr );
+
+	// Find the maximum of mr and nr.
+	max_mnr_dt = bli_max( mr_dt, nr_dt );
+
+	//
+	// Define local maximum cache blocksizes.
+	//
+
+	// Query the mc, kc, and nc blksz_t objects from the given context.
+	mc = bli_cntx_get_blksz( BLIS_MC, cntx );
+	kc = bli_cntx_get_blksz( BLIS_KC, cntx );
+	nc = bli_cntx_get_blksz( BLIS_NC, cntx );
+
+	// Extract the maximum mc, kc, and nc values specific to the current
+	// datatype.
+	mc_max_dt = bli_blksz_get_max( dt, mc );
+	kc_max_dt = bli_blksz_get_max( dt, kc );
+	nc_max_dt = bli_blksz_get_max( dt, nc );
+
+	// Add max(mr,nr) to kc to make room for the nudging of kc at
+	// runtime to be a multiple of mr or nr for triangular operations
+	// trmm, trmm3, and trsm.
+	kc_max_dt += max_mnr_dt;
+
+	//
+	// Compute scaling factors.
+	//
+
+	// Compute integer scaling factors (numerator and denominator) used
+	// to account for situations when the packing register blocksizes are
+	// larger than the regular register blocksizes.
+
+	// In order to compute the scaling factors, we first have to determine
+	// whether ( packmr / mr ) is greater than ( packnr / nr ). This is
+	// needed ONLY because the amount of space allocated for a block of A
+	// and a panel of B needs to be such that MR and NR can be swapped (ie:
+	// A is packed with NR and B is packed with MR). This transformation is
+	// needed for right-side trsm when inducing an algorithm that (a) has
+	// favorable access patterns for column-stored C and (b) allows the
+	// macro-kernel to reuse the existing left-side fused gemmtrsm micro-
+	// kernels. We avoid integer division by cross-multiplying:
+	//
+	//   ( packmr / mr )      >= ( packnr / nr )
+	//   ( packmr / mr ) * nr >= packnr
+	//   packmr * nr          >= packnr * mr
+	//
+	// So, if packmr * nr >= packnr * mr, then we will use packmr and mr as
+	// our scaling factors. Otherwise, we'll use packnr and nr.
+
+	packmr_dt = bli_blksz_get_max( dt, mr ); // packmr is read from the "max" slot of mr.
+	packnr_dt = bli_blksz_get_max( dt, nr ); // packnr is read from the "max" slot of nr.
+
+	if ( packmr_dt * nr_dt >=
+	     packnr_dt * mr_dt ) { scale_num_dt = packmr_dt;
+	                           scale_den_dt = mr_dt; }
+	else                     { scale_num_dt = packnr_dt;
+	                           scale_den_dt = nr_dt; }
+
+	//
+	// Compute pool block dimensions.
+	//
+
+	pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt;
+	left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt;
+
+	pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt;
+	left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt;
+
+	pool_kc_dt = ( kc_max_dt ); // kc is used unscaled.
+
+	if ( left_mc_dt > 0 ) pool_mc_dt += 1; // Round up if the division left a remainder.
+	if ( left_nc_dt > 0 ) pool_nc_dt += 1; // Round up if the division left a remainder.
+
+	//
+	// Compute pool block sizes.
+	//
+
+	// We add an extra micro-panel of space to the block sizes for A and B
+	// just to be sure any pre-loading performed by the micro-kernel does
+	// not cause a segmentation fault.
+	max_packmnr_dt = bli_max( packmr_dt, packnr_dt );
+
+	*bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
+	*bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
+	*bs_c = ( pool_mc_dt                  ) * pool_nc_dt * size_dt;
+}
diff --git a/frame/base/bli_membrk.h b/frame/base/bli_membrk.h
new file mode 100644
index 000000000..5db956344
--- /dev/null
+++ b/frame/base/bli_membrk.h
@@ -0,0 +1,169 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#ifndef BLIS_MEMBRK_H
+#define BLIS_MEMBRK_H
+
+// -- Memory broker object type --
+
+typedef struct membrk_s
+{
+	pool_t    pools[3];  // Indexed via bli_membrk_pool(); three pools in total.
+	mtx_t     mutex;     // Taken/released by bli_membrk_lock()/bli_membrk_unlock().
+
+	malloc_ft malloc_fp; // malloc()-style function called by bli_membrk_malloc().
+	free_ft   free_fp;   // free()-style function called by bli_membrk_free().
+} membrk_t;
+
+#define bli_membrk_pool( pool_index, membrk_p ) \
+\
+	( (membrk_p)->pools + (pool_index) )
+
+#define bli_membrk_mutex( membrk_p ) \
+\
+	( &( (membrk_p)->mutex ) )
+
+#define bli_membrk_malloc_fp( membrk_p ) \
+\
+	( (membrk_p)->malloc_fp )
+
+#define bli_membrk_free_fp( membrk_p ) \
+\
+	( (membrk_p)->free_fp )
+
+#define bli_membrk_set_malloc_fp( _malloc_fp, membrk_p ) \
+{\
+	(membrk_p)->malloc_fp = _malloc_fp; \
+}
+
+#define bli_membrk_set_free_fp( _free_fp, membrk_p ) \
+{\
+	(membrk_p)->free_fp = _free_fp; \
+}
+
+#define bli_membrk_lock( membrk_p ) \
+{\
+	bli_mutex_lock( &((membrk_p)->mutex) ); \
+}
+
+#define bli_membrk_unlock( membrk_p ) \
+{\
+	bli_mutex_unlock( &((membrk_p)->mutex) ); \
+}
+
+#define bli_membrk_malloc( size, membrk ) \
+\
+	/* Call the malloc()-style function in membrk. */ \
+	((membrk)->malloc_fp)( size )
+
+#define bli_membrk_free( buf_p, membrk ) \
+\
+	/* Call the free()-style function in membrk. */ \
+	((membrk)->free_fp)( buf_p )
+
+
+// -----------------------------------------------------------------------------
+
+void bli_membrk_init
+     (
+       cntx_t*   cntx,
+       membrk_t* membrk
+     );
+void bli_membrk_finalize
+     (
+       membrk_t* membrk
+     );
+
+void bli_membrk_acquire_m
+     (
+       membrk_t* membrk,
+       siz_t     req_size,
+       packbuf_t buf_type,
+       mem_t*    mem
+     );
+
+void bli_membrk_acquire_v
+     (
+       membrk_t* membrk,
+       siz_t     req_size,
+       mem_t*    mem
+     );
+
+void bli_membrk_release
+     (
+       mem_t* mem
+     );
+
+siz_t bli_membrk_pool_size
+     (
+       membrk_t* membrk,
+       packbuf_t buf_type
+     );
+
+// ----------------------------------------------------------------------------
+
+void bli_membrk_init_pools
+     (
+       cntx_t*   cntx,
+       membrk_t* membrk
+     );
+void bli_membrk_reinit_pools
+     (
+       cntx_t*   cntx,
+       membrk_t* membrk
+     );
+void bli_membrk_finalize_pools
+     (
+       membrk_t* membrk
+     );
+
+void bli_membrk_compute_pool_block_sizes
+     (
+       siz_t*  bs_a,
+       siz_t*  bs_b,
+       siz_t*  bs_c,
+       cntx_t* cntx
+     );
+void bli_membrk_compute_pool_block_sizes_dt
+     (
+       num_t   dt,
+       siz_t*  bs_a,
+       siz_t*  bs_b,
+       siz_t*  bs_c,
+       cntx_t* cntx
+     );
+
+#endif
+
diff --git a/frame/include/bli_mem_macro_defs.h b/frame/include/bli_mem_macro_defs.h
index 51840b712..d0fe850cd 100644
--- a/frame/include/bli_mem_macro_defs.h
+++ b/frame/include/bli_mem_macro_defs.h
@@ -5,6 +5,7 @@
    libraries.
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -58,6 +59,10 @@ \ ( (mem_p)->pool ) +#define bli_mem_membrk( mem_p ) \ +\ + ( (mem_p)->membrk ) + #define bli_mem_size( mem_p ) \ \ ( (mem_p)->size ) @@ -90,12 +95,17 @@ #define bli_mem_set_buf_type( buf_type0, mem_p ) \ { \ - mem_p->buf_type = buf_type0; \ + (mem_p)->buf_type = buf_type0; \ } #define bli_mem_set_pool( pool0, mem_p ) \ { \ - mem_p->pool = pool0; \ + (mem_p)->pool = pool0; \ +} + +#define bli_mem_set_membrk( membrk0, mem_p ) \ +{ \ + (mem_p)->membrk = membrk0; \ } #define bli_mem_set_size( size0, mem_p ) \ @@ -109,6 +119,7 @@ bli_mem_set_buf_sys( NULL, mem_p ); \ bli_mem_set_pool( NULL, mem_p ); \ bli_mem_set_size( 0, mem_p ); \ + bli_mem_set_membrk( NULL, mem_p ); \ } diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 30c72e735..306c09544 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -957,14 +958,14 @@ bli_obj_width_stored( obj ) } -// Release object's pack (and cast) memory entries back to memory manager +// Release object's pack mem_t entries back to memory manager #define bli_obj_release_pack( obj_p ) \ { \ mem_t* pack_mem_ = bli_obj_pack_mem( *(obj_p) ); \ \ if ( bli_mem_is_alloc( pack_mem_ ) ) \ - bli_mem_release( pack_mem_ ); \ + bli_membrk_release( pack_mem_ ); \ } diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 7274ce5a6..5f52c89b7 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -173,7 +174,6 @@ typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; - // // -- BLIS info bit field offsets ---------------------------------------------- // @@ -505,6 +505,10 @@ typedef enum // -- BLIS misc. 
structure types ----------------------------------------------- // +// -- Mutex type -- + +typedef struct mtx_s mtx_t; + // -- Pool block type -- typedef struct @@ -527,6 +531,19 @@ typedef struct siz_t align_size; } pool_t; +// -- Memory broker object type -- + +typedef struct membrk_s membrk_t; +/* +{ + pool_t pools[3]; + mtx_t mutex; + + malloc_ft malloc_fp; + free_ft free_fp; +} membrk_t; +*/ + // -- Memory object type -- typedef struct mem_s @@ -534,6 +551,7 @@ typedef struct mem_s pblk_t pblk; packbuf_t buf_type; pool_t* pool; + membrk_t* membrk; siz_t size; } mem_t; @@ -910,6 +928,7 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + membrk_t* membrk; } cntx_t; diff --git a/frame/include/blis.h b/frame/include/blis.h index 85d7a176d..32fca0c71 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -103,6 +104,7 @@ extern "C" { #include "bli_cntx.h" #include "bli_gks.h" #include "bli_ind.h" +#include "bli_membrk.h" #include "bli_pool.h" #include "bli_mem.h" #include "bli_part.h" diff --git a/frame/thread/bli_mutex.h b/frame/thread/bli_mutex.h new file mode 100644 index 000000000..5ccfebe63 --- /dev/null +++ b/frame/thread/bli_mutex.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_MUTEX_H +#define BLIS_MUTEX_H + +// Include definitions (mostly mtx_t) specific to the method of +// multithreading. +#include "bli_mutex_single.h" +#include "bli_mutex_openmp.h" +#include "bli_mutex_pthreads.h" + +// Thread mutex prototypes. + + +#endif + diff --git a/frame/thread/bli_mutex_openmp.h b/frame/thread/bli_mutex_openmp.h new file mode 100644 index 000000000..4aa82f8ae --- /dev/null +++ b/frame/thread/bli_mutex_openmp.h @@ -0,0 +1,72 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016 Hewlett Packard Enterprise Development LP
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_MUTEX_OPENMP_H
+#define BLIS_MUTEX_OPENMP_H
+
+// Define mtx_t for situations when OpenMP multithreading is enabled.
+#ifdef BLIS_ENABLE_OPENMP
+
+#include <omp.h>
+
+// Define mtx_t as a wrapper around an OpenMP lock.
+typedef struct mtx_s
+{
+	omp_lock_t mutex;
+} mtx_t;
+
+// Macros operating on an OpenMP-based mtx_t; mtx_p is a mtx_t*, so pass its omp_lock_t member.
+#define bli_mutex_init( mtx_p ) \
+{ \
+	omp_init_lock( &( (mtx_p)->mutex ) ); \
+}
+#define bli_mutex_finalize( mtx_p ) \
+{ \
+	omp_destroy_lock( &( (mtx_p)->mutex ) ); \
+}
+
+#define bli_mutex_lock( mtx_p ) \
+{ \
+	omp_set_lock( &( (mtx_p)->mutex ) ); \
+}
+#define bli_mutex_unlock( mtx_p ) \
+{ \
+	omp_unset_lock( &( (mtx_p)->mutex ) ); \
+}
+
+#endif
+
+#endif
+
diff --git a/frame/thread/bli_mutex_pthreads.h b/frame/thread/bli_mutex_pthreads.h
new file mode 100644
index 000000000..0ab1876b3
--- /dev/null
+++ b/frame/thread/bli_mutex_pthreads.h
@@ -0,0 +1,72 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016 Hewlett Packard Enterprise Development LP
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_MUTEX_PTHREADS_H
+#define BLIS_MUTEX_PTHREADS_H
+
+// Define mtx_t for situations when POSIX multithreading is enabled.
+#ifdef BLIS_ENABLE_PTHREADS
+
+#include <pthread.h>
+
+// Define mtx_t as a wrapper around a POSIX mutex.
+typedef struct mtx_s
+{
+	pthread_mutex_t mutex;
+} mtx_t;
+
+// Macros operating on a pthread-based mtx_t; mtx_p is a mtx_t*, so pass its pthread_mutex_t member.
+#define bli_mutex_init( mtx_p ) \
+{ \
+	pthread_mutex_init( &( (mtx_p)->mutex ), NULL ); \
+}
+#define bli_mutex_finalize( mtx_p ) \
+{ \
+	pthread_mutex_destroy( &( (mtx_p)->mutex ) ); \
+}
+
+#define bli_mutex_lock( mtx_p ) \
+{ \
+	pthread_mutex_lock( &( (mtx_p)->mutex ) ); \
+}
+#define bli_mutex_unlock( mtx_p ) \
+{ \
+	pthread_mutex_unlock( &( (mtx_p)->mutex ) ); \
+}
+
+#endif
+
+#endif
+
diff --git a/frame/thread/bli_mutex_single.h b/frame/thread/bli_mutex_single.h
new file mode 100644
index 000000000..26aefcc21
--- /dev/null
+++ b/frame/thread/bli_mutex_single.h
@@ -0,0 +1,65 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016 Hewlett Packard Enterprise Development LP
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_MUTEX_SINGLE_H
+#define BLIS_MUTEX_SINGLE_H
+
+// Define mtx_t for situations when multithreading is disabled.
+#ifndef BLIS_ENABLE_MULTITHREADING
+
+// Define mtx_t as an empty placeholder. NOTE(review): a member-less struct is a GNU C extension, not ISO C.
+typedef struct mtx_s
+{
+} mtx_t;
+
+// Define the mutex macros as no-ops; they ignore mtx_p when multithreading is disabled.
+#define bli_mutex_init( mtx_p ) \
+{ \
+}
+#define bli_mutex_finalize( mtx_p ) \
+{ \
+}
+
+#define bli_mutex_lock( mtx_p ) \
+{ \
+}
+#define bli_mutex_unlock( mtx_p ) \
+{ \
+}
+
+#endif
+
+#endif
+
diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h
index 6ef2ebb1a..2498baf8c 100644
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -5,6 +5,7 @@
    libraries.
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -51,6 +52,9 @@ #define BLIS_ENABLE_MULTITHREADING #endif +// Include thread mutex (mtx_t) object definitions and prototypes. +#include "bli_mutex.h" + // Include thread communicator (thrcomm_t) object definitions and prototypes. #include "bli_thrcomm.h" From f493bf4d704fe0e967783cd6e6877d3302c056a1 Mon Sep 17 00:00:00 2001 From: praveeng Date: Mon, 25 Jul 2016 14:14:00 +0530 Subject: [PATCH 7/7] removed changes from readme file which are giving confilcts Change-Id: Ic71ad1313e1404fed444e899466043704d875af6 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 18c9f84a4..7142a1329 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Build Status](https://travis-ci.org/flame/blis.svg?branch=master)](https://travis-ci.org/flame/blis) -Introduction.... +Introduction ------------ BLIS is a portable software framework for instantiating high-performance