From 681eec913d7c2ebcff637cec5c1627ced9a92b99 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 26 May 2017 12:28:09 -0500 Subject: [PATCH 01/21] Change PACKDIM_MR (double) for haswell to 8. --- config/haswell/bli_kernel.h | 1 + kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 40 ++++++++++---------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index ce18dc266..9ed530d68 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -102,6 +102,7 @@ #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 +#define BLIS_PACKDIM_MR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index bee1df996..3679b5773 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -734,22 +734,22 @@ void bli_dgemm_asm_6x8 "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 - "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" 
"vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -761,22 +761,22 @@ void bli_dgemm_asm_6x8 " \n\t" // iteration 2 "prefetcht0 76 * 8(%%rax) \n\t" " \n\t" - "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -786,28 +786,28 @@ void bli_dgemm_asm_6x8 "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 - "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 24 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 25 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 26 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 27 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, 
%%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 28 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 29 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) + "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr_packdim) "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" @@ -855,7 +855,7 @@ void bli_dgemm_asm_6x8 "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) + "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr_packdim) "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" From d87614af3f3d9187be94d6e77984b282bf890928 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 26 May 2017 14:47:36 -0400 Subject: [PATCH 02/21] Revert "Change PACKDIM_MR (double) for haswell to 8." This reverts commit 681eec913d7c2ebcff637cec5c1627ced9a92b99. 
--- config/haswell/bli_kernel.h | 1 - kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 40 ++++++++++---------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index 9ed530d68..ce18dc266 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -102,7 +102,6 @@ #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 -#define BLIS_PACKDIM_MR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index 3679b5773..bee1df996 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -734,22 +734,22 @@ void bli_dgemm_asm_6x8 "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 - "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -761,22 +761,22 @@ void bli_dgemm_asm_6x8 " \n\t" // iteration 2 "prefetcht0 76 * 8(%%rax) \n\t" " \n\t" - 
"vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -786,28 +786,28 @@ void bli_dgemm_asm_6x8 "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 - "vbroadcastsd 24 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 25 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 26 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 27 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 28 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 29 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 23 * 8(%%rax), %%ymm3 
\n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr_packdim) + "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" @@ -855,7 +855,7 @@ void bli_dgemm_asm_6x8 "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr_packdim) + "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" From 7f41bb0a0becde6a7de7df0f99668d7b4686c3b0 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 26 May 2017 14:49:31 -0400 Subject: [PATCH 03/21] PACKDIM_MR=8 didn't work out, but messing with the prefetching helps 2%. --- kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index bee1df996..5bd2d92e5 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -734,6 +734,8 @@ void bli_dgemm_asm_6x8 "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 + "prefetcht0 72 * 8(%%rax) \n\t" + " \n\t" "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" @@ -759,7 +761,7 @@ void bli_dgemm_asm_6x8 "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 2 - "prefetcht0 76 * 8(%%rax) \n\t" + "prefetcht0 80 * 8(%%rax) \n\t" " \n\t" "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" From cf54c77bc79a0f33a514be72c80a654c4e6e6f63 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: 
Tue, 6 Jun 2017 20:23:17 -0500 Subject: [PATCH 04/21] Add new SSI acknowledgment --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c40005221..6b0389bae 100644 --- a/README.md +++ b/README.md @@ -312,7 +312,7 @@ This project and its associated research was partially sponsored by grants from [Microsoft](http://www.microsoft.com/), [Intel](http://www.intel.com/), [Texas Instruments](http://www.ti.com/), and [AMD](http://www.amd.com/), as well as grants from the [National Science Foundation](http://www.nsf.gov/) (Awards -CCF-0917167 ACI-1148125/1340293, and CCF-1320112). +CCF-0917167, ACI-1148125/1340293, CCF-1320112, and ACI-1550493). _Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of From 70cc825b552dec05165b9d70f9e6eb33d8abb118 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 6 Jun 2017 21:58:21 -0500 Subject: [PATCH 05/21] Update LICENSE Remove totally unnecessary first 9 lines and hopefully get Github to recognize it as 3BSD [ci skip]. --- LICENSE | 9 --------- 1 file changed, 9 deletions(-) diff --git a/LICENSE b/LICENSE index 38017661d..e3d83cd04 100644 --- a/LICENSE +++ b/LICENSE @@ -1,12 +1,3 @@ - -BLIS framework -License ---- - -The BLIS framework is licensed under the following license, typically -known as the "new" or "modified" or "3-clause" BSD license. 
- - Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without From ba7cada51a238d320528e3504ed0f0a17a6b022a Mon Sep 17 00:00:00 2001 From: Minh Quan HO Date: Fri, 7 Jul 2017 10:52:05 +0200 Subject: [PATCH 06/21] set missing free_fp in bli_membrk_init for free-ing GEN_USE buffers The membrk's free_fp is called when releasing GEN_USE buffers, but this free_fp is not set in bli_membrk_init --- frame/base/bli_membrk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/frame/base/bli_membrk.c b/frame/base/bli_membrk.c index 33a998de1..210c04be1 100644 --- a/frame/base/bli_membrk.c +++ b/frame/base/bli_membrk.c @@ -44,6 +44,7 @@ void bli_membrk_init bli_mutex_init( bli_membrk_mutex( membrk ) ); bli_membrk_init_pools( cntx, membrk ); bli_membrk_set_malloc_fp( bli_malloc_pool, membrk ); + bli_membrk_set_free_fp( bli_free_pool, membrk ); } void bli_membrk_finalize From 8772a0b33a90154c80d88b381dcdd66f824e041f Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Thu, 13 Jul 2017 21:39:24 -0700 Subject: [PATCH 07/21] Fix Emscripten builds --- config/emscripten/make_defs.mk | 1 + frame/include/bli_system.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index 4353d65cf..8797f9332 100644 --- a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -58,6 +58,7 @@ CVECFLAGS := # --- Determine the archiver and related flags --- AR := emar +RANLIB := emranlib ARFLAGS := cru # --- Determine the linker and related flags --- diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index 05139136b..99a63d550 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -66,6 +66,8 @@ #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) #define BLIS_OS_BSD 1 +#elif defined(EMSCRIPTEN) +#define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine 
operating system" #endif From 0e58ba1b3aa84700ca51a96f1c0eed6067562fba Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 17 Jul 2017 19:03:22 -0500 Subject: [PATCH 08/21] Added API to set mt environment variables. Details: - Renamed bli_env_read_nway() -> bli_thread_get_env(). - Added bli_thread_set_env() to allow setting environment variables pertaining to multithreading, such as BLIS_JC_NT or BLIS_NUM_THREADS. - Added the following convenience wrapper routines: bli_thread_get_jc_nt() bli_thread_get_ic_nt() bli_thread_get_jr_nt() bli_thread_get_ir_nt() bli_thread_get_num_threads() bli_thread_set_jc_nt() bli_thread_set_ic_nt() bli_thread_set_jr_nt() bli_thread_set_ir_nt() bli_thread_set_num_threads() - Added #include "errno.h" to bli_system.h. - This commit addresses issue #140. - Thanks to Chris Goodyer for inspiring these updates. --- frame/base/bli_cntx.c | 12 ++-- frame/include/bli_system.h | 1 + frame/thread/bli_thread.c | 109 ++++++++++++++++++++++++++++++++++--- frame/thread/bli_thread.h | 24 +++++++- 4 files changed, 130 insertions(+), 16 deletions(-) diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 673987bfd..29529924c 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -756,10 +756,10 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, #ifdef BLIS_ENABLE_MULTITHREADING - int nthread = bli_env_read_nway( "BLIS_NUM_THREADS", -1 ); + int nthread = bli_thread_get_env( "BLIS_NUM_THREADS", -1 ); if ( nthread == -1 ) - nthread = bli_env_read_nway( "OMP_NUM_THREADS", -1 ); + nthread = bli_thread_get_env( "OMP_NUM_THREADS", -1 ); if ( nthread < 1 ) nthread = 1; @@ -786,10 +786,10 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, pc = 1; - dim_t jc_env = bli_env_read_nway( "BLIS_JC_NT", -1 ); - dim_t ic_env = bli_env_read_nway( "BLIS_IC_NT", -1 ); - dim_t jr_env = bli_env_read_nway( "BLIS_JR_NT", -1 ); - dim_t ir_env = bli_env_read_nway( "BLIS_IR_NT", -1 ); + dim_t 
jc_env = bli_thread_get_env( "BLIS_JC_NT", -1 ); + dim_t ic_env = bli_thread_get_env( "BLIS_IC_NT", -1 ); + dim_t jr_env = bli_thread_get_env( "BLIS_JR_NT", -1 ); + dim_t ir_env = bli_thread_get_env( "BLIS_IR_NT", -1 ); if (jc_env != -1 || ic_env != -1 || jr_env != -1 || ir_env != -1) { diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index 05139136b..b841ff447 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -41,6 +41,7 @@ #include #include #include +#include <errno.h> // Determine if we are on a 64-bit or 32-bit architecture #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 37ec94292..1dde88206 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -1156,19 +1156,112 @@ void bli_partition_2x2( dim_t nthread, dim_t work1, dim_t work2, // ----------------------------------------------------------------------------- -// Some utilities -dim_t bli_env_read_nway( const char* env, dim_t fallback ) +dim_t bli_thread_get_env( const char* env, dim_t fallback ) { - dim_t num = fallback; - char* str = getenv( env ); + dim_t r_val; + char* str; + // Query the environment variable and store the result in str. + str = getenv( env ); + + // Set the return value based on the string obtained from getenv(). if ( str != NULL ) - { - num = strtol( str, NULL, 10 ); - } - return num; + { + // If there was no error, convert the string to an integer and + // prepare to return that integer. + r_val = strtol( str, NULL, 10 ); + } + else + { + // If there was an error, use the "fallback" as the return value. 
+ r_val = fallback; + } + + return r_val; } +dim_t bli_thread_get_jc_nt( void ) +{ + return bli_thread_get_env( "BLIS_JC_NT", 1 ); +} + +dim_t bli_thread_get_ic_nt( void ) +{ + return bli_thread_get_env( "BLIS_IC_NT", 1 ); +} + +dim_t bli_thread_get_jr_nt( void ) +{ + return bli_thread_get_env( "BLIS_JR_NT", 1 ); +} + +dim_t bli_thread_get_ir_nt( void ) +{ + return bli_thread_get_env( "BLIS_IR_NT", 1 ); +} + +dim_t bli_thread_get_num_threads( void ) +{ + return bli_thread_get_env( "BLIS_NUM_THREADS", 1 ); +} + +void bli_thread_set_env( const char* env, dim_t value ) +{ + dim_t r_val; + char value_str[32]; + const char* fs_32 = "%u"; + const char* fs_64 = "%lu"; + + // Convert the integer to a string, but vary the format specifier + // depending on the integer type size. + if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value ); + else sprintf( value_str, fs_64, value ); + + // Set the environment variable using the string we just wrote to via + // sprintf(). (The 'TRUE' argument means we want to overwrite the current + // value if the environment variable already exists.) + r_val = setenv( env, value_str, TRUE ); + + // Check the return value in case something went horribly wrong. + if ( r_val == -1 ) + { + char err_str[128]; + + // Query the human-readable error string corresponding to errno. + strerror_r( errno, err_str, 128 ); + + // Print the error message. 
+ bli_print_msg( err_str, __FILE__, __LINE__ ); + } +} + +void bli_thread_set_jc_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_JC_NT", value ); +} + +void bli_thread_set_ic_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_IC_NT", value ); +} + +void bli_thread_set_jr_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_JR_NT", value ); +} + +void bli_thread_set_ir_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_IR_NT", value ); +} + +void bli_thread_set_num_threads( dim_t value ) +{ + bli_thread_set_env( "BLIS_NUM_THREADS", value ); +} + +// ----------------------------------------------------------------------------- + dim_t bli_gcd( dim_t x, dim_t y ) { while ( y != 0 ) diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 1998253cf..9092bc84d 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -164,6 +164,8 @@ void bli_l3_thread_decorator cntl_t* cntl ); +// ----------------------------------------------------------------------------- + // Factorization and partitioning prototypes typedef struct { @@ -178,8 +180,26 @@ dim_t bli_next_prime_factor(bli_prime_factors_t* factors); void bli_partition_2x2(dim_t nthread, dim_t work1, dim_t work2, dim_t* nt1, dim_t* nt2); -// Miscellaneous prototypes -dim_t bli_env_read_nway( const char* env, dim_t fallback ); +// ----------------------------------------------------------------------------- + +dim_t bli_thread_get_env( const char* env, dim_t fallback ); + +dim_t bli_thread_get_jc_nt( void ); +dim_t bli_thread_get_ic_nt( void ); +dim_t bli_thread_get_jr_nt( void ); +dim_t bli_thread_get_ir_nt( void ); +dim_t bli_thread_get_num_threads( void ); + +void bli_thread_set_env( const char* env, dim_t value ); + +void bli_thread_set_jc_nt( dim_t value ); +void bli_thread_set_ic_nt( dim_t value ); +void bli_thread_set_jr_nt( dim_t value ); +void bli_thread_set_ir_nt( dim_t value ); +void bli_thread_set_num_threads( dim_t value ); + +// 
----------------------------------------------------------------------------- + dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); From 13175c5fb70fb6a378d5fff6ecede62e5ea6a1f6 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 18 Jul 2017 17:56:00 -0500 Subject: [PATCH 09/21] Updated openmp/pthread barriers with GNU atomics. Details: - Updated the non-tree openmp and pthreads barriers defined in bli_thrcomm_openmp.c and bli_thrcomm_pthreads.c to instead call a common implementation in bli_thrcomm.c, bli_thrcomm_barrier_atomic(). This new implementation goes through the same motions as the previous codes, but protects its loads and increments with GNU atomic built-ins. These atomic statements take memory ordering parameters that allow us to specify just enough constraints for the barrier to work as intended on weakly-ordered hardware. The prior implementation was only guaranteed to work on systems with strongly- ordered memory. (Thanks to Devin Matthews for suggesting this change and his crash-course in atomics and memory ordering.) - Removed 'volatile' from structs' barrier field declarations in bli_thrcomm_*.h. - Updated bli_thrcomm_pthread.? files to use renamed struct barrier fields consistent with that of the _openmp.? files. - Updated other bli_thrcomm_* files to rename "communicator" variables to simply "comm". 
--- frame/thread/bli_thrcomm.c | 56 +++++++++++++++++--- frame/thread/bli_thrcomm.h | 12 +++-- frame/thread/bli_thrcomm_openmp.c | 65 ++++++++++++----------- frame/thread/bli_thrcomm_openmp.h | 9 ++-- frame/thread/bli_thrcomm_pthreads.c | 81 +++++++++++++++-------------- frame/thread/bli_thrcomm_pthreads.h | 11 ++-- frame/thread/bli_thrcomm_single.c | 26 ++++----- 7 files changed, 157 insertions(+), 103 deletions(-) diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index f45827efd..dac705cfa 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -36,19 +36,63 @@ void* bli_thrcomm_bcast ( - thrcomm_t* communicator, + thrcomm_t* comm, dim_t id, void* to_send ) { - if ( communicator == NULL || communicator->n_threads == 1 ) return to_send; + if ( comm == NULL || comm->n_threads == 1 ) return to_send; - if ( id == 0 ) communicator->sent_object = to_send; + if ( id == 0 ) comm->sent_object = to_send; - bli_thrcomm_barrier( communicator, id ); - void* object = communicator->sent_object; - bli_thrcomm_barrier( communicator, id ); + bli_thrcomm_barrier( comm, id ); + void* object = comm->sent_object; + bli_thrcomm_barrier( comm, id ); return object; } +void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ) +{ + // Return early if the comm is NULL or if there is only one + // thread participating. + if ( comm == NULL || comm->n_threads == 1 ) return; + + // Read the "sense" variable. This variable is akin to a unique ID for + // the current barrier. The first n-1 threads will spin on this variable + // until it changes. The sense variable gets incremented by the last + // thread to enter the barrier, just before it exits. But it turns out + // that you don't need many unique IDs before you can wrap around. In + // fact, if everything else is working, a binary variable is sufficient, + // which is what we do here (i.e., 0 is incremented to 1, which is then + // decremented back to 0, and so forth). 
+ bool_t orig_sense = __atomic_load_n( &comm->barrier_sense, __ATOMIC_RELAXED ); + + // Register ourselves (the current thread) as having arrived by + // incrementing the barrier_threads_arrived variable. We must perform + // this increment (and a subsequent read) atomically. + dim_t my_threads_arrived = + __atomic_add_fetch( &comm->barrier_threads_arrived, 1, __ATOMIC_ACQ_REL ); + + // If the current thread was the last thread to have arrived, then + // it will take actions that effectively end and reset the barrier. + if ( my_threads_arrived == comm->n_threads ) + { + // Reset the variable tracking the number of threads that have arrived + // to zero (which returns the barrier to the "empty" state). Then + // atomically toggle the barrier sense variable. This will signal to + // the other threads (which are spinning in the branch below) that it + // is now safe to exit the barrier. + comm->barrier_threads_arrived = 0; + __atomic_fetch_xor( &comm->barrier_sense, 1, __ATOMIC_RELEASE ); + } + else + { + // If the current thread is NOT the last thread to have arrived, then + // it spins on the sense variable until that sense variable changes at + // which time these threads will exit the barrier. + while ( __atomic_load_n( &comm->barrier_sense, __ATOMIC_ACQUIRE ) == orig_sense ) + ; // Empty loop body. + } +} + diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 593f8d7fa..59fbc6576 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -49,11 +49,13 @@ // Thread communicator prototypes. 
thrcomm_t* bli_thrcomm_create( dim_t n_threads ); -void bli_thrcomm_free( thrcomm_t* communicator ); -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads ); -void bli_thrcomm_cleanup( thrcomm_t* communicator ); -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t thread_id ); -void* bli_thrcomm_bcast( thrcomm_t* communicator, dim_t inside_id, void* to_send ); +void bli_thrcomm_free( thrcomm_t* comm ); +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads ); +void bli_thrcomm_cleanup( thrcomm_t* comm ); +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t thread_id ); +void* bli_thrcomm_bcast( thrcomm_t* comm, dim_t inside_id, void* to_send ); + +void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ); #endif diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 0882d1659..5777c5b6d 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -44,63 +44,66 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads ) return comm; } -void bli_thrcomm_free( thrcomm_t* communicator ) +void bli_thrcomm_free( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - bli_thrcomm_cleanup( communicator ); - bli_free_intl( communicator ); + if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); + bli_free_intl( comm ); } #ifndef BLIS_TREE_BARRIER -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->barrier_sense = 0; - communicator->barrier_threads_arrived = 0; + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barrier_sense = 0; + comm->barrier_threads_arrived = 0; } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; + if ( comm == NULL ) return; } //'Normal' 
barrier for openmp //barrier routine taken from art of multicore programming -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) { - if( communicator == NULL || communicator->n_threads == 1 ) +#if 0 + if ( comm == NULL || comm->n_threads == 1 ) return; - bool_t my_sense = communicator->barrier_sense; + bool_t my_sense = comm->barrier_sense; dim_t my_threads_arrived; _Pragma( "omp atomic capture" ) - my_threads_arrived = ++(communicator->barrier_threads_arrived); + my_threads_arrived = ++(comm->barrier_threads_arrived); - if ( my_threads_arrived == communicator->n_threads ) + if ( my_threads_arrived == comm->n_threads ) { - communicator->barrier_threads_arrived = 0; - communicator->barrier_sense = !communicator->barrier_sense; + comm->barrier_threads_arrived = 0; + comm->barrier_sense = !comm->barrier_sense; } else { - volatile bool_t* listener = &communicator->barrier_sense; + volatile bool_t* listener = &comm->barrier_sense; while ( *listener == my_sense ) {} } +#endif + bli_thrcomm_barrier_atomic( comm, t_id ); } #else -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads ); - bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, communicator->barriers, 0 ); + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads ); + bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, comm->barriers, 0 ); } //Tree barrier used for Intel Xeon Phi @@ -145,14 +148,14 @@ barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_ return me; } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void 
bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - for ( dim_t i = 0; i < communicator->n_threads; i++ ) + if ( comm == NULL ) return; + for ( dim_t i = 0; i < comm->n_threads; i++ ) { - bli_thrcomm_tree_barrier_free( communicator->barriers[i] ); + bli_thrcomm_tree_barrier_free( comm->barriers[i] ); } - bli_free_intl( communicator->barriers ); + bli_free_intl( comm->barriers ); } void bli_thrcomm_tree_barrier_free( barrier_t* barrier ) diff --git a/frame/thread/bli_thrcomm_openmp.h b/frame/thread/bli_thrcomm_openmp.h index 6808b9772..435845b16 100644 --- a/frame/thread/bli_thrcomm_openmp.h +++ b/frame/thread/bli_thrcomm_openmp.h @@ -60,11 +60,12 @@ struct thrcomm_s #else struct thrcomm_s { - void* sent_object; - dim_t n_threads; + void* sent_object; + dim_t n_threads; - volatile bool_t barrier_sense; - dim_t barrier_threads_arrived; + //volatile bool_t barrier_sense; + bool_t barrier_sense; + dim_t barrier_threads_arrived; }; #endif diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 230b63905..27fb37e6a 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -43,81 +43,84 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads ) return comm; } -void bli_thrcomm_free( thrcomm_t* communicator ) +void bli_thrcomm_free( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - bli_thrcomm_cleanup( communicator ); - bli_free_intl( communicator ); + if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); + bli_free_intl( comm ); } #ifdef BLIS_USE_PTHREAD_BARRIER -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - pthread_barrier_init( &communicator->barrier, NULL, n_threads ); + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + 
pthread_barrier_init( &comm->barrier, NULL, n_threads ); } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - pthread_barrier_destroy( &communicator->barrier ); + if ( comm == NULL ) return; + pthread_barrier_destroy( &comm->barrier ); } -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) { - pthread_barrier_wait( &communicator->barrier ); + pthread_barrier_wait( &comm->barrier ); } #else -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->sense = 0; - communicator->threads_arrived = 0; + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barrier_sense = 0; + comm->barrier_threads_arrived = 0; -#ifdef BLIS_USE_PTHREAD_MUTEX - pthread_mutex_init( &communicator->mutex, NULL ); -#endif +//#ifdef BLIS_USE_PTHREAD_MUTEX +// pthread_mutex_init( &comm->mutex, NULL ); +//#endif } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { -#ifdef BLIS_USE_PTHREAD_MUTEX - if ( communicator == NULL ) return; - pthread_mutex_destroy( &communicator->mutex ); -#endif +//#ifdef BLIS_USE_PTHREAD_MUTEX +// if ( comm == NULL ) return; +// pthread_mutex_destroy( &comm->mutex ); +//#endif } -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) { - if ( communicator == NULL || communicator->n_threads == 1 ) return; - bool_t my_sense = communicator->sense; +#if 0 + if ( comm == NULL || comm->n_threads == 1 ) return; + bool_t my_sense = comm->sense; dim_t my_threads_arrived; #ifdef BLIS_USE_PTHREAD_MUTEX - pthread_mutex_lock( &communicator->mutex ); - my_threads_arrived = 
++(communicator->threads_arrived); - pthread_mutex_unlock( &communicator->mutex ); + pthread_mutex_lock( &comm->mutex ); + my_threads_arrived = ++(comm->threads_arrived); + pthread_mutex_unlock( &comm->mutex ); #else - my_threads_arrived = __sync_add_and_fetch(&(communicator->threads_arrived), 1); + my_threads_arrived = __sync_add_and_fetch(&(comm->threads_arrived), 1); #endif - if ( my_threads_arrived == communicator->n_threads ) + if ( my_threads_arrived == comm->n_threads ) { - communicator->threads_arrived = 0; - communicator->sense = !communicator->sense; + comm->threads_arrived = 0; + comm->sense = !comm->sense; } else { - volatile bool_t* listener = &communicator->sense; + volatile bool_t* listener = &comm->sense; while( *listener == my_sense ) {} } +#endif + bli_thrcomm_barrier_atomic( comm, t_id ); } #endif diff --git a/frame/thread/bli_thrcomm_pthreads.h b/frame/thread/bli_thrcomm_pthreads.h index 1c807772d..286387bcf 100644 --- a/frame/thread/bli_thrcomm_pthreads.h +++ b/frame/thread/bli_thrcomm_pthreads.h @@ -54,12 +54,13 @@ struct thrcomm_s void* sent_object; dim_t n_threads; -#ifdef BLIS_USE_PTHREAD_MUTEX - pthread_mutex_t mutex; -#endif +//#ifdef BLIS_USE_PTHREAD_MUTEX +// pthread_mutex_t mutex; +//#endif - volatile bool_t sense; - volatile dim_t threads_arrived; + //volatile bool_t barrier_sense; + bool_t barrier_sense; + dim_t barrier_threads_arrived; }; #endif diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index c038f59a0..76b48ca95 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -44,29 +44,29 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads ) return comm; } -void bli_thrcomm_free( thrcomm_t* communicator ) +void bli_thrcomm_free( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - bli_thrcomm_cleanup( communicator ); - bli_free_intl( communicator ); + if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); + bli_free_intl( comm ); } -void bli_thrcomm_init( 
thrcomm_t* communicator, dim_t n_threads ) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads ) { - if ( communicator == NULL ) return; + if ( comm == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->barrier_sense = 0; - communicator->barrier_threads_arrived = 0; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barrier_sense = 0; + comm->barrier_threads_arrived = 0; } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; + if ( comm == NULL ) return; } -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) { return; } From 5caaba2d61cbbc36d63102a0786ece28ff797f72 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 19 Jul 2017 13:51:53 -0500 Subject: [PATCH 10/21] Added --force-version=STRING option to configure. Details: - Added an option to configure that allows the user to force an arbitrary version string at configure-time. The help text also now describes the usage information. - Changed the way the version string is communicated to the Makefile. Previously, it was read into the VERSION variable from the 'version' file via $(shell cat ...). Now, the VERSION variable is instead set in config.mk (via a configure-substituted anchor from config.mk.in). --- Makefile | 4 ---- build/config.mk.in | 4 ++++ configure | 36 ++++++++++++++++++++++++++++-------- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index d74eba889..6a1bab97f 100644 --- a/Makefile +++ b/Makefile @@ -85,9 +85,6 @@ TESTSUITE_CONF_GEN := input.general TESTSUITE_CONF_OPS := input.operations TESTSUITE_OUT_FILE := output.testsuite -# The name of the file where the version string is stored. -VERSION_FILE := version - # The name of the "special" directories, which contain source code that # use non-standard compiler flags. 
NOOPT_DIR := noopt @@ -141,7 +138,6 @@ BASE_LIB_PATH := ./$(LIB_DIR)/$(CONFIG_NAME) # Construct the architecture-version string, which will be used to name the # library upon installation. -VERSION := $(shell cat $(DIST_PATH)/$(VERSION_FILE)) VERS_CONF := $(VERSION)-$(CONFIG_NAME) # --- Library names --- diff --git a/build/config.mk.in b/build/config.mk.in index e7a3f3235..ef2ccfc70 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -36,6 +36,10 @@ ifndef CONFIG_MK_INCLUDED CONFIG_MK_INCLUDED := yes +# The version string. This could be the official string or a custom +# string forced at configure-time. +VERSION := @version@ + # The name of the configuration sub-directory. CONFIG_NAME := @config_name@ diff --git a/configure b/configure index 7aabc5b78..9edfaa98b 100755 --- a/configure +++ b/configure @@ -123,6 +123,12 @@ print_usage() echo " compatibility layer. This automatically enables the" echo " BLAS compatibility layer as well." echo " " + echo " --force-version=STRING" + echo " " + echo " Force configure to use an arbitrary version string" + echo " STRING. This option may be useful when repackaging" + echo " custom versions of BLIS by outside organizations." + echo " " echo " -h, --help Output this information and quit." echo " " echo " Environment Variables:" @@ -232,6 +238,7 @@ main() blas2blis_int_type_size=32 enable_blas2blis='yes' enable_cblas='no' + force_version='no' # The path to the auto-detection script. auto_detect_sh="${build_dirpath}/auto-detect/auto-detect.sh" @@ -247,14 +254,6 @@ main() dummy_file='_blis_dir_detect.tmp' - # Check whether we need to update the version file. - ${update_version_file_sh} -o "${script_name}" "${version_filepath}" - - - # Query which version of BLIS this is. - version=$(cat ${version_filepath}) - - # Process our command line options. 
while getopts ":hp:d:t:qi:b:-:" opt; do case $opt in @@ -323,6 +322,9 @@ main() disable-cblas) enable_cblas='no' ;; + force-version=*) + force_version=${OPTARG#*=} + ;; *) print_usage ;; @@ -375,10 +377,27 @@ main() done + # Check whether we need to update the version file. + ${update_version_file_sh} -o "${script_name}" "${version_filepath}" + + + # Query which version of BLIS this is. + version=$(cat ${version_filepath}) + + # Initial message. echo "${script_name}: starting configuration of BLIS ${version}." + # Check if the user requested a custom version string. + if [ "x${force_version}" = "xno" ]; then + echo "${script_name}: configuring with official version string." + else + echo "${script_name}: configuring with custom version string '${force_version}'." + version="${force_version}" + fi + + # Set config_name based on the number of arguments leftover (after command # line option processing). if [ $# = "0" ]; then @@ -574,6 +593,7 @@ main() # to config_mk_out. echo "${script_name}: creating ${config_mk_out_path} from ${config_mk_in_path}" cat "${config_mk_in_path}" \ + | sed "s/@version@/${version}/g" \ | sed "s/@config_name@/${config_name}/g" \ | sed "s/@dist_path@/${dist_path_esc}/g" \ | sed "s/@CC@/${cc_esc}/g" \ From 1f1ec0db9380b87679d5c771c4594daa1cfc5f0d Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 19 Jul 2017 15:40:48 -0500 Subject: [PATCH 11/21] Updated ar option list used by all configurations. Details: - Dropped 'u' from the list of modifiers passed into the library archiver ar. Previously, "cru" was used, while now we employ only "cr". This change was prompted by a warning observed on Ubuntu 16.04: ar: `u' modifier ignored since `D' is the default (see `U') This caused me to realize that the default mode causes timestamps to be zero, and thus the 'u' option, which causes only changed object files to be inserted, is not applicable. 
--- config/armv7a/make_defs.mk | 2 +- config/armv8a/make_defs.mk | 2 +- config/bgq/make_defs.mk | 2 +- config/bulldozer/make_defs.mk | 2 +- config/carrizo/make_defs.mk | 2 +- config/cortex-a15/make_defs.mk | 2 +- config/cortex-a9/make_defs.mk | 2 +- config/dunnington/make_defs.mk | 2 +- config/emscripten/make_defs.mk | 2 +- config/haswell/make_defs.mk | 2 +- config/knl/make_defs.mk | 2 +- config/loongson3a/make_defs.mk | 2 +- config/mic/make_defs.mk | 2 +- config/piledriver/make_defs.mk | 2 +- config/power7/make_defs.mk | 2 +- config/reference/make_defs.mk | 2 +- config/sandybridge/make_defs.mk | 2 +- config/template/make_defs.mk | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index 9d1b51d0a..8539e1d29 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index 6d09af5cc..56dd3074e 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 57c9899a0..07f6792db 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -59,7 +59,7 @@ CVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunr # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 0546a474f..582354e96 100644 --- a/config/bulldozer/make_defs.mk +++ 
b/config/bulldozer/make_defs.mk @@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index f52d1dd67..94808d466 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index 053e11cbb..c4c47467e 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index 053e11cbb..c4c47467e 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index 8d07f2177..eec2f5a56 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -83,7 +83,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index 4353d65cf..814603e0b 100644 --- a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -58,7 +58,7 @@ CVECFLAGS := # --- Determine the archiver and related 
flags --- AR := emar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 8c739607a..4a4e6e494 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -83,7 +83,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 104abafe2..a3db40981 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -94,7 +94,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index 8fd9fb65a..89ca32929 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 8e7738b44..e82811357 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index b5c3f159c..93cd1f2c8 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/power7/make_defs.mk 
b/config/power7/make_defs.mk index 765344f79..f35ffdfff 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index f75b9ec55..89bcca269 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -81,7 +81,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index d91df8b68..7bf48d2a4 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -83,7 +83,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index d98452553..e563d9308 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) From 8823f91a14638ce6f4e45e67df03212bb61609d6 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 20 Jul 2017 10:04:34 -0500 Subject: [PATCH 12/21] Add fallbacks to __sync_* or __c11_atomic_* builtins when __atomic_* is not supported. Fixes #143. 
--- frame/thread/bli_thrcomm.c | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index dac705cfa..5e7f21f42 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -52,6 +52,46 @@ void* bli_thrcomm_bcast return object; } +// Swap out __atomic_* builtins for __sync_* builtins for: +// - BG/Q +// - gcc <4.7 (including icc through gcc compatibility layer) +// - clang without c11 atomic builtins +#if defined(__bgq__) || \ + (defined(__GNUC__) && (__GNUC__ < 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ < 7))) || \ + (defined(__clang__) && !__has_extension(c_atomic)) + +#define __ATOMIC_RELAXED +#define __ATOMIC_ACQUIRE +#define __ATOMIC_RELEASE +#define __ATOMIC_ACQ_REL + +#define __atomic_load_n(ptr, constraint) \ + __sync_fetch_and_add(ptr, 0) +#define __atomic_add_fetch(ptr, value, constraint) \ + __sync_add_and_fetch(ptr, value) +#define __atomic_fetch_add(ptr, value, constraint) \ + __sync_fetch_and_add(ptr, value) +#define __atomic_fetch_xor(ptr, value, constraint) \ + __sync_fetch_and_xor(ptr, value) + +#endif + +// Swap out __atomic_* builtins for _c11_atomic_* builtins for +// - clang with c11 atomic builtins +#if defined(__clang__) && __has_extension(c_atomic) + +#define __atomic_load_n(ptr, constraint) \ + __c11_atomic_load(ptr, constraint) +#define __atomic_add_fetch(ptr, value, constraint) \ + (__c11_fetch_add(ptr, value, constraint) + value) +#define __atomic_fetch_add(ptr, value, constraint) \ + __c11_fetch_add(ptr, value, constraint) +#define __atomic_fetch_xor(ptr, value, constraint) \ + __c11_fetch_xor(ptr, value, constraint) + +#endif + void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ) { // Return early if the comm is NULL or if there is only one From 7425d0744d9e9cd29a887120e57c2b43ba287040 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 20 Jul 2017 12:54:58 -0500 Subject: [PATCH 13/21] Add default #define for 
__has_extension. --- frame/thread/bli_thrcomm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index 5e7f21f42..b50218a77 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -52,6 +52,10 @@ void* bli_thrcomm_bcast return object; } +#ifndef __has_extension +#define __has_extension(x) 0 +#endif + // Swap out __atomic_* builtins for __sync_* builtins for: // - BG/Q // - gcc <4.7 (including icc through gcc compatibility layer) From 733faf848dcc54834fcdfbb0185dc644978d8864 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 20 Jul 2017 14:50:13 -0500 Subject: [PATCH 14/21] Clang can't make up it's mind what to support. --- frame/thread/bli_thrcomm.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index b50218a77..a06f49523 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -52,18 +52,8 @@ void* bli_thrcomm_bcast return object; } -#ifndef __has_extension -#define __has_extension(x) 0 -#endif - -// Swap out __atomic_* builtins for __sync_* builtins for: -// - BG/Q -// - gcc <4.7 (including icc through gcc compatibility layer) -// - clang without c11 atomic builtins -#if defined(__bgq__) || \ - (defined(__GNUC__) && (__GNUC__ < 4 || \ - (__GNUC__ == 4 && __GNUC_MINOR__ < 7))) || \ - (defined(__clang__) && !__has_extension(c_atomic)) +// Use __sync_* builtins (assumed available) if __atomic_* ones are not present. 
+#ifndef __ATOMIC_RELAXED #define __ATOMIC_RELAXED #define __ATOMIC_ACQUIRE @@ -81,21 +71,6 @@ void* bli_thrcomm_bcast #endif -// Swap out __atomic_* builtins for _c11_atomic_* builtins for -// - clang with c11 atomic builtins -#if defined(__clang__) && __has_extension(c_atomic) - -#define __atomic_load_n(ptr, constraint) \ - __c11_atomic_load(ptr, constraint) -#define __atomic_add_fetch(ptr, value, constraint) \ - (__c11_fetch_add(ptr, value, constraint) + value) -#define __atomic_fetch_add(ptr, value, constraint) \ - __c11_fetch_add(ptr, value, constraint) -#define __atomic_fetch_xor(ptr, value, constraint) \ - __c11_fetch_xor(ptr, value, constraint) - -#endif - void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ) { // Return early if the comm is NULL or if there is only one From c63980f4ca750618f359031d0691289b1abf5146 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 29 Jul 2017 14:53:39 -0500 Subject: [PATCH 15/21] Moved 'family' field from cntx_t to cntl_t. Details: - Removed the family field inside the cntx_t struct and re-added it to the cntl_t struct. Updated all accessor functions/macros accordingly, as well as all consumers and intermediaries of the family parameter (such as bli_l3_thread_decorator(), bli_l3_direct(), and bli_l3_prune_*()). This change was motivated by the desire to keep the context limited, as much as possible, to information about the computing environment. (The family field, by contrast, is a descriptor about the operation being executed.) - Added additional functions to bli_blksz_*() API. - Added additional functions to bli_cntx_*() API. - Minor updates to bli_func.c, bli_mbool.c. - Removed 'obj' from bli_blksz_*() API names. - Removed 'obj' from bli_cntx_*() API names. - Removed 'obj' from bli_cntl_*(), bli_*_cntl_*() API names. Renamed routines that operate only on a single struct to contain the "_node" suffix to differentiate with those routines that operate on the entire tree. 
- Added enums for packm and unpackm kernels to bli_type_defs.h. - Removed BLIS_1F and BLIS_VF from bszid_t definition in bli_type_defs.h. They weren't being used and probably never will be. --- frame/1/bli_l1v_cntx.c | 16 +- frame/1/other/packv/bli_packv_cntl.c | 4 +- frame/1/other/scalv/bli_scalv_cntl.c | 2 +- frame/1/other/unpackv/bli_unpackv_cntl.c | 2 +- frame/1d/bli_l1d_cntx.c | 4 +- frame/1f/bli_l1f_cntx.c | 16 +- frame/1m/bli_l1m_cntx.c | 8 +- frame/1m/packm/bli_packm_cntl.c | 5 +- frame/1m/packm/bli_packm_cntl.h | 2 +- frame/1m/packm/bli_packm_cntx.c | 4 +- frame/1m/scalm/bli_scalm_cntl.c | 5 +- frame/1m/scalm/bli_scalm_cntl.h | 2 +- frame/1m/unpackm/bli_unpackm_cntl.c | 5 +- frame/1m/unpackm/bli_unpackm_cntl.h | 2 +- frame/2/bli_l2_cntx.c | 16 +- frame/2/gemv/other/bli_gemv_cntl.c | 16 +- frame/2/ger/other/bli_ger_cntl.c | 16 +- frame/2/hemv/other/bli_hemv_cntl.c | 8 +- frame/2/her/other/bli_her_cntl.c | 8 +- frame/2/her2/other/bli_her2_cntl.c | 8 +- frame/2/trmv/other/bli_trmv_cntl.c | 8 +- frame/2/trsv/other/bli_trsv_cntl.c | 8 +- frame/3/bli_l3_blocksize.c | 5 +- frame/3/bli_l3_blocksize.h | 14 +- frame/3/bli_l3_cntl.c | 11 +- frame/3/bli_l3_cntl.h | 3 +- frame/3/bli_l3_cntx.c | 4 +- frame/3/bli_l3_direct.c | 4 +- frame/3/bli_l3_direct.h | 2 +- frame/3/bli_l3_prune.c | 8 +- frame/3/bli_l3_prune.h | 2 +- frame/3/gemm/bli_gemm_blk_var1.c | 4 +- frame/3/gemm/bli_gemm_blk_var2.c | 4 +- frame/3/gemm/bli_gemm_blk_var3.c | 8 +- frame/3/gemm/bli_gemm_cntl.c | 43 +- frame/3/gemm/bli_gemm_cntl.h | 3 +- frame/3/gemm/bli_gemm_front.c | 4 +- frame/3/hemm/bli_hemm_front.c | 4 +- frame/3/her2k/bli_her2k_front.c | 5 +- frame/3/herk/bli_herk_front.c | 4 +- frame/3/symm/bli_symm_front.c | 4 +- frame/3/syr2k/bli_syr2k_front.c | 5 +- frame/3/syrk/bli_syrk_front.c | 4 +- frame/3/trmm/bli_trmm_front.c | 4 +- frame/3/trmm3/bli_trmm3_front.c | 4 +- frame/3/trsm/bli_trsm_blk_var1.c | 4 +- frame/3/trsm/bli_trsm_blk_var2.c | 4 +- frame/3/trsm/bli_trsm_blk_var3.c | 4 +- 
frame/3/trsm/bli_trsm_cntl.c | 47 +- frame/3/trsm/bli_trsm_cntl.h | 3 +- frame/3/trsm/bli_trsm_front.c | 4 +- frame/3/trsm/old/bli_trsm_cntl.c | 46 +- frame/3/trsm/old/bli_trsm_cntl.h | 2 +- frame/base/bli_blksz.c | 77 +++- frame/base/bli_blksz.h | 41 +- frame/base/bli_cntl.c | 37 +- frame/base/bli_cntl.h | 23 +- frame/base/bli_cntx.c | 557 +++++++++++++++++------ frame/base/bli_cntx.h | 78 ++-- frame/base/bli_func.c | 62 ++- frame/base/bli_func.h | 31 +- frame/base/bli_gks.c | 6 - frame/base/bli_mbool.c | 39 +- frame/base/bli_mbool.h | 26 +- frame/include/bli_type_defs.h | 85 +++- frame/ind/cntx/bli_gemmind_cntx.c | 16 +- frame/ind/cntx/bli_trsmind_cntx.c | 6 +- frame/thread/bli_thrcomm_openmp.c | 5 +- frame/thread/bli_thrcomm_pthreads.c | 9 +- frame/thread/bli_thrcomm_single.c | 5 +- frame/thread/bli_thread.c | 28 +- frame/thread/bli_thread.h | 1 + testsuite/src/test_libblis.c | 2 +- 73 files changed, 1065 insertions(+), 501 deletions(-) diff --git a/frame/1/bli_l1v_cntx.c b/frame/1/bli_l1v_cntx.c index 149c20320..243a3d062 100644 --- a/frame/1/bli_l1v_cntx.c +++ b/frame/1/bli_l1v_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( addv, BLIS_ADDV_KER ) @@ -70,7 +70,7 @@ GENFRONT( swapv, BLIS_SWAPV_KER ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ @@ -84,7 +84,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv ) @@ -95,7 +95,7 @@ GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -106,7 +106,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpyv, BLIS_AXPYV_KER, addv ) @@ -118,7 +118,7 @@ GENFRONT( scalv, BLIS_SCALV_KER, setv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ @@ -130,7 +130,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( scal2v, BLIS_SCAL2V_KER, setv, copyv ) diff --git a/frame/1/other/packv/bli_packv_cntl.c b/frame/1/other/packv/bli_packv_cntl.c index 13f90a429..b81a6e5d1 100644 --- a/frame/1/other/packv/bli_packv_cntl.c +++ b/frame/1/other/packv/bli_packv_cntl.c @@ -47,7 +47,7 @@ void bli_packv_cntl_init( void ) void bli_packv_cntl_finalize( void ) { - bli_cntl_obj_free( packv_cntl ); + bli_cntl_free_node( packv_cntl ); } packv_t* bli_packv_cntl_obj_create( impl_t impl_type, @@ -105,7 +105,7 @@ cntl_t* bli_packv_cntl_obj_create // that no blocksize partitioning is performed. 
bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. - cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( BLIS_NO_PART, var_func, diff --git a/frame/1/other/scalv/bli_scalv_cntl.c b/frame/1/other/scalv/bli_scalv_cntl.c index 9edb6162c..c75977fa1 100644 --- a/frame/1/other/scalv/bli_scalv_cntl.c +++ b/frame/1/other/scalv/bli_scalv_cntl.c @@ -44,7 +44,7 @@ void bli_scalv_cntl_init() void bli_scalv_cntl_finalize() { - bli_cntl_obj_free( scalv_cntl ); + bli_cntl_free_node( scalv_cntl ); } diff --git a/frame/1/other/unpackv/bli_unpackv_cntl.c b/frame/1/other/unpackv/bli_unpackv_cntl.c index 1e1ab93fb..52858fc0b 100644 --- a/frame/1/other/unpackv/bli_unpackv_cntl.c +++ b/frame/1/other/unpackv/bli_unpackv_cntl.c @@ -44,7 +44,7 @@ void bli_unpackv_cntl_init() void bli_unpackv_cntl_finalize() { - bli_cntl_obj_free( unpackv_cntl ); + bli_cntl_free_node( unpackv_cntl ); } unpackv_t* bli_unpackv_cntl_obj_create( impl_t impl_type, diff --git a/frame/1d/bli_l1d_cntx.c b/frame/1d/bli_l1d_cntx.c index 443dc20f7..f22631a5d 100644 --- a/frame/1d/bli_l1d_cntx.c +++ b/frame/1d/bli_l1d_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( addd, addv ) diff --git a/frame/1f/bli_l1f_cntx.c b/frame/1f/bli_l1f_cntx.c index 58ca4a07c..8e786f2ed 100644 --- a/frame/1f/bli_l1f_cntx.c +++ b/frame/1f/bli_l1f_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -54,7 +54,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv ) @@ -65,7 +65,7 @@ GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ @@ -77,7 +77,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv ) @@ -88,7 +88,7 @@ GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -105,7 +105,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv ) @@ -116,7 +116,7 @@ GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ @@ -135,7 +135,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( dotxf, BLIS_DOTXF_KER, dotv, dotxv ) diff --git a/frame/1m/bli_l1m_cntx.c b/frame/1m/bli_l1m_cntx.c index 7eb3dcd4c..d7ede7c91 100644 --- a/frame/1m/bli_l1m_cntx.c +++ b/frame/1m/bli_l1m_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( addm, addv ) @@ -66,7 +66,7 @@ GENFRONT( subm, subv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ @@ -75,7 +75,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( copym, copyv, setv ) diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index 67b01fffb..6effbb522 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -34,7 +34,7 @@ #include "blis.h" -cntl_t* bli_packm_cntl_obj_create +cntl_t* bli_packm_cntl_create_node ( void* var_func, void* packm_var_func, @@ -69,8 +69,9 @@ cntl_t* bli_packm_cntl_obj_create // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. - cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( + BLIS_NOID, BLIS_NO_PART, var_func, params, diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 057a512ed..ab22e8621 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -80,7 +80,7 @@ typedef struct packm_params_s packm_params_t; // ----------------------------------------------------------------------------- -cntl_t* bli_packm_cntl_obj_create +cntl_t* bli_packm_cntl_create_node ( void* var_func, void* packm_var_func, diff --git a/frame/1m/packm/bli_packm_cntx.c b/frame/1m/packm/bli_packm_cntx.c index 2f4e0b030..75fa24d67 100644 --- a/frame/1m/packm/bli_packm_cntx.c +++ b/frame/1m/packm/bli_packm_cntx.c @@ -41,7 +41,7 @@ void bli_packm_cntx_init( num_t dt, cntx_t* cntx ) { - bli_cntx_obj_create( cntx ); + bli_cntx_create( cntx ); // Initialize the context with kernels that may be needed for the // current operation. 
@@ -57,5 +57,5 @@ void bli_packm_cntx_init( num_t dt, cntx_t* cntx ) void bli_packm_cntx_finalize( cntx_t* cntx ) { - bli_cntx_obj_free( cntx ); + bli_cntx_free( cntx ); } diff --git a/frame/1m/scalm/bli_scalm_cntl.c b/frame/1m/scalm/bli_scalm_cntl.c index f6008a9a3..24c12bc9e 100644 --- a/frame/1m/scalm/bli_scalm_cntl.c +++ b/frame/1m/scalm/bli_scalm_cntl.c @@ -34,7 +34,7 @@ #include "blis.h" -cntl_t* bli_scalm_cntl_obj_create +cntl_t* bli_scalm_cntl_create_node ( void* var_func, cntl_t* sub_node @@ -46,8 +46,9 @@ cntl_t* bli_scalm_cntl_obj_create // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. - cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( + BLIS_NOID, BLIS_NO_PART, var_func, NULL, diff --git a/frame/1m/scalm/bli_scalm_cntl.h b/frame/1m/scalm/bli_scalm_cntl.h index 4029a4f10..d6160dca8 100644 --- a/frame/1m/scalm/bli_scalm_cntl.h +++ b/frame/1m/scalm/bli_scalm_cntl.h @@ -33,7 +33,7 @@ */ -cntl_t* bli_scalm_cntl_obj_create +cntl_t* bli_scalm_cntl_create_node ( void* var_func, cntl_t* sub_node diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c index 2900cb3b8..852b0c81e 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.c +++ b/frame/1m/unpackm/bli_unpackm_cntl.c @@ -34,7 +34,7 @@ #include "blis.h" -cntl_t* bli_unpackm_cntl_obj_create +cntl_t* bli_unpackm_cntl_create_node ( void* var_func, void* unpackm_var_func, @@ -55,8 +55,9 @@ cntl_t* bli_unpackm_cntl_obj_create // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. 
- cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( + BLIS_NOID, BLIS_NO_PART, var_func, params, diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/1m/unpackm/bli_unpackm_cntl.h index 82d9727fc..96278d406 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.h +++ b/frame/1m/unpackm/bli_unpackm_cntl.h @@ -45,7 +45,7 @@ typedef struct unpackm_params_s unpackm_params_t; // ----------------------------------------------------------------------------- -cntl_t* bli_unpackm_cntl_obj_create +cntl_t* bli_unpackm_cntl_create_node ( void* var_func, void* unpackm_var_func, diff --git a/frame/2/bli_l2_cntx.c b/frame/2/bli_l2_cntx.c index fdfe27a85..df6e9441f 100644 --- a/frame/2/bli_l2_cntx.c +++ b/frame/2/bli_l2_cntx.c @@ -44,7 +44,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ @@ -77,7 +77,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. */ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( gemv ) @@ -91,7 +91,7 @@ GENFRONT( trsv ) void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ @@ -111,7 +111,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. */ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( ger ) @@ -125,7 +125,7 @@ GENFRONT( syr ) void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. 
*/ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ @@ -163,7 +163,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. */ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( hemv ) @@ -176,7 +176,7 @@ GENFRONT( symv ) void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ @@ -198,7 +198,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. */ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( her2 ) diff --git a/frame/2/gemv/other/bli_gemv_cntl.c b/frame/2/gemv/other/bli_gemv_cntl.c index ecedeaca4..4ccba4ff0 100644 --- a/frame/2/gemv/other/bli_gemv_cntl.c +++ b/frame/2/gemv/other/bli_gemv_cntl.c @@ -152,17 +152,17 @@ void bli_gemv_cntl_init() void bli_gemv_cntl_finalize() { - bli_cntl_obj_free( gemv_cntl_bs_ke_dot ); - bli_cntl_obj_free( gemv_cntl_bs_ke_axpy ); + bli_cntl_free_node( gemv_cntl_bs_ke_dot ); + bli_cntl_free_node( gemv_cntl_bs_ke_axpy ); - bli_cntl_obj_free( gemv_cntl_rp_bs_dot ); - bli_cntl_obj_free( gemv_cntl_rp_bs_axpy ); + bli_cntl_free_node( gemv_cntl_rp_bs_dot ); + bli_cntl_free_node( gemv_cntl_rp_bs_axpy ); - bli_cntl_obj_free( gemv_cntl_cp_bs_dot ); - bli_cntl_obj_free( gemv_cntl_cp_bs_axpy ); + bli_cntl_free_node( gemv_cntl_cp_bs_dot ); + bli_cntl_free_node( gemv_cntl_cp_bs_axpy ); - bli_cntl_obj_free( gemv_cntl_ge_dot ); - bli_cntl_obj_free( gemv_cntl_ge_axpy ); + bli_cntl_free_node( gemv_cntl_ge_dot ); + bli_cntl_free_node( gemv_cntl_ge_axpy ); } diff --git 
a/frame/2/ger/other/bli_ger_cntl.c b/frame/2/ger/other/bli_ger_cntl.c index 16565ef02..6e35b5f6f 100644 --- a/frame/2/ger/other/bli_ger_cntl.c +++ b/frame/2/ger/other/bli_ger_cntl.c @@ -145,17 +145,17 @@ void bli_ger_cntl_init() void bli_ger_cntl_finalize() { - bli_cntl_obj_free( ger_cntl_bs_ke_row ); - bli_cntl_obj_free( ger_cntl_bs_ke_col ); + bli_cntl_free_node( ger_cntl_bs_ke_row ); + bli_cntl_free_node( ger_cntl_bs_ke_col ); - bli_cntl_obj_free( ger_cntl_rp_bs_row ); - bli_cntl_obj_free( ger_cntl_rp_bs_col ); + bli_cntl_free_node( ger_cntl_rp_bs_row ); + bli_cntl_free_node( ger_cntl_rp_bs_col ); - bli_cntl_obj_free( ger_cntl_cp_bs_row ); - bli_cntl_obj_free( ger_cntl_cp_bs_col ); + bli_cntl_free_node( ger_cntl_cp_bs_row ); + bli_cntl_free_node( ger_cntl_cp_bs_col ); - bli_cntl_obj_free( ger_cntl_ge_row ); - bli_cntl_obj_free( ger_cntl_ge_col ); + bli_cntl_free_node( ger_cntl_ge_row ); + bli_cntl_free_node( ger_cntl_ge_col ); } diff --git a/frame/2/hemv/other/bli_hemv_cntl.c b/frame/2/hemv/other/bli_hemv_cntl.c index 8505f615c..4bed7b012 100644 --- a/frame/2/hemv/other/bli_hemv_cntl.c +++ b/frame/2/hemv/other/bli_hemv_cntl.c @@ -108,10 +108,10 @@ void bli_hemv_cntl_init() void bli_hemv_cntl_finalize() { - bli_cntl_obj_free( hemv_cntl_bs_ke_lrow_ucol ); - bli_cntl_obj_free( hemv_cntl_bs_ke_lcol_urow ); - bli_cntl_obj_free( hemv_cntl_ge_lrow_ucol ); - bli_cntl_obj_free( hemv_cntl_ge_lcol_urow ); + bli_cntl_free_node( hemv_cntl_bs_ke_lrow_ucol ); + bli_cntl_free_node( hemv_cntl_bs_ke_lcol_urow ); + bli_cntl_free_node( hemv_cntl_ge_lrow_ucol ); + bli_cntl_free_node( hemv_cntl_ge_lcol_urow ); } diff --git a/frame/2/her/other/bli_her_cntl.c b/frame/2/her/other/bli_her_cntl.c index 932306c21..28ed63f12 100644 --- a/frame/2/her/other/bli_her_cntl.c +++ b/frame/2/her/other/bli_her_cntl.c @@ -97,10 +97,10 @@ void bli_her_cntl_init() void bli_her_cntl_finalize() { - bli_cntl_obj_free( her_cntl_bs_ke_lrow_ucol ); - bli_cntl_obj_free( her_cntl_bs_ke_lcol_urow ); - 
bli_cntl_obj_free( her_cntl_ge_lrow_ucol ); - bli_cntl_obj_free( her_cntl_ge_lcol_urow ); + bli_cntl_free_node( her_cntl_bs_ke_lrow_ucol ); + bli_cntl_free_node( her_cntl_bs_ke_lcol_urow ); + bli_cntl_free_node( her_cntl_ge_lrow_ucol ); + bli_cntl_free_node( her_cntl_ge_lcol_urow ); } diff --git a/frame/2/her2/other/bli_her2_cntl.c b/frame/2/her2/other/bli_her2_cntl.c index 4a0f5d0f8..199e74c3c 100644 --- a/frame/2/her2/other/bli_her2_cntl.c +++ b/frame/2/her2/other/bli_her2_cntl.c @@ -101,10 +101,10 @@ void bli_her2_cntl_init() void bli_her2_cntl_finalize() { - bli_cntl_obj_free( her2_cntl_bs_ke_lrow_ucol ); - bli_cntl_obj_free( her2_cntl_bs_ke_lcol_urow ); - bli_cntl_obj_free( her2_cntl_ge_lrow_ucol ); - bli_cntl_obj_free( her2_cntl_ge_lcol_urow ); + bli_cntl_free_node( her2_cntl_bs_ke_lrow_ucol ); + bli_cntl_free_node( her2_cntl_bs_ke_lcol_urow ); + bli_cntl_free_node( her2_cntl_ge_lrow_ucol ); + bli_cntl_free_node( her2_cntl_ge_lcol_urow ); } diff --git a/frame/2/trmv/other/bli_trmv_cntl.c b/frame/2/trmv/other/bli_trmv_cntl.c index 5fbf872aa..fff406365 100644 --- a/frame/2/trmv/other/bli_trmv_cntl.c +++ b/frame/2/trmv/other/bli_trmv_cntl.c @@ -98,10 +98,10 @@ void bli_trmv_cntl_init() void bli_trmv_cntl_finalize() { - bli_cntl_obj_free( trmv_cntl_bs_ke_nrow_tcol ); - bli_cntl_obj_free( trmv_cntl_bs_ke_ncol_trow ); - bli_cntl_obj_free( trmv_cntl_ge_nrow_tcol ); - bli_cntl_obj_free( trmv_cntl_ge_ncol_trow ); + bli_cntl_free_node( trmv_cntl_bs_ke_nrow_tcol ); + bli_cntl_free_node( trmv_cntl_bs_ke_ncol_trow ); + bli_cntl_free_node( trmv_cntl_ge_nrow_tcol ); + bli_cntl_free_node( trmv_cntl_ge_ncol_trow ); } diff --git a/frame/2/trsv/other/bli_trsv_cntl.c b/frame/2/trsv/other/bli_trsv_cntl.c index 71de48d3c..9eedb5a9f 100644 --- a/frame/2/trsv/other/bli_trsv_cntl.c +++ b/frame/2/trsv/other/bli_trsv_cntl.c @@ -101,10 +101,10 @@ void bli_trsv_cntl_init() void bli_trsv_cntl_finalize() { - bli_cntl_obj_free( trsv_cntl_bs_ke_nrow_tcol ); - bli_cntl_obj_free( 
trsv_cntl_bs_ke_ncol_trow ); - bli_cntl_obj_free( trsv_cntl_ge_nrow_tcol ); - bli_cntl_obj_free( trsv_cntl_ge_ncol_trow ); + bli_cntl_free_node( trsv_cntl_bs_ke_nrow_tcol ); + bli_cntl_free_node( trsv_cntl_bs_ke_ncol_trow ); + bli_cntl_free_node( trsv_cntl_ge_nrow_tcol ); + bli_cntl_free_node( trsv_cntl_ge_ncol_trow ); } diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 630cf03a5..d25f5f924 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -43,10 +43,11 @@ dim_t bli_l3_determine_kc obj_t* a, obj_t* b, bszid_t bszid, - cntx_t* cntx + cntx_t* cntx, + cntl_t* cntl ) { - opid_t family = bli_cntx_family( cntx ); + opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index 8f9f7ad80..02250efc0 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -32,6 +32,18 @@ */ +dim_t bli_l3_determine_kc + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* a, + obj_t* b, + bszid_t bszid, + cntx_t* cntx, + cntl_t* cntl + ); + #undef GENPROT #define GENPROT( opname ) \ @@ -47,8 +59,6 @@ dim_t PASTEMAC0(opname) \ cntx_t* cntx \ ); -GENPROT( l3_determine_kc ) - GENPROT( gemm_determine_kc ) GENPROT( herk_determine_kc ) GENPROT( trmm_determine_kc ) diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index 4fe3fe7f5..db821b811 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -37,10 +37,10 @@ void bli_l3_cntl_create_if ( + opid_t family, obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t** cntl_use ) @@ -49,8 +49,6 @@ void bli_l3_cntl_create_if // tree as a function of the operation family. 
if ( cntl_orig == NULL ) { - opid_t family = bli_cntx_get_family( cntx ); - if ( family == BLIS_GEMM || family == BLIS_HERK || family == BLIS_TRMM ) @@ -73,6 +71,10 @@ void bli_l3_cntl_create_if // instead (so that threads can use its local tree as a place to // cache things like pack mem_t entries). *cntl_use = bli_cntl_copy( cntl_orig ); + + // Recursively set the family fields of the newly copied control tree + // nodes. + bli_cntl_mark_family( family, *cntl_use ); } } @@ -81,7 +83,6 @@ void bli_l3_cntl_free_if obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t* cntl_use, thrinfo_t* thread @@ -91,7 +92,7 @@ void bli_l3_cntl_free_if // been created, so we now must free it. if ( cntl_orig == NULL ) { - opid_t family = bli_cntx_get_family( cntx ); + opid_t family = bli_cntl_family( cntl_use ); if ( family == BLIS_GEMM || family == BLIS_HERK || diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h index dc0aeb869..3bdd8b43f 100644 --- a/frame/3/bli_l3_cntl.h +++ b/frame/3/bli_l3_cntl.h @@ -39,10 +39,10 @@ void bli_l3_cntl_create_if ( + opid_t family, obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t** cntl_use ); @@ -52,7 +52,6 @@ void bli_l3_cntl_free_if obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t* cntl_use, thrinfo_t* thread diff --git a/frame/3/bli_l3_cntx.c b/frame/3/bli_l3_cntx.c index 161e68160..a8441fa79 100644 --- a/frame/3/bli_l3_cntx.c +++ b/frame/3/bli_l3_cntx.c @@ -41,7 +41,7 @@ void bli_gemm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -76,7 +76,7 @@ void bli_gemm_cntx_finalize( cntx_t* cntx ) void bli_trsm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. 
- bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c index 993501541..f1c661007 100644 --- a/frame/3/bli_l3_direct.c +++ b/frame/3/bli_l3_direct.c @@ -39,11 +39,11 @@ dir_t bli_l3_direct obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx + cntl_t* cntl ) { // Query the operation family. - opid_t family = bli_cntx_family( cntx ); + opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c ); else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c ); diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h index 7b88ba51f..021dfde74 100644 --- a/frame/3/bli_l3_direct.h +++ b/frame/3/bli_l3_direct.h @@ -37,7 +37,7 @@ dir_t bli_l3_direct obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx + cntl_t* cntl ); // ----------------------------------------------------------------------------- diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index f908bbb64..a14c543d8 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -40,11 +40,11 @@ void bli_l3_prune_unref_mparts_m obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx + cntl_t* cntl ) { // Query the operation family. - opid_t family = bli_cntx_family( cntx ); + opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm. else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c ); @@ -61,11 +61,11 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \ obj_t* a, \ obj_t* b, \ obj_t* c, \ - cntx_t* cntx \ + cntl_t* cntl \ ) \ { \ /* Query the operation family. */ \ - opid_t family = bli_cntx_family( cntx ); \ + opid_t family = bli_cntl_family( cntl ); \ \ if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. 
*/ \ else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \ diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index 13d661ff1..6905e21f4 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -41,7 +41,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \ obj_t* a, \ obj_t* b, \ obj_t* c, \ - cntx_t* cntx \ + cntl_t* cntl \ ); GENPROT( m ) diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 1a5693d8c..8fc062da2 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -53,10 +53,10 @@ void bli_gemm_blk_var1 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_m( a, b, c, cntx ); + bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. bli_thread_get_range_mdim diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index a65f8a20a..ff2a570db 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -53,10 +53,10 @@ void bli_gemm_blk_var2 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_n( a, b, c, cntx ); + bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. 
bli_thread_get_range_ndim diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 0148428df..64ab573da 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -53,10 +53,10 @@ void bli_gemm_blk_var3 dim_t k_trans; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_k( a, b, c, cntx ); + bli_l3_prune_unref_mparts_k( a, b, c, cntl ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); @@ -66,7 +66,7 @@ void bli_gemm_blk_var3 { // Determine the current algorithmic blocksize. b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b, - bli_cntl_bszid( cntl ), cntx ); + bli_cntl_bszid( cntl ), cntx, cntl ); // Acquire partitions for A1 and B1. bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, @@ -109,7 +109,7 @@ void bli_gemm_blk_var3 // row-panel of C, and thus beta is applied to all of C exactly once. // Thus, for neither trmm nor trmm3 should we reset the scalar on C // after the first iteration. - if ( bli_cntx_get_family( cntx ) != BLIS_TRMM ) + if ( bli_cntl_family( cntl ) != BLIS_TRMM ) if ( i == 0 ) bli_obj_scalar_reset( c ); } } diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 775ca2544..b17ce10ac 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -56,22 +56,24 @@ cntl_t* bli_gemmbp_cntl_create else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; // Create two nodes for the macro-kernel. - cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node ( + family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. 
); - cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, gemm_cntl_bu_ke ); // Create a node for packing matrix A. - cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( bli_gemm_packa, // pack the left-hand operand bli_packm_blk_var1, @@ -86,15 +88,16 @@ cntl_t* bli_gemmbp_cntl_create ); // Create a node for partitioning the m dimension by MC. - cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node ( + family, BLIS_MC, bli_gemm_blk_var1, gemm_cntl_packa ); // Create a node for packing matrix B. - cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, @@ -109,16 +112,18 @@ cntl_t* bli_gemmbp_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node ( + family, BLIS_KC, bli_gemm_blk_var3, gemm_cntl_packb ); // Create a node for partitioning the n dimension by NC. - cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node ( + family, BLIS_NC, bli_gemm_blk_var2, gemm_cntl_mm_op @@ -141,15 +146,17 @@ cntl_t* bli_gemmpb_cntl_create //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; // Create two nodes for the macro-kernel. - cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_create_node ( + family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. 
); - cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, gemm_cntl_ub_ke @@ -157,7 +164,7 @@ cntl_t* bli_gemmpb_cntl_create // Create a node for packing matrix A (which is really the right-hand // operand "B"). - cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, @@ -172,8 +179,9 @@ cntl_t* bli_gemmpb_cntl_create ); // Create a node for partitioning the n dimension by MC. - cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_create_node ( + family, BLIS_MC, bli_gemm_blk_var2, gemm_cntl_packb @@ -181,7 +189,7 @@ cntl_t* bli_gemmpb_cntl_create // Create a node for packing matrix B (which is really the left-hand // operand "A"). - cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( bli_gemm_packa, // pack the left-hand operand bli_packm_blk_var1, @@ -196,16 +204,18 @@ cntl_t* bli_gemmpb_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node ( + family, BLIS_KC, bli_gemm_blk_var3, gemm_cntl_packa ); // Create a node for partitioning the m dimension by NC. 
- cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node ( + family, BLIS_NC, bli_gemm_blk_var1, gemm_cntl_mm_op @@ -227,13 +237,14 @@ void bli_gemm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_gemm_cntl_obj_create +cntl_t* bli_gemm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node ) { - return bli_cntl_obj_create( bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 6da6cd768..3b643e1fc 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -59,8 +59,9 @@ void bli_gemm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_gemm_cntl_obj_create +cntl_t* bli_gemm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index acceabbe8..f737edf81 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -82,9 +82,6 @@ void bli_gemm_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_GEMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -95,6 +92,7 @@ void bli_gemm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_GEMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 340aa7edc..8d7f8d635 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -89,9 +89,6 @@ void bli_hemm_front bli_obj_swap( a_local, b_local ); } - // Set the operation family id in the context. 
- bli_cntx_set_family( BLIS_GEMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -102,6 +99,7 @@ void bli_hemm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_GEMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index c6851d2a4..e203d59ba 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -107,9 +107,6 @@ void bli_her2k_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -122,6 +119,7 @@ void bli_her2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &bh_local, @@ -134,6 +132,7 @@ void bli_her2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id &alpha_conj, &b_local, &ah_local, diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 642be0d99..227b97d5d 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -87,9 +87,6 @@ void bli_herk_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. 
bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -100,6 +97,7 @@ void bli_herk_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &ah_local, diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 57aa11f73..a01ed15cf 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -88,9 +88,6 @@ void bli_symm_front bli_obj_swap( a_local, b_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_GEMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -101,6 +98,7 @@ void bli_symm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_GEMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index f64a765e5..459cdbdd0 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -88,9 +88,6 @@ void bli_syr2k_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -103,6 +100,7 @@ void bli_syr2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &bt_local, @@ -115,6 +113,7 @@ void bli_syr2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &b_local, &at_local, diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 42d135659..eba91cfd9 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -81,9 +81,6 @@ void bli_syrk_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. 
- bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -94,6 +91,7 @@ void bli_syrk_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &at_local, diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index b44ddfcff..75549e2d0 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -131,9 +131,6 @@ void bli_trmm_front bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_TRMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx, bli_obj_length( c_local ), @@ -144,6 +141,7 @@ void bli_trmm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_TRMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index e672f7af3..f89b6ad96 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -130,9 +130,6 @@ void bli_trmm3_front bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_TRMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx, bli_obj_length( c_local ), @@ -143,6 +140,7 @@ void bli_trmm3_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_TRMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index a731d8265..67b046952 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -53,10 +53,10 @@ void bli_trsm_blk_var1 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). 
- direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_m( a, b, c, cntx ); + bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. bli_thread_get_range_mdim diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index a133f0bb0..48e4b4f1c 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -53,10 +53,10 @@ void bli_trsm_blk_var2 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_n( a, b, c, cntx ); + bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. bli_thread_get_range_ndim diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 7b428c8ef..d4e809c50 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -53,10 +53,10 @@ void bli_trsm_blk_var3 dim_t k_trans; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_k( a, b, c, cntx ); + bli_l3_prune_unref_mparts_k( a, b, c, cntl ); // Query dimension in partitioning direction. 
k_trans = bli_obj_width_after_trans( *a ); diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 78bd5eeb9..e05fc3d20 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -50,23 +50,27 @@ cntl_t* bli_trsm_l_cntl_create { void* macro_kernel_p = bli_trsm_xx_ker_var2; + const opid_t family = BLIS_TRSM; + // Create two nodes for the macro-kernel. - cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( + family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); - cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, trsm_cntl_bu_ke ); // Create a node for packing matrix A. - cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( bli_trsm_packa, bli_packm_blk_var1, @@ -81,15 +85,16 @@ cntl_t* bli_trsm_l_cntl_create ); // Create a node for partitioning the m dimension by MC. - cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( + family, BLIS_MC, bli_trsm_blk_var1, trsm_cntl_packa ); // Create a node for packing matrix B. - cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( bli_trsm_packb, bli_packm_blk_var1, @@ -104,16 +109,18 @@ cntl_t* bli_trsm_l_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( + family, BLIS_KC, bli_trsm_blk_var3, trsm_cntl_packb ); // Create a node for partitioning the n dimension by NC. 
- cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( + family, BLIS_NC, bli_trsm_blk_var2, trsm_cntl_mm_op @@ -129,23 +136,27 @@ cntl_t* bli_trsm_r_cntl_create { void* macro_kernel_p = bli_trsm_xx_ker_var2; + const opid_t family = BLIS_TRSM; + // Create two nodes for the macro-kernel. - cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( + family, BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); - cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, trsm_cntl_bu_ke ); // Create a node for packing matrix A. - cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( bli_trsm_packa, bli_packm_blk_var1, @@ -160,15 +171,16 @@ cntl_t* bli_trsm_r_cntl_create ); // Create a node for partitioning the m dimension by MC. - cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( + family, BLIS_MC, bli_trsm_blk_var1, trsm_cntl_packa ); // Create a node for packing matrix B. - cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( bli_trsm_packb, bli_packm_blk_var1, @@ -183,16 +195,18 @@ cntl_t* bli_trsm_r_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( + family, BLIS_KC, bli_trsm_blk_var3, trsm_cntl_packb ); // Create a node for partitioning the n dimension by NC. 
- cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( + family, BLIS_NC, bli_trsm_blk_var2, trsm_cntl_mm_op @@ -212,13 +226,14 @@ void bli_trsm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_trsm_cntl_obj_create +cntl_t* bli_trsm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node ) { - return bli_cntl_obj_create( bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index 6dbe9adce..cfd20cad3 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -55,8 +55,9 @@ void bli_trsm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_trsm_cntl_obj_create +cntl_t* bli_trsm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 42bda8a51..47cff8b48 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -122,9 +122,6 @@ void bli_trsm_front bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_TRSM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx, bli_obj_length( c_local ), @@ -135,6 +132,7 @@ void bli_trsm_front bli_l3_thread_decorator ( bli_trsm_int, + BLIS_TRSM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/trsm/old/bli_trsm_cntl.c b/frame/3/trsm/old/bli_trsm_cntl.c index 3a83faafd..de018d64a 100644 --- a/frame/3/trsm/old/bli_trsm_cntl.c +++ b/frame/3/trsm/old/bli_trsm_cntl.c @@ -64,7 +64,7 @@ void bli_trsm_cntl_init() // Create control tree objects for packm operations (left side). 
trsm_l_packa_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, // IMPORTANT: n dim multiple must be mr to // support right and bottom-right edge cases @@ -78,7 +78,7 @@ void bli_trsm_cntl_init() trsm_l_packb_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, // IMPORTANT: m dim multiple must be mr since // B_pack is updated (ie: serves as C) in trsm @@ -93,7 +93,7 @@ void bli_trsm_cntl_init() // Create control tree objects for packm operations (right side). trsm_r_packa_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_NR, BLIS_MR, @@ -105,7 +105,7 @@ void bli_trsm_cntl_init() trsm_r_packb_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, // pack panels of B compactly BLIS_MR, BLIS_MR, @@ -119,7 +119,7 @@ void bli_trsm_cntl_init() // Create control tree object for lowest-level block-panel kernel. trsm_cntl_bp_ke = - bli_trsm_cntl_obj_create( BLIS_UNB_OPT, + bli_trsm_cntl_create_node( BLIS_UNB_OPT, BLIS_VARIANT2, 0, // bszid_t not used by macro-kernel NULL, NULL, NULL, NULL, @@ -129,7 +129,7 @@ void bli_trsm_cntl_init() // problem (left side). trsm_l_cntl_op_bp = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_MC, NULL, @@ -144,7 +144,7 @@ void bli_trsm_cntl_init() // rank-k (outer panel) updates (left side). trsm_l_cntl_mm_op = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT3, BLIS_KC, NULL, @@ -159,7 +159,7 @@ void bli_trsm_cntl_init() // general problems (left side). trsm_l_cntl_vl_mm = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_NC, NULL, @@ -174,7 +174,7 @@ void bli_trsm_cntl_init() // problem (right side). 
trsm_r_cntl_op_bp = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_MC, NULL, @@ -189,7 +189,7 @@ void bli_trsm_cntl_init() // rank-k (outer panel) updates (right side). trsm_r_cntl_mm_op = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT3, BLIS_KC, NULL, @@ -204,7 +204,7 @@ void bli_trsm_cntl_init() // general problems (right side). trsm_r_cntl_vl_mm = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_NC, NULL, @@ -222,22 +222,22 @@ void bli_trsm_cntl_init() void bli_trsm_cntl_finalize() { - bli_cntl_obj_free( trsm_l_packa_cntl ); - bli_cntl_obj_free( trsm_l_packb_cntl ); - bli_cntl_obj_free( trsm_r_packa_cntl ); - bli_cntl_obj_free( trsm_r_packb_cntl ); + bli_cntl_free_node( trsm_l_packa_cntl ); + bli_cntl_free_node( trsm_l_packb_cntl ); + bli_cntl_free_node( trsm_r_packa_cntl ); + bli_cntl_free_node( trsm_r_packb_cntl ); - bli_cntl_obj_free( trsm_cntl_bp_ke ); + bli_cntl_free_node( trsm_cntl_bp_ke ); - bli_cntl_obj_free( trsm_l_cntl_op_bp ); - bli_cntl_obj_free( trsm_l_cntl_mm_op ); - bli_cntl_obj_free( trsm_l_cntl_vl_mm ); - bli_cntl_obj_free( trsm_r_cntl_op_bp ); - bli_cntl_obj_free( trsm_r_cntl_mm_op ); - bli_cntl_obj_free( trsm_r_cntl_vl_mm ); + bli_cntl_free_node( trsm_l_cntl_op_bp ); + bli_cntl_free_node( trsm_l_cntl_mm_op ); + bli_cntl_free_node( trsm_l_cntl_vl_mm ); + bli_cntl_free_node( trsm_r_cntl_op_bp ); + bli_cntl_free_node( trsm_r_cntl_mm_op ); + bli_cntl_free_node( trsm_r_cntl_vl_mm ); } -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, +trsm_t* bli_trsm_cntl_create_node( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalm_t* sub_scalm, diff --git a/frame/3/trsm/old/bli_trsm_cntl.h b/frame/3/trsm/old/bli_trsm_cntl.h index 651cc8599..bcdd1dfc7 100644 --- a/frame/3/trsm/old/bli_trsm_cntl.h +++ b/frame/3/trsm/old/bli_trsm_cntl.h @@ -51,7 +51,7 @@ typedef struct trsm_s trsm_t; void 
bli_trsm_cntl_init( void ); void bli_trsm_cntl_finalize( void ); -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, +trsm_t* bli_trsm_cntl_create_node( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalm_t* sub_scalm, diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 0f8e38688..63fc81711 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -35,7 +35,7 @@ #include "blis.h" -blksz_t* bli_blksz_obj_create +blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, @@ -47,16 +47,39 @@ blksz_t* bli_blksz_obj_create b = ( blksz_t* ) bli_malloc_intl( sizeof(blksz_t) ); - bli_blksz_obj_init( b, - b_s, be_s, - b_d, be_d, - b_c, be_c, - b_z, be_z ); + bli_blksz_init_ed + ( + b, + b_s, be_s, + b_d, be_d, + b_c, be_c, + b_z, be_z + ); return b; } -void bli_blksz_obj_init +blksz_t* bli_blksz_create + ( + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ) +{ + blksz_t* b; + + b = ( blksz_t* ) bli_malloc_intl( sizeof(blksz_t) ); + + bli_blksz_init + ( + b, + b_s, b_d, b_c, b_z, + be_s, be_d, be_c, be_z + ); + + return b; +} + +void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, @@ -75,7 +98,45 @@ void bli_blksz_obj_init b->e[BLIS_DCOMPLEX] = be_z; } -void bli_blksz_obj_free +void bli_blksz_init + ( + blksz_t* b, + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ) +{ + b->v[BLIS_FLOAT] = b_s; + b->v[BLIS_DOUBLE] = b_d; + b->v[BLIS_SCOMPLEX] = b_c; + b->v[BLIS_DCOMPLEX] = b_z; + + // Interpret a zero as a request for the default value. + b->e[BLIS_FLOAT] = ( be_s == 0 ? b_s : be_s ); + b->e[BLIS_DOUBLE] = ( be_d == 0 ? b_d : be_d ); + b->e[BLIS_SCOMPLEX] = ( be_c == 0 ? b_c : be_c ); + b->e[BLIS_DCOMPLEX] = ( be_z == 0 ? 
b_z : be_z ); +} + +void bli_blksz_init_easy + ( + blksz_t* b, + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z + ) +{ + b->v[BLIS_FLOAT] = b_s; + b->v[BLIS_DOUBLE] = b_d; + b->v[BLIS_SCOMPLEX] = b_c; + b->v[BLIS_DCOMPLEX] = b_z; + + // Here we assume the maximum blocksize values can be the same as the + // default values. + b->e[BLIS_FLOAT] = b_s; + b->e[BLIS_DOUBLE] = b_d; + b->e[BLIS_SCOMPLEX] = b_c; + b->e[BLIS_DCOMPLEX] = b_z; +} + +void bli_blksz_free ( blksz_t* b ) diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index cfe2023e1..abd066f88 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -50,15 +50,6 @@ *(max) = bli_blksz_get_max( dt, b ); \ } -#define bli_blksz_get_def_for_obj( obj, b ) \ -\ - bli_blksz_get_def( bli_obj_datatype( *(obj) ), b ) - -#define bli_blksz_get_max_for_obj( obj, b ) \ -\ - bli_blksz_get_max( bli_obj_datatype( *(obj) ), b ) - - // blksz_t modification #define bli_blksz_set_def( val, dt, b ) \ @@ -85,8 +76,11 @@ #define bli_blksz_copy_dt( dt_src, b_src, \ dt_dst, b_dst ) \ { \ - (b_dst)->v[ dt_dst ] = (b_src)->v[ dt_src ]; \ - (b_dst)->e[ dt_dst ] = (b_src)->e[ dt_src ]; \ + const dim_t v_src = bli_blksz_get_def( dt_src, b_src ); \ + const dim_t e_src = bli_blksz_get_max( dt_src, b_src ); \ +\ + bli_blksz_set_def( v_src, dt_dst, b_dst ); \ + bli_blksz_set_max( e_src, dt_dst, b_dst ); \ } #define bli_blksz_scale_def( num, den, dt, b ) \ @@ -109,7 +103,7 @@ // ----------------------------------------------------------------------------- -blksz_t* bli_blksz_obj_create +blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, @@ -117,7 +111,13 @@ blksz_t* bli_blksz_obj_create dim_t b_z, dim_t be_z ); -void bli_blksz_obj_init +blksz_t* bli_blksz_create + ( + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ); + +void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, @@ -126,7 +126,20 @@ void bli_blksz_obj_init dim_t b_z, dim_t be_z ); -void 
bli_blksz_obj_free +void bli_blksz_init + ( + blksz_t* b, + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ); + +void bli_blksz_init_easy + ( + blksz_t* b, + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z + ); + +void bli_blksz_free ( blksz_t* b ); diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index cac290da9..90b2634a5 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -34,8 +34,9 @@ #include "blis.h" -cntl_t* bli_cntl_obj_create +cntl_t* bli_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, void* params, @@ -48,6 +49,7 @@ cntl_t* bli_cntl_obj_create // Allocate the cntl_t struct. cntl = bli_malloc_intl( sizeof( cntl_t ) ); + bli_cntl_set_family( family, cntl ); bli_cntl_set_bszid( bszid, cntl ); bli_cntl_set_var_func( var_func, cntl ); bli_cntl_set_params( params, cntl ); @@ -63,7 +65,7 @@ cntl_t* bli_cntl_obj_create return cntl; } -void bli_cntl_obj_free +void bli_cntl_free_node ( cntl_t* cntl ) @@ -71,7 +73,7 @@ void bli_cntl_obj_free bli_free_intl( cntl ); } -void bli_cntl_obj_clear +void bli_cntl_clear_node ( cntl_t* cntl ) @@ -141,7 +143,7 @@ void bli_cntl_free_w_thrinfo } // Free the current node. - bli_cntl_obj_free( cntl ); + bli_cntl_free_node( cntl ); } void bli_cntl_free_wo_thrinfo @@ -177,7 +179,7 @@ void bli_cntl_free_wo_thrinfo } // Free the current node. - bli_cntl_obj_free( cntl ); + bli_cntl_free_node( cntl ); } // ----------------------------------------------------------------------------- @@ -189,10 +191,11 @@ cntl_t* bli_cntl_copy { // Make a copy of the current node. Notice that the source node // should NOT have any allocated/cached mem_t entries, and that - // bli_cntl_obj_create() creates a node with a cleared mem_t + // bli_cntl_create_node() creates a node with a cleared mem_t // field. 
- cntl_t* cntl_copy = bli_cntl_obj_create + cntl_t* cntl_copy = bli_cntl_create_node ( + bli_cntl_family( cntl ), bli_cntl_bszid( cntl ), bli_cntl_var_func( cntl ), NULL, NULL @@ -234,3 +237,23 @@ cntl_t* bli_cntl_copy return cntl_copy; } +void bli_cntl_mark_family + ( + opid_t family, + cntl_t* cntl + ) +{ + // Set the family of the root node. + bli_cntl_set_family( family, cntl ); + + // Continue as long as the current node has a valid child. + while ( bli_cntl_sub_node( cntl ) != NULL ) + { + // Move down the tree to the child node. + cntl = bli_cntl_sub_node( cntl ); + + // Set the family of the current node. + bli_cntl_set_family( family, cntl ); + } +} + diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index fd0413f4f..332a6cd70 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -39,6 +39,7 @@ struct cntl_s { // Basic fields (usually required). + opid_t family; bszid_t bszid; void* var_func; struct cntl_s* sub_node; @@ -57,20 +58,21 @@ typedef struct cntl_s cntl_t; // -- Control tree prototypes -- -cntl_t* bli_cntl_obj_create +cntl_t* bli_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, void* params, cntl_t* sub_node ); -void bli_cntl_obj_free +void bli_cntl_free_node ( cntl_t* cntl ); -void bli_cntl_obj_clear +void bli_cntl_clear_node ( cntl_t* cntl ); @@ -99,10 +101,20 @@ cntl_t* bli_cntl_copy cntl_t* cntl ); +void bli_cntl_mark_family + ( + opid_t family, + cntl_t* cntl + ); + // ----------------------------------------------------------------------------- // cntl_t query (fields only) +#define bli_cntl_family( cntl ) \ +\ + ( cntl->family ) + #define bli_cntl_bszid( cntl ) \ \ ( cntl->bszid ) @@ -139,6 +151,11 @@ cntl_t* bli_cntl_copy // cntl_t modification +#define bli_cntl_set_family( family0, cntl ) \ +{ \ + cntl->family = family0; \ +} + #define bli_cntl_set_bszid( bszid0, cntl ) \ { \ cntl->bszid = bszid0; \ diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 29529924c..d4c4487ed 100644 --- 
a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -39,14 +39,14 @@ // NOTE: Since these functions currently do nothing, they are defined // as empty macros in bli_cntx. // -void bli_cntx_obj_create( cntx_t* cntx ) +void bli_cntx_create( cntx_t* cntx ) { // Since cntx_t objects contain statically-allocated arrays, // we don't need to do anything in order to create the cntx_t // instance. } -void bli_cntx_obj_free( cntx_t* cntx ) +void bli_cntx_free( cntx_t* cntx ) { // Just as we don't need to do anything in order to create a // cntx_t instance, we don't need to do anything to destory @@ -54,7 +54,7 @@ void bli_cntx_obj_free( cntx_t* cntx ) } #endif -void bli_cntx_obj_clear( cntx_t* cntx ) +void bli_cntx_clear( cntx_t* cntx ) { // Fill the entire cntx_t structure with zeros. memset( ( void* )cntx, 0, sizeof( cntx ) ); @@ -108,8 +108,11 @@ void bli_cntx_init( cntx_t* cntx ) // ----------------------------------------------------------------------------- -blksz_t* bli_cntx_get_blksz( bszid_t bs_id, - cntx_t* cntx ) +blksz_t* bli_cntx_get_blksz + ( + bszid_t bs_id, + cntx_t* cntx + ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; @@ -142,8 +145,11 @@ dim_t bli_cntx_get_blksz_max_dt( num_t dt, } #endif -blksz_t* bli_cntx_get_bmult( bszid_t bs_id, - cntx_t* cntx ) +blksz_t* bli_cntx_get_bmult + ( + bszid_t bs_id, + cntx_t* cntx + ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); @@ -166,8 +172,11 @@ dim_t bli_cntx_get_bmult_dt( num_t dt, } #endif -func_t* bli_cntx_get_l3_ukr( l3ukr_t ukr_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l3_ukr + ( + l3ukr_t ukr_id, + cntx_t* cntx + ) { func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); @@ -210,8 +219,11 @@ void* bli_cntx_get_l3_ukr_dt( num_t dt, } #endif -func_t* bli_cntx_get_l3_vir_ukr( l3ukr_t ukr_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l3_vir_ukr + ( + l3ukr_t ukr_id, + 
cntx_t* cntx + ) { func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* l3_vir_ukr = &l3_vir_ukrs[ ukr_id ]; @@ -235,8 +247,11 @@ void* bli_cntx_get_l3_vir_ukr_dt( num_t dt, } #endif -func_t* bli_cntx_get_l3_nat_ukr( l3ukr_t ukr_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l3_nat_ukr + ( + l3ukr_t ukr_id, + cntx_t* cntx + ) { func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* l3_nat_ukr = &l3_nat_ukrs[ ukr_id ]; @@ -260,8 +275,11 @@ void* bli_cntx_get_l3_nat_ukr_dt( num_t dt, } #endif -func_t* bli_cntx_get_l1f_ker( l1fkr_t ker_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l1f_ker + ( + l1fkr_t ker_id, + cntx_t* cntx + ) { func_t* l1f_kers = bli_cntx_l1f_kers_buf( cntx ); func_t* l1f_ker = &l1f_kers[ ker_id ]; @@ -283,8 +301,11 @@ void* bli_cntx_get_l1f_ker_dt( num_t dt, } #endif -func_t* bli_cntx_get_l1v_ker( l1vkr_t ker_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l1v_ker + ( + l1vkr_t ker_id, + cntx_t* cntx + ) { func_t* l1v_kers = bli_cntx_l1v_kers_buf( cntx ); func_t* l1v_ker = &l1v_kers[ ker_id ]; @@ -306,8 +327,11 @@ void* bli_cntx_get_l1v_ker_dt( num_t dt, } #endif -mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, - cntx_t* cntx ) +mbool_t* bli_cntx_get_l3_nat_ukr_prefs + ( + l3ukr_t ukr_id, + cntx_t* cntx + ) { mbool_t* l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* l3_nat_ukrs_pref = &l3_nat_ukrs_prefs[ ukr_id ]; @@ -316,12 +340,30 @@ mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, return l3_nat_ukrs_pref; } -func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ) +func_t* bli_cntx_get_packm_ker + ( + l1mkr_t ker_id, + cntx_t* cntx + ) { - func_t* packm_ukrs = bli_cntx_packm_ukrs( cntx ); + func_t* packm_kers = bli_cntx_packm_kers_buf( cntx ); + func_t* packm_ker = &packm_kers[ ker_id ]; // Return the address of the func_t that contains the packm ukernels. 
- return packm_ukrs; + return packm_ker; +} + +func_t* bli_cntx_get_unpackm_ker + ( + l1mkr_t ker_id, + cntx_t* cntx + ) +{ + func_t* unpackm_kers = bli_cntx_unpackm_kers_buf( cntx ); + func_t* unpackm_ker = &unpackm_kers[ ker_id ]; + + // Return the address of the func_t that contains the unpackm ukernels. + return unpackm_ker; } #if 0 @@ -360,7 +402,11 @@ dim_t bli_cntx_get_num_threads( cntx_t* cntx ) bli_cntx_ir_way( cntx ); } -dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ) +dim_t bli_cntx_get_num_threads_in + ( + cntx_t* cntx, + cntl_t* cntl + ) { dim_t n_threads_in = 1; @@ -384,14 +430,6 @@ dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ) // ----------------------------------------------------------------------------- -#if 1 -// -// NOTE: This function is disabled because: -// - we currently do not have any need to set a context direclty with -// blksz_t objects -// - it may be broken; it needs to be synced up with the corresponding -// function in bli_gks.c. -// void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { /* Example prototypes: @@ -454,8 +492,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // Here, we query the variable argument list for: // - the bszid_t of the blocksize we're about to process, // - the address of the blksz_t object, and - // - the bszid_t of the multiple we need to associate with - // the blksz_t object. + // - the bszid_t of the multiple + // that we need to associate with the blksz_t object. bszid_t bs_id = va_arg( args, bszid_t ); blksz_t* blksz = va_arg( args, blksz_t* ); bszid_t bm_id = va_arg( args, bszid_t ); @@ -473,9 +511,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Here, we query the variable argument list for: // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object, and - // - the bszid_t of the multiple we need to associate with - // the blksz_t object. 
+ // - the address of the blksz_t object, + // - the bszid_t of the multiple, and // - the scalars we wish to apply to the real blocksizes to // come up with the induced complex blocksizes (for default // and maximum blocksizes). @@ -536,6 +573,7 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // location within the context's blksz_t array. Do the same // for the blocksize multiple id. //cntx_blkszs[ bs_id ] = *blksz; + //bli_blksz_copy_smart( blksz, cntx_blksz ); bli_blksz_copy( blksz, cntx_blksz ); // Copy the blocksize multiple id into the context. @@ -624,14 +662,16 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bli_free_intl( dsclrs ); bli_free_intl( msclrs ); } -#endif // ----------------------------------------------------------------------------- -void bli_cntx_set_blksz( bszid_t bs_id, - blksz_t* blksz, - bszid_t mult_id, - cntx_t* cntx ) +void bli_cntx_set_blksz + ( + bszid_t bs_id, + blksz_t* blksz, + bszid_t mult_id, + cntx_t* cntx + ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); @@ -645,20 +685,111 @@ void bli_cntx_set_blksz( bszid_t bs_id, bmults[ bs_id ] = mult_id; } -void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, - func_t* func, - cntx_t* cntx ) -{ - func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); +// ----------------------------------------------------------------------------- - // Copy the function object into the specified location within - // the context's virtual level-3 ukernel array. - l3_vir_ukrs[ ukr_id ] = *func; +void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ) +{ + /* Example prototypes: + + void bli_cntx_set_l3_nat_ukrs + ( + dim_t n_ukrs, + l3ukr_t ukr0_id, num_t dt0, void* ukr0_fp, bool_t pref0, + l3ukr_t ukr1_id, num_t dt1, void* ukr1_fp, bool_t pref1, + l3ukr_t ukr2_id, num_t dt2, void* ukr2_fp, bool_t pref2, + ... + cntx_t* cntx + ); + */ + va_list args; + dim_t i; + + // Allocate some temporary local arrays. 
+ l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ) ); + num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ) ); + void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ) ); + bool_t* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool_t ) ); + + // -- Begin variable argument section -- + + // Initialize variable argument environment. + va_start( args, n_ukrs ); + + // Process n_ukrs tuples. + for ( i = 0; i < n_ukrs; ++i ) + { + // Here, we query the variable argument list for: + // - the l3ukr_t of the kernel we're about to process, + // - the datatype of the kernel, + // - the kernel function pointer, and + // - the kernel function storage preference + // that we need to store to the context. + const l3ukr_t ukr_id = va_arg( args, l3ukr_t ); + const num_t ukr_dt = va_arg( args, num_t ); + void* ukr_fp = va_arg( args, void* ); + const bool_t ukr_pref = va_arg( args, bool_t ); + + // Store the values in our temporary arrays. + ukr_ids[ i ] = ukr_id; + ukr_dts[ i ] = ukr_dt; + ukr_fps[ i ] = ukr_fp; + ukr_prefs[ i ] = ukr_pref; + } + + // The last argument should be the context pointer. + cntx_t* cntx = va_arg( args, cntx_t* ); + + // Shutdown variable argument environment and clean up stack. + va_end( args ); + + // -- End variable argument section -- + + // Query the context for the addresses of: + // - the l3 native ukernel func_t array + // - the l3 native ukernel preferences array + func_t* cntx_l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); + mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); + + // Now that we have the context address, we want to copy the values + // from the temporary buffers into the corresponding buffers in the + // context. + + // Process each blocksize id tuple provided. + for ( i = 0; i < n_ukrs; ++i ) + { + // Read the current blocksize id, blksz_t* pointer, blocksize + // multiple id, and blocksize scalar. 
+ const l3ukr_t ukr_id = ukr_ids[ i ]; + const num_t ukr_dt = ukr_dts[ i ]; + void* ukr_fp = ukr_fps[ i ]; + const bool_t ukr_pref = ukr_prefs[ i ]; + + // Index into the func_t and mbool_t for the current kernel id + // being processed. + func_t* ukrs = &cntx_l3_nat_ukrs[ ukr_id ]; + mbool_t* prefs = &cntx_l3_nat_ukrs_prefs[ ukr_id ]; + + // Store the ukernel function pointer and preference values into + // the context. + bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); + bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); + } + + // Free the temporary local arrays. + bli_free_intl( ukr_ids ); + bli_free_intl( ukr_dts ); + bli_free_intl( ukr_fps ); + bli_free_intl( ukr_prefs ); } -void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, - func_t* func, - cntx_t* cntx ) +// ----------------------------------------------------------------------------- + +void bli_cntx_set_l3_nat_ukr + ( + l3ukr_t ukr_id, + func_t* func, + cntx_t* cntx + ) { func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); @@ -667,9 +798,12 @@ void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, l3_nat_ukrs[ ukr_id ] = *func; } -void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, - mbool_t* prefs, - cntx_t* cntx ) +void bli_cntx_set_l3_nat_ukr_prefs + ( + l3ukr_t ukr_id, + mbool_t* prefs, + cntx_t* cntx + ) { mbool_t* l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); @@ -678,9 +812,26 @@ void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, l3_nat_ukrs_prefs[ ukr_id ] = *prefs; } -void bli_cntx_set_l1f_ker( l1fkr_t ker_id, - func_t* func, - cntx_t* cntx ) +void bli_cntx_set_l3_vir_ukr + ( + l3ukr_t ukr_id, + func_t* func, + cntx_t* cntx + ) +{ + func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); + + // Copy the function object into the specified location within + // the context's virtual level-3 ukernel array. 
+ l3_vir_ukrs[ ukr_id ] = *func; +} + +void bli_cntx_set_l1f_ker + ( + l1fkr_t ker_id, + func_t* func, + cntx_t* cntx + ) { func_t* l1f_kers = bli_cntx_l1f_kers_buf( cntx ); @@ -689,9 +840,12 @@ void bli_cntx_set_l1f_ker( l1fkr_t ker_id, l1f_kers[ ker_id ] = *func; } -void bli_cntx_set_l1v_ker( l1vkr_t ker_id, - func_t* func, - cntx_t* cntx ) +void bli_cntx_set_l1v_ker + ( + l1vkr_t ker_id, + func_t* func, + cntx_t* cntx + ) { func_t* l1v_kers = bli_cntx_l1v_kers_buf( cntx ); @@ -700,43 +854,154 @@ void bli_cntx_set_l1v_ker( l1vkr_t ker_id, l1v_kers[ ker_id ] = *func; } -void bli_cntx_set_packm_ukr( func_t* func, - cntx_t* cntx ) -{ - func_t* packm_ukrs = bli_cntx_packm_ukrs( cntx ); +// ----------------------------------------------------------------------------- - // Copy the function object into the context's packm ukernel object. - *packm_ukrs = *func; +void bli_cntx_set_packm_kers( dim_t n_kers, ... ) +{ + /* Example prototypes: + + void bli_cntx_set_packm_kers + ( + dim_t n_ukrs, + l1mkr_t ker0_id, num_t ker0_dt, void* ker0_fp, + l1mkr_t ker1_id, num_t ker1_dt, void* ker1_fp, + l1mkr_t ker2_id, num_t ker2_dt, void* ker2_fp, + ... + cntx_t* cntx + ); + */ + va_list args; + dim_t i; + + // Allocate some temporary local arrays. + l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ) ); + num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ) ); + void** ker_fps = bli_malloc_intl( n_kers * sizeof( void* ) ); + + // -- Begin variable argument section -- + + // Initialize variable argument environment. + va_start( args, n_kers ); + + // Process n_kers tuples. + for ( i = 0; i < n_kers; ++i ) + { + // Here, we query the variable argument list for: + // - the l1mkr_t of the kernel we're about to process, + // - the datatype of the kernel, and + // - the kernel function pointer + // that we need to store to the context. 
+ const l1mkr_t ker_id = va_arg( args, l1mkr_t ); + const num_t ker_dt = va_arg( args, num_t ); + void* ker_fp = va_arg( args, void* ); + + // Store the values in our temporary arrays. + ker_ids[ i ] = ker_id; + ker_dts[ i ] = ker_dt; + ker_fps[ i ] = ker_fp; + } + + // The last argument should be the context pointer. + cntx_t* cntx = va_arg( args, cntx_t* ); + + // Shutdown variable argument environment and clean up stack. + va_end( args ); + + // -- End variable argument section -- + + // Query the context for the address of: + // - the packm kernels func_t array + func_t* cntx_packm_kers = bli_cntx_packm_kers_buf( cntx ); + + // Now that we have the context address, we want to copy the values + // from the temporary buffers into the corresponding buffers in the + // context. + + // Process each blocksize id tuple provided. + for ( i = 0; i < n_kers; ++i ) + { + // Read the current blocksize id, blksz_t* pointer, blocksize + // multiple id, and blocksize scalar. + const l1mkr_t ker_id = ker_ids[ i ]; + const num_t ker_dt = ker_dts[ i ]; + void* ker_fp = ker_fps[ i ]; + + // Index into the func_t and mbool_t for the current kernel id + // being processed. + func_t* kers = &cntx_packm_kers[ ker_id ]; + + // Store the ukernel function pointer and preference values into + // the context. + bli_func_set_dt( ker_fp, ker_dt, kers ); + } + + // Free the temporary local arrays. + bli_free_intl( ker_ids ); + bli_free_intl( ker_dts ); + bli_free_intl( ker_fps ); } -void bli_cntx_set_ind_method( ind_t method, - cntx_t* cntx ) +// ----------------------------------------------------------------------------- + +void bli_cntx_set_packm_ker + ( + l1mkr_t ker_id, + func_t* func, + cntx_t* cntx + ) +{ + func_t* packm_kers = bli_cntx_packm_kers_buf( cntx ); + + // Copy the function object into the specified location within + // the context's packm kernel array. 
+ packm_kers[ ker_id ] = *func; +} + +// ----------------------------------------------------------------------------- + +void bli_cntx_set_ind_method + ( + ind_t method, + cntx_t* cntx + ) { bli_cntx_set_method( method, cntx ); } -void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a, - pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_ab_blockpanel + ( + pack_t schema_a, + pack_t schema_b, + cntx_t* cntx + ) { bli_cntx_set_schema_a_block( schema_a, cntx ); bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void bli_cntx_set_pack_schema_a_block( pack_t schema_a, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_a_block + ( + pack_t schema_a, + cntx_t* cntx + ) { bli_cntx_set_schema_a_block( schema_a, cntx ); } -void bli_cntx_set_pack_schema_b_panel( pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_b_panel + ( + pack_t schema_b, + cntx_t* cntx + ) { bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void bli_cntx_set_pack_schema_c_panel( pack_t schema_c, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_c_panel + ( + pack_t schema_c, + cntx_t* cntx + ) { bli_cntx_set_schema_c_panel( schema_c, cntx ); } @@ -749,8 +1014,15 @@ void bli_cntx_set_ukr_anti_pref( bool_t anti_pref, } #endif -void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, - dim_t m, dim_t n, dim_t k ) +void bli_cntx_set_thrloop_from_env + ( + opid_t l3_op, + side_t side, + cntx_t* cntx, + dim_t m, + dim_t n, + dim_t k + ) { dim_t jc, pc, ic, jr, ir; @@ -882,9 +1154,12 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, // ----------------------------------------------------------------------------- -bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { mbool_t* ukrs_prefs = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); bool_t ukr_prefs = bli_mbool_get_dt( dt, ukrs_prefs ); @@ -894,9 
+1169,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, return ukr_prefs == TRUE; } -bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { mbool_t* ukrs_prefs = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); bool_t ukr_prefs = bli_mbool_get_dt( dt, ukrs_prefs ); @@ -906,16 +1184,22 @@ bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, return ukr_prefs == FALSE; } -bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { return !bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); } -bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { const num_t dt = bli_obj_datatype( *obj ); const bool_t ukr_prefers_rows @@ -930,9 +1214,12 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); @@ -942,9 +1229,12 @@ bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); @@ -956,9 +1246,12 @@ bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, // ----------------------------------------------------------------------------- 
-bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_prefers_rows_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { // Reference the ukr storage preferences of the corresponding real // micro-kernel for induced methods. @@ -968,9 +1261,12 @@ bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } -bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_prefers_cols_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { // Reference the ukr storage preferences of the corresponding real // micro-kernel for induced methods. @@ -980,16 +1276,22 @@ bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt, return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } -bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { return !bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); } -bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { num_t dt = bli_obj_datatype( *obj ); @@ -1005,9 +1307,12 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_eff_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx ); @@ -1017,9 +1322,12 @@ bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) 
{ bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); @@ -1108,23 +1416,6 @@ void bli_cntx_print( cntx_t* cntx ) ); } - { - func_t* ukr = bli_cntx_get_packm_ukr( cntx ); - - printf( "packm ker : %16p %16p %16p %16p\n", - bli_func_get_dt( BLIS_FLOAT, ukr ), - bli_func_get_dt( BLIS_DOUBLE, ukr ), - bli_func_get_dt( BLIS_SCOMPLEX, ukr ), - bli_func_get_dt( BLIS_DCOMPLEX, ukr ) - ); - } - - { - ind_t family = bli_cntx_get_family( cntx ); - - printf( "oper family : %lu\n", ( guint_t )family ); - } - { ind_t method = bli_cntx_get_ind_method( cntx ); diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index a76cdd329..3167d1bf4 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -36,6 +36,9 @@ #ifndef BLIS_CNTX_H #define BLIS_CNTX_H +//#include "bli_cntx_init.h" + + // Context object type (defined in bli_type_defs.h) /* @@ -51,9 +54,9 @@ typedef struct cntx_s func_t* l1f_kers; func_t* l1v_kers; - func_t packm_ukrs; + func_t* packm_kers; + func_t* unpackm_kers; - opid_t family; ind_t method; pack_t schema_a; pack_t schema_b; @@ -99,17 +102,13 @@ typedef struct cntx_s \ ( (cntx)->l1v_kers ) -#define bli_cntx_packm_ukrs_buf( cntx ) \ +#define bli_cntx_packm_kers_buf( cntx ) \ \ - (&((cntx)->packm_ukrs) ) + ( (cntx)->packm_kers ) -#define bli_cntx_packm_ukrs( cntx ) \ +#define bli_cntx_unpackm_kers_buf( cntx ) \ \ - (&((cntx)->packm_ukrs) ) - -#define bli_cntx_family( cntx ) \ -\ - ( (cntx)->family ) + ( (cntx)->unpackm_kers ) #define bli_cntx_method( cntx ) \ \ @@ -202,16 +201,6 @@ typedef struct cntx_s (cntx_p)->l1v_kers = _l1v_kers; \ } -#define bli_cntx_set_packm_ukrs( _packm_ukrs, cntx_p ) \ -{ \ - (cntx_p)->packm_ukrs = _packm_ukrs; \ -} - -#define bli_cntx_set_family( _family, cntx_p ) \ -{ \ - (cntx_p)->family = _family; \ -} - #define bli_cntx_set_method( _method, cntx_p ) \ { \ (cntx_p)->method = _method; \ @@ -285,7 +274,8 @@ typedef struct cntx_s ( \ (dt), \ &(( \ - bli_cntx_method( (cntx) ) != BLIS_NAT \ + 
bli_cntx_method( (cntx) ) != BLIS_NAT && \ + bli_is_complex( dt ) \ ? bli_cntx_l3_vir_ukrs_buf( (cntx) ) \ : bli_cntx_l3_nat_ukrs_buf( (cntx) ) \ )[ ukr_id ]) \ @@ -326,10 +316,6 @@ typedef struct cntx_s (dt), (&(bli_cntx_l3_nat_ukrs_prefs_buf( (cntx) ))[ ukr_id ]) \ ) -#define bli_cntx_get_family( cntx ) \ -\ - bli_cntx_family( cntx ) - #define bli_cntx_get_ind_method( cntx ) \ \ bli_cntx_method( cntx ) @@ -357,9 +343,9 @@ typedef struct cntx_s // create/free -//void bli_cntx_obj_create( cntx_t* cntx ); -//void bli_cntx_obj_free( cntx_t* cntx ); -void bli_cntx_obj_clear( cntx_t* cntx ); +//void bli_cntx_create( cntx_t* cntx ); +//void bli_cntx_free( cntx_t* cntx ); +void bli_cntx_clear( cntx_t* cntx ); void bli_cntx_init( cntx_t* cntx ); // get functions @@ -380,7 +366,7 @@ func_t* bli_cntx_get_l1f_ker( l1fkr_t ker_id, cntx_t* cntx ); func_t* bli_cntx_get_l1v_ker( l1vkr_t ker_id, cntx_t* cntx ); -func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); +//func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); //dim_t bli_cntx_get_blksz_def_dt( num_t dt, // bszid_t bs_id, @@ -409,6 +395,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); //void* bli_cntx_get_l1v_ker_dt( num_t dt, // l1vkr_t ker_id, // cntx_t* cntx ); +func_t* bli_cntx_get_packm_ker( l1mkr_t ker_id, + cntx_t* cntx ); +func_t* bli_cntx_get_unpackm_ker( l1mkr_t ker_id, + cntx_t* cntx ); //ind_t bli_cntx_get_ind_method( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx ); @@ -425,18 +415,34 @@ void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ); -void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, - func_t* func, - cntx_t* cntx ); + +void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... 
); + void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ); +void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, + mbool_t* prefs, + cntx_t* cntx ); + +void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, + func_t* func, + cntx_t* cntx ); + void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ); + void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ); + +void bli_cntx_set_packm_kers( dim_t n_kers, ... ); + +void bli_cntx_set_packm_ker( l1mkr_t ker_id, + func_t* func, + cntx_t* cntx ); + void bli_cntx_set_packm_ukr( func_t* func, cntx_t* cntx ); void bli_cntx_set_ind_method( ind_t method, @@ -507,11 +513,11 @@ void bli_cntx_print( cntx_t* cntx ); // Preprocess out these calls entirely, since they are currently just empty // functions that do nothing. #if 0 - #define bli_cntx_obj_create( cntx ) { bli_cntx_obj_clear( cntx ); } - #define bli_cntx_obj_free( cntx ) { bli_cntx_obj_clear( cntx ); } + #define bli_cntx_create( cntx ) { bli_cntx_clear( cntx ); } + #define bli_cntx_free( cntx ) { bli_cntx_clear( cntx ); } #else - #define bli_cntx_obj_create( cntx ) { ; } - #define bli_cntx_obj_free( cntx ) { ; } + #define bli_cntx_create( cntx ) { ; } + #define bli_cntx_free( cntx ) { ; } #endif // These macros initialize/finalize a local context if the given context diff --git a/frame/base/bli_func.c b/frame/base/bli_func.c index 75be26085..d098b4c9d 100644 --- a/frame/base/bli_func.c +++ b/frame/base/bli_func.c @@ -35,37 +35,57 @@ #include "blis.h" -func_t* bli_func_obj_create( void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ) +func_t* bli_func_create + ( + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ) { func_t* f; f = ( func_t* ) bli_malloc_intl( sizeof(func_t) ); - bli_func_obj_init( f, - ptr_s, - ptr_d, - ptr_c, - ptr_z ); + bli_func_init + ( + f, + ptr_s, + ptr_d, + ptr_c, + ptr_z + ); return f; } -void bli_func_obj_init( func_t* f, - void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ) +void 
bli_func_init + ( + func_t* f, + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ) { - f->ptr[BLIS_BITVAL_FLOAT_TYPE] = ptr_s; - f->ptr[BLIS_BITVAL_DOUBLE_TYPE] = ptr_d; - f->ptr[BLIS_BITVAL_SCOMPLEX_TYPE] = ptr_c; - f->ptr[BLIS_BITVAL_DCOMPLEX_TYPE] = ptr_z; + bli_func_set_dt( ptr_s, BLIS_FLOAT, f ); + bli_func_set_dt( ptr_d, BLIS_DOUBLE, f ); + bli_func_set_dt( ptr_c, BLIS_SCOMPLEX, f ); + bli_func_set_dt( ptr_z, BLIS_DCOMPLEX, f ); } -void bli_func_obj_free( func_t* f ) +void bli_func_init_null + ( + func_t* f + ) +{ + bli_func_set_dt( NULL, BLIS_FLOAT, f ); + bli_func_set_dt( NULL, BLIS_DOUBLE, f ); + bli_func_set_dt( NULL, BLIS_SCOMPLEX, f ); + bli_func_set_dt( NULL, BLIS_DCOMPLEX, f ); +} + +void bli_func_free( func_t* f ) { bli_free_intl( f ); } @@ -75,7 +95,7 @@ void bli_func_obj_free( func_t* f ) bool_t bli_func_is_null_dt( num_t dt, func_t* f ) { - return ( f->ptr[ dt ] == NULL ); + return ( bli_func_get_dt( dt, f ) == NULL ); } bool_t bli_func_is_null( func_t* f ) @@ -87,7 +107,7 @@ bool_t bli_func_is_null( func_t* f ) // return FALSE. Otherwise, if they are all null, return TRUE. 
for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) { - if ( f->ptr[ dt ] != NULL ) + if ( bli_func_get_dt( dt, f ) != NULL ) { r_val = FALSE; break; diff --git a/frame/base/bli_func.h b/frame/base/bli_func.h index 56b221be9..2bfc2ad20 100644 --- a/frame/base/bli_func.h +++ b/frame/base/bli_func.h @@ -49,18 +49,29 @@ // ----------------------------------------------------------------------------- -func_t* bli_func_obj_create( void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ); +func_t* bli_func_create + ( + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ); -void bli_func_obj_init( func_t* f, - void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ); +void bli_func_init + ( + func_t* f, + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ); -void bli_func_obj_free( func_t* f ); +void bli_func_init_null + ( + func_t* f + ); + +void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 2ada1556e..4d819babe 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -74,12 +74,6 @@ static blksz_t bli_gks_blkszs[BLIS_NUM_BLKSZS] = /* df */ { { BLIS_DEFAULT_DF_S, BLIS_DEFAULT_DF_C, BLIS_DEFAULT_DF_D, BLIS_DEFAULT_DF_Z, }, { BLIS_DEFAULT_DF_S, BLIS_DEFAULT_DF_C, BLIS_DEFAULT_DF_D, BLIS_DEFAULT_DF_Z, } }, -/* xf */ { { BLIS_DEFAULT_XF_S, BLIS_DEFAULT_XF_C, BLIS_DEFAULT_XF_D, BLIS_DEFAULT_XF_Z, }, - { BLIS_DEFAULT_XF_S, BLIS_DEFAULT_XF_C, BLIS_DEFAULT_XF_D, BLIS_DEFAULT_XF_Z, } - }, -/* vf */ { { BLIS_DEFAULT_VF_S, BLIS_DEFAULT_VF_C, BLIS_DEFAULT_VF_D, BLIS_DEFAULT_VF_Z, }, - { BLIS_DEFAULT_VF_S, BLIS_DEFAULT_VF_C, BLIS_DEFAULT_VF_D, BLIS_DEFAULT_VF_Z, } - }, }; // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_mbool.c b/frame/base/bli_mbool.c index 46ba531bc..6906622d1 100644 --- a/frame/base/bli_mbool.c +++ b/frame/base/bli_mbool.c @@ -35,29 +35,38 @@ #include 
"blis.h" -mbool_t* bli_mbool_obj_create( bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ) +mbool_t* bli_mbool_create + ( + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ) { mbool_t* b; b = ( mbool_t* ) bli_malloc_intl( sizeof(mbool_t) ); - bli_mbool_obj_init( b, - b_s, - b_d, - b_c, - b_z ); + bli_mbool_init + ( + b, + b_s, + b_d, + b_c, + b_z + ); return b; } -void bli_mbool_obj_init( mbool_t* b, - bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ) +void bli_mbool_init + ( + mbool_t* b, + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ) { bli_mbool_set_dt( b_s, BLIS_FLOAT, b ); bli_mbool_set_dt( b_d, BLIS_DOUBLE, b ); @@ -65,7 +74,7 @@ void bli_mbool_obj_init( mbool_t* b, bli_mbool_set_dt( b_z, BLIS_DCOMPLEX, b ); } -void bli_mbool_obj_free( mbool_t* b ) +void bli_mbool_free( mbool_t* b ) { bli_free_intl( b ); } diff --git a/frame/base/bli_mbool.h b/frame/base/bli_mbool.h index 5d5f47828..181543413 100644 --- a/frame/base/bli_mbool.h +++ b/frame/base/bli_mbool.h @@ -49,16 +49,22 @@ // ----------------------------------------------------------------------------- -mbool_t* bli_mbool_obj_create( bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ); +mbool_t* bli_mbool_create + ( + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ); -void bli_mbool_obj_init( mbool_t* b, - bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ); +void bli_mbool_init + ( + mbool_t* b, + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ); -void bli_mbool_obj_free( mbool_t* b ); +void bli_mbool_free( mbool_t* b ); diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 1a120d5da..d71d84f31 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -631,6 +631,80 @@ typedef enum #define BLIS_NUM_LEVEL1F_KERS 5 +typedef enum +{ + BLIS_PACKM_0XK_KER = 0, + BLIS_PACKM_1XK_KER = 1, + BLIS_PACKM_2XK_KER = 2, + BLIS_PACKM_3XK_KER = 3, + BLIS_PACKM_4XK_KER = 4, + BLIS_PACKM_5XK_KER = 5, + BLIS_PACKM_6XK_KER = 
6, + BLIS_PACKM_7XK_KER = 7, + BLIS_PACKM_8XK_KER = 8, + BLIS_PACKM_9XK_KER = 9, + BLIS_PACKM_10XK_KER = 10, + BLIS_PACKM_11XK_KER = 11, + BLIS_PACKM_12XK_KER = 12, + BLIS_PACKM_13XK_KER = 13, + BLIS_PACKM_14XK_KER = 14, + BLIS_PACKM_15XK_KER = 15, + BLIS_PACKM_16XK_KER = 16, + BLIS_PACKM_17XK_KER = 17, + BLIS_PACKM_18XK_KER = 18, + BLIS_PACKM_19XK_KER = 19, + BLIS_PACKM_20XK_KER = 20, + BLIS_PACKM_21XK_KER = 21, + BLIS_PACKM_22XK_KER = 22, + BLIS_PACKM_23XK_KER = 23, + BLIS_PACKM_24XK_KER = 24, + BLIS_PACKM_25XK_KER = 25, + BLIS_PACKM_26XK_KER = 26, + BLIS_PACKM_27XK_KER = 27, + BLIS_PACKM_28XK_KER = 28, + BLIS_PACKM_29XK_KER = 29, + BLIS_PACKM_30XK_KER = 30, + BLIS_PACKM_31XK_KER = 31, + + BLIS_UNPACKM_0XK_KER = 0, + BLIS_UNPACKM_1XK_KER = 1, + BLIS_UNPACKM_2XK_KER = 2, + BLIS_UNPACKM_3XK_KER = 3, + BLIS_UNPACKM_4XK_KER = 4, + BLIS_UNPACKM_5XK_KER = 5, + BLIS_UNPACKM_6XK_KER = 6, + BLIS_UNPACKM_7XK_KER = 7, + BLIS_UNPACKM_8XK_KER = 8, + BLIS_UNPACKM_9XK_KER = 9, + BLIS_UNPACKM_10XK_KER = 10, + BLIS_UNPACKM_11XK_KER = 11, + BLIS_UNPACKM_12XK_KER = 12, + BLIS_UNPACKM_13XK_KER = 13, + BLIS_UNPACKM_14XK_KER = 14, + BLIS_UNPACKM_15XK_KER = 15, + BLIS_UNPACKM_16XK_KER = 16, + BLIS_UNPACKM_17XK_KER = 17, + BLIS_UNPACKM_18XK_KER = 18, + BLIS_UNPACKM_19XK_KER = 19, + BLIS_UNPACKM_20XK_KER = 20, + BLIS_UNPACKM_21XK_KER = 21, + BLIS_UNPACKM_22XK_KER = 22, + BLIS_UNPACKM_23XK_KER = 23, + BLIS_UNPACKM_24XK_KER = 24, + BLIS_UNPACKM_25XK_KER = 25, + BLIS_UNPACKM_26XK_KER = 26, + BLIS_UNPACKM_27XK_KER = 27, + BLIS_UNPACKM_28XK_KER = 28, + BLIS_UNPACKM_29XK_KER = 29, + BLIS_UNPACKM_30XK_KER = 30, + BLIS_UNPACKM_31XK_KER = 31, + +} l1mkr_t; + +#define BLIS_NUM_PACKM_KERS 32 +#define BLIS_NUM_UNPACKM_KERS 32 + + typedef enum { BLIS_GEMM_UKR = 0, @@ -683,7 +757,7 @@ typedef enum // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in -// bli_ind_query.c to index into arrays. 
+// bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_HEMM, @@ -714,16 +788,14 @@ typedef enum BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension - BLIS_1F, // level-1f global fusing factor BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor - BLIS_VF, // level-1v vector fusing factor BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable. } bszid_t; -#define BLIS_NUM_BLKSZS 13 +#define BLIS_NUM_BLKSZS 11 // @@ -784,6 +856,7 @@ typedef struct mem_s struct cntl_s { // Basic fields (usually required). + opid_t family; bszid_t bszid; void* var_func; struct cntl_s* sub_node; @@ -971,9 +1044,9 @@ typedef struct cntx_s func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; - func_t packm_ukrs; + func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; + func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; - opid_t family; ind_t method; pack_t schema_a_block; pack_t schema_b_panel; diff --git a/frame/ind/cntx/bli_gemmind_cntx.c b/frame/ind/cntx/bli_gemmind_cntx.c index 5b7a70c3c..03a4d4d91 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.c +++ b/frame/ind/cntx/bli_gemmind_cntx.c @@ -122,7 +122,7 @@ void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M1; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -170,7 +170,7 @@ void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M2; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -218,7 +218,7 @@ void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M3; // Clear the context fields. 
- bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -279,7 +279,7 @@ void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3MH; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -343,7 +343,7 @@ void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4M1A; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -391,7 +391,7 @@ void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4M1B; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -439,7 +439,7 @@ void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4MH; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -524,7 +524,7 @@ void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx ) const ind_t method = BLIS_1M; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. 
diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index 96f9add60..d3127b81f 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -41,7 +41,7 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M1; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -89,7 +89,7 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4M1A; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -137,7 +137,7 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_1M; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 5777c5b6d..131f70973 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -207,6 +207,7 @@ void bli_thrcomm_tree_barrier( barrier_t* barack ) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, @@ -234,7 +235,7 @@ void bli_l3_thread_decorator thrinfo_t* thread; // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -252,7 +253,7 @@ void bli_l3_thread_decorator ); // Free the control tree, if one was created locally. 
- bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); #ifdef PRINT_THRINFO threads[id] = thread; diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 27fb37e6a..16ef5a157 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -132,6 +132,7 @@ void* bli_l3_thread_entry( void* data_void ); typedef struct thread_data { l3int_t func; + opid_t family; obj_t* alpha; obj_t* a; obj_t* b; @@ -148,6 +149,7 @@ void* bli_l3_thread_entry( void* data_void ) { thread_data_t* data = data_void; + opid_t family = data->family; obj_t* alpha = data->alpha; obj_t* a = data->a; obj_t* b = data->b; @@ -162,13 +164,14 @@ void* bli_l3_thread_entry( void* data_void ) thrinfo_t* thread; // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); data->func ( + family, alpha, a, b, @@ -180,7 +183,7 @@ void* bli_l3_thread_entry( void* data_void ) ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); // Free the current thread's thrinfo_t structure. bli_l3_thrinfo_free( thread ); @@ -191,6 +194,7 @@ void* bli_l3_thread_entry( void* data_void ) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, @@ -217,6 +221,7 @@ void bli_l3_thread_decorator { // Set up thread data for additional threads (beyond thread 0). 
datas[id].func = func; + datas[id].family = family; datas[id].alpha = alpha; datas[id].a = a; datas[id].b = b; diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index 76b48ca95..cb0bc2ae4 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -74,6 +74,7 @@ void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, @@ -94,7 +95,7 @@ void bli_l3_thread_decorator thrinfo_t* thread; // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); // Create the root node of the thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -112,7 +113,7 @@ void bli_l3_thread_decorator ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); // Free the current thread's thrinfo_t structure. 
bli_l3_thrinfo_free( thread ); diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 1dde88206..2d150c656 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -218,9 +218,10 @@ siz_t bli_thread_get_range_l2r dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, n, bf, FALSE, start, end ); @@ -237,9 +238,10 @@ siz_t bli_thread_get_range_r2l dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, n, bf, TRUE, start, end ); @@ -256,9 +258,10 @@ siz_t bli_thread_get_range_t2b dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, m, bf, FALSE, start, end ); @@ -275,9 +278,10 @@ siz_t bli_thread_get_range_b2t dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, m, bf, TRUE, start, end ); @@ -649,7 +653,7 @@ siz_t bli_thread_get_range_mdim ) { bszid_t bszid = bli_cntl_bszid( cntl ); - opid_t family = bli_cntx_get_family( cntx ); + opid_t family = bli_cntl_family( cntl ); // This is part of trsm's current implementation, whereby right side // cases are implemented in left-side micro-kernels, which requires @@ -708,7 +712,7 @@ siz_t bli_thread_get_range_ndim ) { bszid_t bszid = bli_cntl_bszid( cntl 
); - opid_t family = bli_cntx_get_family( cntx ); + opid_t family = bli_cntl_family( cntl ); // This is part of trsm's current implementation, whereby right side // cases are implemented in left-side micro-kernels, which requires @@ -771,11 +775,12 @@ siz_t bli_thread_get_range_weighted_l2r if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( *a ) ) @@ -820,11 +825,12 @@ siz_t bli_thread_get_range_weighted_r2l if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( *a ) ) @@ -871,11 +877,12 @@ siz_t bli_thread_get_range_weighted_t2b if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. 
if ( bli_obj_has_trans( *a ) ) @@ -922,11 +929,12 @@ siz_t bli_thread_get_range_weighted_b2t if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( *a ) ) diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 9092bc84d..a88d24bc0 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -155,6 +155,7 @@ typedef void (*l3int_t) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 993c134b4..84552b569 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -1903,7 +1903,7 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia else does_inv_diag = TRUE; // Create a control tree node for the packing operation. - cntl_t* cntl = bli_packm_cntl_obj_create + cntl_t* cntl = bli_packm_cntl_create_node ( NULL, // func ptr is not referenced b/c we don't call via l3 _int(). bli_packm_blk_var1, From 803bbef0a386dd0571ad389f69d55154dbfe3c50 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 29 Jul 2017 20:17:05 -0500 Subject: [PATCH 16/21] Fixed pthreads compile bug with previous commit. Details: - Erroneously passed family parameter into l3int_t function despite that function not taking the parameter. Oops. 
--- frame/thread/bli_thrcomm_pthreads.c | 1 - 1 file changed, 1 deletion(-) diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 16ef5a157..540e161c8 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -171,7 +171,6 @@ void* bli_l3_thread_entry( void* data_void ) data->func ( - family, alpha, a, b, From cecdc05d2834786a84ff85775d3f99a958c0765a Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 31 Jul 2017 15:19:51 -0500 Subject: [PATCH 17/21] Change lsame_ signature to match lapacke. --- frame/compat/f2c/bla_lsame.c | 11 ++++++++--- frame/compat/f2c/bla_lsame.h | 6 +++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/frame/compat/f2c/bla_lsame.c b/frame/compat/f2c/bla_lsame.c index 04f8caad0..7b109ab43 100644 --- a/frame/compat/f2c/bla_lsame.c +++ b/frame/compat/f2c/bla_lsame.c @@ -41,7 +41,12 @@ -lf2c -lm (in that order) */ -bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, ftnlen ca_len, ftnlen cb_len) + +#ifdef LAPACK_ILP64 +long PASTEF770(lsame)(char *ca, char *cb, long ca_len, long cb_len) +#else +int PASTEF770(lsame)(char *ca, char *cb, int ca_len, int cb_len) +#endif { /* System generated locals */ bla_logical ret_val; @@ -115,11 +120,11 @@ bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, f /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */ /* upper case 'Z'. 
*/ - if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta + if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169)) { inta += 64; } - if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb + if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169)) { intb += 64; } diff --git a/frame/compat/f2c/bla_lsame.h b/frame/compat/f2c/bla_lsame.h index 7e2f92389..e8f63f488 100644 --- a/frame/compat/f2c/bla_lsame.h +++ b/frame/compat/f2c/bla_lsame.h @@ -34,6 +34,10 @@ #ifdef BLIS_ENABLE_BLAS2BLIS -bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, ftnlen ca_len, ftnlen cb_len); +#ifdef LAPACK_ILP64 +long PASTEF770(lsame)(char *ca, char *cb, long ca_len, long cb_len); +#else +int PASTEF770(lsame)(char *ca, char *cb, int ca_len, int cb_len); +#endif #endif From b01c80829907d50ec79977fba8e7b53cfe7db80a Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 4 Aug 2017 14:17:44 -0500 Subject: [PATCH 18/21] Fixed a minor bug in level-3 packm management. Details: - Fixed a bug in bli_l3_packm() that caused cntl_t-cached packed mem_t entries to be released and then re-acquired unnecessarily. (In essence, the "<" operands in the conditional that guards the release-and-reacquire code block simply needed to be swapped.) The bug should have only affected performance (rather than the computed result). Thanks to Minh Quan for identifying and reporting the bug. --- frame/3/bli_l3_packm.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c index 28fb1f857..82383f93a 100644 --- a/frame/3/bli_l3_packm.c +++ b/frame/3/bli_l3_packm.c @@ -115,12 +115,13 @@ void bli_l3_packm // buffer, then a block has already been acquired from the memory // broker and cached in the control tree. 
- // BUT, we need to make sure that the mem_t object is not associated - // with a block that is too small given the size of the packed matrix - // that we need, according to the return value from packm_init(). + // As a sanity check, we should make sure that the mem_t object isn't + // associated with a block that is too small compared to the size of + // the packed matrix buffer that is needed, according to the return + // value from packm_init(). siz_t cntl_mem_size = bli_mem_size( cntl_mem_p ); - if ( size_needed < cntl_mem_size ) + if ( cntl_mem_size < size_needed ) { if ( bli_thread_am_ochief( thread ) ) { From 60a1eeb2317939d732b9eb6ff1e0d6d668c9a1e5 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 5 Aug 2017 13:04:31 -0500 Subject: [PATCH 19/21] Added edge handling to _determine_blocksize_b(). Details: - Added explicit handling of situations where i == dim to bli_determine_blocksize_b_sub(). This isn't actually needed by any current use case within BLIS, but handling the situation is nonetheless prudent. Thanks to Minh Quan for reporting this issue and requesting the fix. --- frame/base/bli_blksz.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 63fc81711..6d27c52d5 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -363,6 +363,11 @@ dim_t bli_determine_blocksize_b_sub // chunk that will correspond to the blocksize we are computing now. dim_left_now = dim - i; + // Sanity check: if dim_left_now is zero, then we can return zero + // without going any further. + if ( dim_left_now == 0 ) + return 0; + dim_at_edge = dim_left_now % b_alg; // If dim_left_now is a multiple of b_alg, we can safely return b_alg From f86ce54d6f315006984534fe29e47a2deaacc9f5 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 10 Aug 2017 16:24:28 -0500 Subject: [PATCH 20/21] Removed trailing enum commas from bli_type_defs.h. 
Details: - Removed trailing commas from enums in bli_type_defs.h. Thanks to Erling Andersen for pointing out this inconsistency and suggesting the change. --- frame/include/bli_type_defs.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index d71d84f31..517a17b13 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -438,7 +438,7 @@ typedef enum BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, - BLIS_DT_HI = BLIS_DCOMPLEX, + BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum @@ -482,7 +482,7 @@ typedef enum BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R, + BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start @@ -511,7 +511,7 @@ typedef enum BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, - BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE, + BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; @@ -590,7 +590,7 @@ typedef enum BLIS_4M1B, BLIS_4M1A, BLIS_1M, - BLIS_NAT, + BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) @@ -613,7 +613,7 @@ typedef enum BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, - BLIS_XPBYV_KER, + BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 @@ -625,7 +625,7 @@ typedef enum BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, - BLIS_DOTXAXPYF_KER, + BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 @@ -697,7 +697,7 @@ typedef enum BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, 
BLIS_UNPACKM_30XK_KER = 30, - BLIS_UNPACKM_31XK_KER = 31, + BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; @@ -711,7 +711,7 @@ typedef enum BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, - BLIS_TRSM_U_UKR, + BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 @@ -722,7 +722,7 @@ typedef enum BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, - BLIS_NOTAPPLIC_UKERNEL, + BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 @@ -736,7 +736,7 @@ typedef enum BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, - BLIS_PR_IDX, + BLIS_PR_IDX } thridx_t; #endif @@ -770,7 +770,7 @@ typedef enum BLIS_TRMM, BLIS_TRSM, - BLIS_NOID, + BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 10 @@ -792,7 +792,7 @@ typedef enum BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor - BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable. + BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 11 @@ -1065,7 +1065,7 @@ typedef struct cntx_s typedef enum { BLIS_NO_ERROR_CHECKING = 0, - BLIS_FULL_ERROR_CHECKING, + BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum From 7dc78b49f97e6b3cd6d72fcdc588ace534d0e700 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 15 Aug 2017 10:02:25 -0500 Subject: [PATCH 21/21] Add vzeroupper to Intel AVX kernels. 
--- kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c | 4 ++++ kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c | 4 ++++ kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 8 ++++++++ kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c | 8 ++++++++ kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c | 10 +++++++++- 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c index 5bc2dd4ba..2088e030a 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c @@ -634,6 +634,8 @@ void bli_sgemm_asm_24x4 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1253,6 +1255,8 @@ void bli_dgemm_asm_12x4 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c index c92612b07..5eb0f0732 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c @@ -600,6 +600,8 @@ void bli_sgemm_asm_4x24 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1186,6 +1188,8 @@ void bli_dgemm_asm_4x12 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index 5bd2d92e5..78b294053 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -595,6 +595,8 @@ void bli_sgemm_asm_6x16 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1175,6 +1177,8 @@ void bli_dgemm_asm_6x8 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1702,6 +1706,8 @@ void bli_cgemm_asm_3x8 " \n\t" " \n\t" ".CDONE: 
\n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -2228,6 +2234,8 @@ void bli_zgemm_asm_3x4 " \n\t" " \n\t" ".ZDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c index f173947c3..9796e27ef 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c @@ -596,6 +596,8 @@ void bli_sgemm_asm_16x6 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1176,6 +1178,8 @@ void bli_dgemm_asm_8x6 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1703,6 +1707,8 @@ void bli_cgemm_asm_8x3 " \n\t" " \n\t" ".CDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -2229,6 +2235,8 @@ void bli_zgemm_asm_4x3 " \n\t" " \n\t" ".ZDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c index f8db398ca..f19f053fc 100644 --- a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c +++ b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c @@ -991,7 +991,9 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" ".SDONE: \n\t" - " \n\t" + " \n\t" + "vzeroupper \n\t" + " \n\t" : // output operands (none) : // input operands @@ -1658,6 +1660,8 @@ void bli_dgemm_asm_8x4 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -2611,6 +2615,8 @@ void bli_cgemm_asm_8x4 " \n\t" " \n\t" ".CDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -3453,6 +3459,8 @@ void bli_zgemm_asm_4x4 " \n\t" " \n\t" ".ZDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none)