Merge branch 'master' into 1m

2026-05-13 10:35:38 +00:00 · 2017-05-02 12:30:28 -05:00
parent a509fbd5ac ca3a792477
commit e80993e71f
2 changed files with 95 additions and 57 deletions
--- a/README.md
+++ b/README.md
@@ -259,7 +259,9 @@ We also have a third paper, submitted to IPDPS 2014, on achieving
 ```

 A fourth paper, submitted to ACM TOMS, also exists, which proposes an
-[analytical model](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf) for determining blocksize parameters in BLIS: 
+[analytical model](http://dl.acm.org/citation.cfm?id=2925987) 
+([unofficial backup link](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf))
+for determining blocksize parameters in BLIS: 

 ```
@article{BLIS4,
@@ -277,6 +279,32 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an
 }
 ```

+A fifth paper, submitted to ACM TOMS, begins the study of so-called
+[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf):
+
+```
+@article{BLIS5,
+   author      = {Field G. {V}an~{Z}ee and Tyler Smith},
+   title       = {Implementing high-performance complex matrix multiplication via the 3m and 4m methods},
+   journal     = {ACM Transactions on Mathematical Software},
+   year        = {2017},
+   note        = {accepted}
+}
+``` 
+
+A sixth paper, submitted to ACM TOMS, revisits the topic of the previous
+article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf):
+
+```
+@article{BLIS6,
+   author      = {Field G. {V}an~{Z}ee},
+   title       = {Implementing high-performance complex matrix multiplication via the 1m method},
+   journal     = {ACM Transactions on Mathematical Software},
+   note        = {submitted}
+}
+``` 
+
+
 Funding
 -------

--- a/kernels/armv8a/3/bli_gemm_opt_4x4.c
+++ b/kernels/armv8a/3/bli_gemm_opt_4x4.c
@@ -50,16 +50,17 @@
 * Tested on Juno board. Around  3.1 GFLOPS, 1 x A53 core  @ 850 MHz. 
 * Tested on Juno board. Around 12   GFLOPS, 4 x A53 cores @ 850 MHz.
 */
-void bli_sgemm_opt_8x12(
-                        dim_t              k,
-                        float*    restrict alpha,
-                        float*    restrict a,
-                        float*    restrict b,
-                        float*    restrict beta,
-                        float*    restrict c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t*         data,
-                        cntx_t*   restrict cntx
-                      )
+void bli_sgemm_opt_8x12
+     (
+       dim_t               k,
+       float*     restrict alpha,
+       float*     restrict a,
+       float*     restrict b,
+       float*     restrict beta,
+       float*     restrict c, inc_t rs_c, inc_t cs_c,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
 {
 	void* a_next = bli_auxinfo_next_a( data );
 	void* b_next = bli_auxinfo_next_b( data );
@@ -1100,16 +1101,17 @@ __asm__ volatile
 * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core  @ 850 MHz. 
 * Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz.
 */
-void bli_dgemm_opt_6x8(
-                        dim_t              k,
-                        double*   restrict alpha,
-                        double*   restrict a,
-                        double*   restrict b,
-                        double*   restrict beta,
-                        double*   restrict c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t*         data,
-                        cntx_t*   restrict cntx
-                      )
+void bli_dgemm_opt_6x8
+     (
+       dim_t               k,
+       double*    restrict alpha,
+       double*    restrict a,
+       double*    restrict b,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c, inc_t cs_c,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
 {
 	void* a_next = bli_auxinfo_next_a( data );
 	void* b_next = bli_auxinfo_next_b( data );
@@ -2070,47 +2072,55 @@ __asm__ volatile

 }

-void bli_cgemm_opt_4x4(
-                        dim_t              k,
-                        scomplex* restrict alpha,
-                        scomplex* restrict a,
-                        scomplex* restrict b,
-                        scomplex* restrict beta,
-                        scomplex* restrict c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t*         data,
-                        cntx_t*   restrict cntx
-                      )
+void bli_cgemm_opt_4x4
+     (
+       dim_t               k,
+       scomplex*  restrict alpha,
+       scomplex*  restrict a,
+       scomplex*  restrict b,
+       scomplex*  restrict beta,
+       scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
 {
 	/* Just call the reference implementation. */
-	BLIS_CGEMM_UKERNEL_REF( k,
-	                   alpha,
-	                   a,
-	                   b,
-	                   beta,
-	                   c, rs_c, cs_c,
-	                   data,
-	                   cntx );
+	BLIS_CGEMM_UKERNEL_REF
+	(
+	  k,
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c, rs_c, cs_c,
+	  data,
+	  cntx
+	);
 }

-void bli_zgemm_opt_4x4(
-                        dim_t              k,
-                        dcomplex* restrict alpha,
-                        dcomplex* restrict a,
-                        dcomplex* restrict b,
-                        dcomplex* restrict beta,
-                        dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t*         data,
-                        cntx_t*   restrict cntx
-                      )
+void bli_zgemm_opt_4x4
+     (
+       dim_t               k,
+       dcomplex*  restrict alpha,
+       dcomplex*  restrict a,
+       dcomplex*  restrict b,
+       dcomplex*  restrict beta,
+       dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
 {
 	/* Just call the reference implementation. */
-	BLIS_ZGEMM_UKERNEL_REF( k,
-	                   alpha,
-	                   a,
-	                   b,
-	                   beta,
-	                   c, rs_c, cs_c,
-	                   data,
-	                   cntx );
+	BLIS_ZGEMM_UKERNEL_REF
+	(
+	  k,
+	  alpha,
+	  a,
+	  b,
+	  beta,
+	  c, rs_c, cs_c,
+	  data,
+	  cntx
+	);
 }