diff --git a/README.md b/README.md index 9bfa84285..1d7b0ce34 100644 --- a/README.md +++ b/README.md @@ -259,7 +259,9 @@ We also have a third paper, submitted to IPDPS 2014, on achieving ``` A fourth paper, submitted to ACM TOMS, also exists, which proposes an -[analytical model](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf) for determining blocksize parameters in BLIS: +[analytical model](http://dl.acm.org/citation.cfm?id=2925987) +([unofficial backup link](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf)) +for determining blocksize parameters in BLIS: ``` @article{BLIS4, @@ -277,6 +279,32 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an } ``` +A fifth paper, submitted to ACM TOMS, begins the study of so-called +[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf): + +``` +@article{BLIS5, + author = {Field G. {V}an~{Z}ee and Tyler Smith}, + title = {Implementing high-performance complex matrix multiplication via the 3m and 4m methods}, + journal = {ACM Transactions on Mathematical Software}, + year = {2017}, + note = {accepted} +} +``` + +A sixth paper, submitted to ACM TOMS, revisits the topic of the previous +article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf): + +``` +@article{BLIS6, + author = {Field G. {V}an~{Z}ee}, + title = {Implementing high-performance complex matrix multiplication via the 1m method}, + journal = {ACM Transactions on Mathematical Software}, + note = {submitted} +} +``` + + Funding ------- diff --git a/kernels/armv8a/3/bli_gemm_opt_4x4.c b/kernels/armv8a/3/bli_gemm_opt_4x4.c index 992750b93..479c2b624 100644 --- a/kernels/armv8a/3/bli_gemm_opt_4x4.c +++ b/kernels/armv8a/3/bli_gemm_opt_4x4.c @@ -50,16 +50,17 @@ * Tested on Juno board. Around 3.1 GFLOPS, 1 x A53 core @ 850 MHz. * Tested on Juno board. Around 12 GFLOPS, 4 x A53 cores @ 850 MHz. 
*/ -void bli_sgemm_opt_8x12( - dim_t k, - float* restrict alpha, - float* restrict a, - float* restrict b, - float* restrict beta, - float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - cntx_t* restrict cntx - ) +void bli_sgemm_opt_8x12 + ( + dim_t k, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); @@ -1100,16 +1101,17 @@ __asm__ volatile * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core @ 850 MHz. * Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz. */ -void bli_dgemm_opt_6x8( - dim_t k, - double* restrict alpha, - double* restrict a, - double* restrict b, - double* restrict beta, - double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - cntx_t* restrict cntx - ) +void bli_dgemm_opt_6x8 + ( + dim_t k, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); @@ -2070,47 +2072,55 @@ __asm__ volatile } -void bli_cgemm_opt_4x4( - dim_t k, - scomplex* restrict alpha, - scomplex* restrict a, - scomplex* restrict b, - scomplex* restrict beta, - scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - cntx_t* restrict cntx - ) +void bli_cgemm_opt_4x4 + ( + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* restrict beta, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) { /* Just call the reference implementation. 
*/ - BLIS_CGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data, - cntx ); + BLIS_CGEMM_UKERNEL_REF + ( + k, + alpha, + a, + b, + beta, + c, rs_c, cs_c, + data, + cntx + ); } -void bli_zgemm_opt_4x4( - dim_t k, - dcomplex* restrict alpha, - dcomplex* restrict a, - dcomplex* restrict b, - dcomplex* restrict beta, - dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - cntx_t* restrict cntx - ) +void bli_zgemm_opt_4x4 + ( + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) { /* Just call the reference implementation. */ - BLIS_ZGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data, - cntx ); + BLIS_ZGEMM_UKERNEL_REF + ( + k, + alpha, + a, + b, + beta, + c, rs_c, cs_c, + data, + cntx + ); }