Merge branch 'master' into 1m

This commit is contained in:
Field G. Van Zee
2017-05-02 12:30:28 -05:00
2 changed files with 95 additions and 57 deletions

View File

@@ -259,7 +259,9 @@ We also have a third paper, submitted to IPDPS 2014, on achieving
```
A fourth paper, submitted to ACM TOMS, also exists, which proposes an
[analytical model](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf) for determining blocksize parameters in BLIS:
[analytical model](http://dl.acm.org/citation.cfm?id=2925987)
([unofficial backup link](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf))
for determining blocksize parameters in BLIS:
```
@article{BLIS4,
@@ -277,6 +279,32 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an
}
```
A fifth paper, submitted to ACM TOMS, begins the study of so-called
[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf):
```
@article{BLIS5,
author = {Field G. {V}an~{Z}ee and Tyler Smith},
title = {Implementing high-performance complex matrix multiplication via the 3m and 4m methods},
journal = {ACM Transactions on Mathematical Software},
year = {2017},
note = {accepted}
}
```
A sixth paper, submitted to ACM TOMS, revisits the topic of the previous
article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf):
```
@article{BLIS6,
author = {Field G. {V}an~{Z}ee},
title = {Implementing high-performance complex matrix multiplication via the 1m method},
journal = {ACM Transactions on Mathematical Software},
note = {submitted}
}
```
Funding
-------

View File

@@ -50,16 +50,17 @@
* Tested on Juno board. Around 3.1 GFLOPS, 1 x A53 core @ 850 MHz.
* Tested on Juno board. Around 12 GFLOPS, 4 x A53 cores @ 850 MHz.
*/
void bli_sgemm_opt_8x12(
dim_t k,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data,
cntx_t* restrict cntx
)
void bli_sgemm_opt_8x12
(
dim_t k,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
@@ -1100,16 +1101,17 @@ __asm__ volatile
* Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core @ 850 MHz.
* Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz.
*/
void bli_dgemm_opt_6x8(
dim_t k,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data,
cntx_t* restrict cntx
)
void bli_dgemm_opt_6x8
(
dim_t k,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
@@ -2070,47 +2072,55 @@ __asm__ volatile
}
/*
 * bli_cgemm_opt_4x4: entry point for scomplex operands that performs no
 * computation of its own. Every argument (k, alpha, a, b, beta, c, rs_c,
 * cs_c, data, cntx) is handed unchanged, in the same order, to
 * BLIS_CGEMM_UKERNEL_REF.
 *
 * NOTE(review): the signature and the forwarding call each appear twice
 * below -- once in an older layout with the opening parenthesis on the
 * name line, and once in a newer layout with the parentheses on their own
 * lines. The newer signature also adds `restrict` to `data`. Presumably
 * only the newer form belongs in the compiled file -- confirm against the
 * repository.
 * NOTE(review): rs_c / cs_c look like the row and column strides of c --
 * confirm against the BLIS micro-kernel interface.
 */
void bli_cgemm_opt_4x4(
dim_t k,
scomplex* restrict alpha,
scomplex* restrict a,
scomplex* restrict b,
scomplex* restrict beta,
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data,
cntx_t* restrict cntx
)
void bli_cgemm_opt_4x4
(
dim_t k,
scomplex* restrict alpha,
scomplex* restrict a,
scomplex* restrict b,
scomplex* restrict beta,
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
/* Just call the reference implementation. */
BLIS_CGEMM_UKERNEL_REF( k,
alpha,
a,
b,
beta,
c, rs_c, cs_c,
data,
cntx );
BLIS_CGEMM_UKERNEL_REF
(
k,
alpha,
a,
b,
beta,
c, rs_c, cs_c,
data,
cntx
);
}
/*
 * bli_zgemm_opt_4x4: entry point for dcomplex operands that performs no
 * computation of its own. Every argument (k, alpha, a, b, beta, c, rs_c,
 * cs_c, data, cntx) is handed unchanged, in the same order, to
 * BLIS_ZGEMM_UKERNEL_REF.
 *
 * NOTE(review): the signature and the forwarding call each appear twice
 * below -- once in an older layout with the opening parenthesis on the
 * name line, and once in a newer layout with the parentheses on their own
 * lines. The newer signature also adds `restrict` to `data`. Presumably
 * only the newer form belongs in the compiled file -- confirm against the
 * repository.
 * NOTE(review): rs_c / cs_c look like the row and column strides of c --
 * confirm against the BLIS micro-kernel interface.
 */
void bli_zgemm_opt_4x4(
dim_t k,
dcomplex* restrict alpha,
dcomplex* restrict a,
dcomplex* restrict b,
dcomplex* restrict beta,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data,
cntx_t* restrict cntx
)
void bli_zgemm_opt_4x4
(
dim_t k,
dcomplex* restrict alpha,
dcomplex* restrict a,
dcomplex* restrict b,
dcomplex* restrict beta,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* restrict data,
cntx_t* restrict cntx
)
{
/* Just call the reference implementation. */
BLIS_ZGEMM_UKERNEL_REF( k,
alpha,
a,
b,
beta,
c, rs_c, cs_c,
data,
cntx );
BLIS_ZGEMM_UKERNEL_REF
(
k,
alpha,
a,
b,
beta,
c, rs_c, cs_c,
data,
cntx
);
}