mirror of
https://github.com/amd/blis.git
synced 2026-05-13 10:35:38 +00:00
Merge branch 'master' into 1m
This commit is contained in:
30
README.md
30
README.md
@@ -259,7 +259,9 @@ We also have a third paper, submitted to IPDPS 2014, on achieving
|
||||
```
|
||||
|
||||
A fourth paper, submitted to ACM TOMS, also exists, which proposes an
|
||||
[analytical model](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf) for determining blocksize parameters in BLIS:
|
||||
[analytical model](http://dl.acm.org/citation.cfm?id=2925987)
|
||||
([unofficial backup link](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf))
|
||||
for determining blocksize parameters in BLIS:
|
||||
|
||||
```
|
||||
@article{BLIS4,
|
||||
@@ -277,6 +279,32 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an
|
||||
}
|
||||
```
|
||||
|
||||
A fifth paper, accepted to ACM TOMS, begins the study of so-called
|
||||
[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf):
|
||||
|
||||
```
|
||||
@article{BLIS5,
|
||||
author = {Field G. {V}an~{Z}ee and Tyler Smith},
|
||||
title = {Implementing high-performance complex matrix multiplication via the 3m and 4m methods},
|
||||
journal = {ACM Transactions on Mathematical Software},
|
||||
year = {2017},
|
||||
note = {accepted}
|
||||
}
|
||||
```
|
||||
|
||||
A sixth paper, submitted to ACM TOMS, revisits the topic of the previous
|
||||
article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf):
|
||||
|
||||
```
|
||||
@article{BLIS6,
|
||||
author = {Field G. {V}an~{Z}ee},
|
||||
title = {Implementing high-performance complex matrix multiplication via the 1m method},
|
||||
journal = {ACM Transactions on Mathematical Software},
|
||||
note = {submitted}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
Funding
|
||||
-------
|
||||
|
||||
|
||||
@@ -50,16 +50,17 @@
|
||||
* Tested on Juno board. Around 3.1 GFLOPS, 1 x A53 core @ 850 MHz.
|
||||
* Tested on Juno board. Around 12 GFLOPS, 4 x A53 cores @ 850 MHz.
|
||||
*/
|
||||
void bli_sgemm_opt_8x12(
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
void bli_sgemm_opt_8x12
|
||||
(
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
@@ -1100,16 +1101,17 @@ __asm__ volatile
|
||||
* Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core @ 850 MHz.
|
||||
* Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz.
|
||||
*/
|
||||
void bli_dgemm_opt_6x8(
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
void bli_dgemm_opt_6x8
|
||||
(
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
@@ -2070,47 +2072,55 @@ __asm__ volatile
|
||||
|
||||
}
|
||||
|
||||
void bli_cgemm_opt_4x4(
|
||||
dim_t k,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
void bli_cgemm_opt_4x4
|
||||
(
|
||||
dim_t k,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
BLIS_CGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
data,
|
||||
cntx );
|
||||
BLIS_CGEMM_UKERNEL_REF
|
||||
(
|
||||
k,
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
data,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
void bli_zgemm_opt_4x4(
|
||||
dim_t k,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
void bli_zgemm_opt_4x4
|
||||
(
|
||||
dim_t k,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
BLIS_ZGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
data,
|
||||
cntx );
|
||||
BLIS_ZGEMM_UKERNEL_REF
|
||||
(
|
||||
k,
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
data,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user