mirror of
https://github.com/amd/blis.git
synced 2026-06-06 04:34:02 +00:00
Updated copyright headers of emscripten configuration files.
This commit is contained in:
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -14,9 +14,9 @@
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -14,9 +14,9 @@
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -14,9 +14,9 @@
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name of The University of Texas nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
# - Neither the name of The University of Texas at Austin nor the names
|
||||
# of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
|
||||
@@ -48,7 +48,7 @@
|
||||
// of the C99 type "long int". Note that this ONLY affects integers used
|
||||
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
|
||||
// interface.
|
||||
#define BLIS_INT_TYPE_SIZE 32
|
||||
#define BLIS_INT_TYPE_SIZE 64
|
||||
|
||||
|
||||
|
||||
@@ -99,7 +99,7 @@
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
#define BLIS_SIMD_ALIGN_SIZE 32
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
@@ -151,7 +151,7 @@
|
||||
// while 64 results in 64-bit integers. Any other value results in use of the
|
||||
// C99 type "long int". Note that this ONLY affects integers used within the
|
||||
// BLAS compatibility layer.
|
||||
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
|
||||
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64
|
||||
|
||||
// Fortran-77 name-mangling macros.
|
||||
#define PASTEF770(name) name ## _
|
||||
|
||||
@@ -48,7 +48,7 @@
|
||||
// of the C99 type "long int". Note that this ONLY affects integers used
|
||||
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
|
||||
// interface.
|
||||
#define BLIS_INT_TYPE_SIZE 32
|
||||
#define BLIS_INT_TYPE_SIZE 64
|
||||
|
||||
|
||||
|
||||
@@ -69,7 +69,7 @@
|
||||
// -- MULTITHREADING -----------------------------------------------------------
|
||||
|
||||
// The maximum number of BLIS threads that will run concurrently.
|
||||
#define BLIS_MAX_NUM_THREADS 24
|
||||
#define BLIS_MAX_NUM_THREADS 1
|
||||
|
||||
|
||||
|
||||
@@ -80,7 +80,7 @@
|
||||
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
|
||||
// contiguous memory pools.
|
||||
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_KC_X_NC_BLOCKS 1
|
||||
#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_MC_X_NC_BLOCKS 0
|
||||
|
||||
// The maximum preload byte offset is used to pad the end of the contiguous
|
||||
@@ -151,7 +151,7 @@
|
||||
// while 64 results in 64-bit integers. Any other value results in use of the
|
||||
// C99 type "long int". Note that this ONLY affects integers used within the
|
||||
// BLAS compatibility layer.
|
||||
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
|
||||
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64
|
||||
|
||||
// Fortran-77 name-mangling macros.
|
||||
#define PASTEF770(name) name ## _
|
||||
|
||||
@@ -54,12 +54,12 @@
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 64
|
||||
#define BLIS_DEFAULT_KC_S 128
|
||||
#define BLIS_DEFAULT_MC_S 256
|
||||
#define BLIS_DEFAULT_KC_S 384
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 96
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_MC_D 128
|
||||
#define BLIS_DEFAULT_KC_D 384
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 64
|
||||
@@ -70,10 +70,13 @@
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
//#define BLIS_DEFAULT_4M_MC_Z 128
|
||||
//#define BLIS_DEFAULT_4M_KC_Z 128
|
||||
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 8
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 8
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
@@ -152,7 +155,10 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4_ref_u4_nodupl_avx1
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8
|
||||
|
||||
//#define BLIS_DGEMM_UKERNEL bli_dgemm_int_8x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
|
||||
@@ -85,6 +85,7 @@ CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O3 -march=native
|
||||
#COPTFLAGS := -O1 -march=native
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
CVECFLAGS := -mavx -mfpmath=sse #-msse3 -march=native # -mfpmath=sse
|
||||
|
||||
|
||||
@@ -1,675 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <immintrin.h>
|
||||
|
||||
|
||||
|
||||
void bli_sgemm_opt_8x4_ref_u4_nodupl_avx1(
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
BLIS_SGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
data );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dgemm_opt_8x4_ref_u4_nodupl_avx1(
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
//void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 2;
|
||||
dim_t k_left = k % 2;
|
||||
|
||||
dim_t i;
|
||||
|
||||
double *c00, *c01, *c02, *c03;
|
||||
double *c40, *c41, *c42, *c43;
|
||||
|
||||
// Quad registers.
|
||||
__m256d va0_3, va4_7;
|
||||
__m256d vA0_3, vA4_7;
|
||||
__m256d vb0, vb1, vb2, vb3;
|
||||
__m256d vb;
|
||||
__m256d vB0;
|
||||
|
||||
__m256d va0_3b_0, va4_7b_0;
|
||||
__m256d va0_3b_1, va4_7b_1;
|
||||
__m256d va0_3b_2, va4_7b_2;
|
||||
__m256d va0_3b_3, va4_7b_3;
|
||||
|
||||
__m256d va0_3b0, va4_7b0;
|
||||
__m256d va0_3b1, va4_7b1;
|
||||
__m256d va0_3b2, va4_7b2;
|
||||
__m256d va0_3b3, va4_7b3;
|
||||
|
||||
|
||||
__m256d valpha, vbeta, vtmp;
|
||||
__m256d vc0_3_0, vc0_3_1, vc0_3_2, vc0_3_3;
|
||||
__m256d vc4_7_0, vc4_7_1, vc4_7_2, vc4_7_3;
|
||||
|
||||
__m128d aa, bb;
|
||||
|
||||
__asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(a) );
|
||||
__asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"(b_next) );
|
||||
__asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(c) );
|
||||
|
||||
va0_3b0 = _mm256_setzero_pd();
|
||||
va0_3b1 = _mm256_setzero_pd();
|
||||
va0_3b2 = _mm256_setzero_pd();
|
||||
va0_3b3 = _mm256_setzero_pd();
|
||||
|
||||
va4_7b0 = _mm256_setzero_pd();
|
||||
va4_7b1 = _mm256_setzero_pd();
|
||||
va4_7b2 = _mm256_setzero_pd();
|
||||
va4_7b3 = _mm256_setzero_pd();
|
||||
|
||||
va0_3b_0 = _mm256_setzero_pd();
|
||||
va0_3b_1 = _mm256_setzero_pd();
|
||||
va0_3b_2 = _mm256_setzero_pd();
|
||||
va0_3b_3 = _mm256_setzero_pd();
|
||||
|
||||
va4_7b_0 = _mm256_setzero_pd();
|
||||
va4_7b_1 = _mm256_setzero_pd();
|
||||
va4_7b_2 = _mm256_setzero_pd();
|
||||
va4_7b_3 = _mm256_setzero_pd();
|
||||
|
||||
// Load va0_3
|
||||
va0_3 = _mm256_load_pd( a );
|
||||
// Load va4_7
|
||||
va4_7 = _mm256_load_pd( a + 4 );
|
||||
|
||||
// Load vb (b0,b1,b2,b3)
|
||||
vb0 = _mm256_load_pd( b );
|
||||
|
||||
for( i = 0; i < k_iter; ++i )
|
||||
{
|
||||
__asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) );
|
||||
|
||||
// Load va0_3 (Prefetch)
|
||||
vA0_3 = _mm256_load_pd( a + 8 );
|
||||
|
||||
// Iteration 0.
|
||||
vtmp = _mm256_mul_pd( va0_3, vb0 );
|
||||
va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
|
||||
|
||||
vtmp = _mm256_mul_pd( va4_7, vb0 );
|
||||
va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
|
||||
|
||||
// Load va4_7 (Prefetch)
|
||||
vA4_7 = _mm256_load_pd( a + 12 );
|
||||
|
||||
// Shuffle vb (b1,b0,b3,b2)
|
||||
vb1 = _mm256_shuffle_pd( vb0, vb0, 0x5 );
|
||||
|
||||
vtmp = _mm256_mul_pd( va0_3, vb1 );
|
||||
va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
|
||||
|
||||
vtmp = _mm256_mul_pd( va4_7, vb1 );
|
||||
va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
|
||||
|
||||
// Permute vb (b3,b2,b1,b0)
|
||||
vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
|
||||
|
||||
// Load vb (b0,b1,b2,b3) (Prefetch)
|
||||
vB0 = _mm256_load_pd( b + 4 );
|
||||
|
||||
vtmp = _mm256_mul_pd( va0_3, vb2 );
|
||||
va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
|
||||
|
||||
vtmp = _mm256_mul_pd( va4_7, vb2 );
|
||||
va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
|
||||
|
||||
// Shuffle vb (b3,b2,b1,b0)
|
||||
vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
|
||||
|
||||
vtmp = _mm256_mul_pd( va0_3, vb3 );
|
||||
va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
|
||||
|
||||
vtmp = _mm256_mul_pd( va4_7, vb3 );
|
||||
va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
|
||||
|
||||
// Iteration 1.
|
||||
|
||||
__asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) );
|
||||
|
||||
// Load va0_3 (Next iteration)
|
||||
va0_3 = _mm256_load_pd( a + 16 );
|
||||
|
||||
vtmp = _mm256_mul_pd( vA0_3, vB0 );
|
||||
va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
|
||||
|
||||
vb1 = _mm256_shuffle_pd( vB0, vB0, 0x5 );
|
||||
|
||||
vtmp = _mm256_mul_pd( vA4_7, vB0 );
|
||||
va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
|
||||
|
||||
vtmp = _mm256_mul_pd( vA0_3, vb1 );
|
||||
va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
|
||||
|
||||
// Load va4_7 (Next iteration)
|
||||
va4_7 = _mm256_load_pd( a + 20 );
|
||||
|
||||
vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
|
||||
|
||||
vtmp = _mm256_mul_pd( vA4_7, vb1 );
|
||||
va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
|
||||
|
||||
vtmp = _mm256_mul_pd( vA0_3, vb2 );
|
||||
va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
|
||||
|
||||
vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
|
||||
|
||||
vtmp = _mm256_mul_pd( vA4_7, vb2 );
|
||||
va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
|
||||
|
||||
// Load vb0(Next iteration)
|
||||
vb0 = _mm256_load_pd( b + 8 );
|
||||
|
||||
vtmp = _mm256_mul_pd( vA0_3, vb3 );
|
||||
va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
|
||||
|
||||
vtmp = _mm256_mul_pd( vA4_7, vb3 );
|
||||
va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
|
||||
|
||||
a += 16;
|
||||
b += 8;
|
||||
|
||||
}
|
||||
|
||||
for( i = 0; i < k_left; ++i )
|
||||
{
|
||||
// Iteration 0.
|
||||
|
||||
// Load va0_3
|
||||
va0_3 = _mm256_load_pd( a );
|
||||
// Load va4_7
|
||||
va4_7 = _mm256_load_pd( a + 4 );
|
||||
|
||||
// Load vb (b0,b1,b2,b3)
|
||||
vb = _mm256_load_pd( b );
|
||||
|
||||
vtmp = _mm256_mul_pd( va0_3, vb );
|
||||
va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
|
||||
|
||||
vtmp = _mm256_mul_pd( va4_7, vb );
|
||||
va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
|
||||
|
||||
// Shuffle vb (b1,b0,b3,b2)
|
||||
vb = _mm256_shuffle_pd( vb, vb, 0x5 );
|
||||
|
||||
vtmp = _mm256_mul_pd( va0_3, vb );
|
||||
va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
|
||||
|
||||
vtmp = _mm256_mul_pd( va4_7, vb );
|
||||
va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
|
||||
|
||||
// Permute vb (b3,b2,b1,b0)
|
||||
vb = _mm256_permute2f128_pd( vb, vb, 0x1 );
|
||||
|
||||
vtmp = _mm256_mul_pd( va0_3, vb );
|
||||
va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
|
||||
|
||||
vtmp = _mm256_mul_pd( va4_7, vb );
|
||||
va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
|
||||
|
||||
// Shuffle vb (b3,b2,b1,b0)
|
||||
vb = _mm256_shuffle_pd( vb, vb, 0x5 );
|
||||
|
||||
vtmp = _mm256_mul_pd( va0_3, vb );
|
||||
va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
|
||||
|
||||
vtmp = _mm256_mul_pd( va4_7, vb );
|
||||
va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
|
||||
|
||||
a += 8;
|
||||
b += 4;
|
||||
|
||||
}
|
||||
|
||||
vbeta = _mm256_broadcast_sd( beta );
|
||||
|
||||
__m256d vtmpa_0_3b_0 = _mm256_blend_pd( va0_3b_0, va0_3b_1, 0x6 );
|
||||
__m256d vtmpa_0_3b_1 = _mm256_blend_pd( va0_3b_1, va0_3b_0, 0x6 );
|
||||
|
||||
__m256d vtmpa_0_3b_2 = _mm256_blend_pd( va0_3b_2, va0_3b_3, 0x6 );
|
||||
__m256d vtmpa_0_3b_3 = _mm256_blend_pd( va0_3b_3, va0_3b_2, 0x6 );
|
||||
|
||||
__m256d vtmpa_4_7b_0 = _mm256_blend_pd( va4_7b_0, va4_7b_1, 0x6 );
|
||||
__m256d vtmpa_4_7b_1 = _mm256_blend_pd( va4_7b_1, va4_7b_0, 0x6 );
|
||||
|
||||
__m256d vtmpa_4_7b_2 = _mm256_blend_pd( va4_7b_2, va4_7b_3, 0x6 );
|
||||
__m256d vtmpa_4_7b_3 = _mm256_blend_pd( va4_7b_3, va4_7b_2, 0x6 );
|
||||
|
||||
valpha = _mm256_broadcast_sd( alpha );
|
||||
|
||||
va0_3b0 = _mm256_permute2f128_pd( vtmpa_0_3b_0, vtmpa_0_3b_2, 0x30 );
|
||||
va0_3b3 = _mm256_permute2f128_pd( vtmpa_0_3b_2, vtmpa_0_3b_0, 0x30 );
|
||||
|
||||
va0_3b1 = _mm256_permute2f128_pd( vtmpa_0_3b_1, vtmpa_0_3b_3, 0x30 );
|
||||
va0_3b2 = _mm256_permute2f128_pd( vtmpa_0_3b_3, vtmpa_0_3b_1, 0x30 );
|
||||
|
||||
va4_7b0 = _mm256_permute2f128_pd( vtmpa_4_7b_0, vtmpa_4_7b_2, 0x30 );
|
||||
va4_7b3 = _mm256_permute2f128_pd( vtmpa_4_7b_2, vtmpa_4_7b_0, 0x30 );
|
||||
|
||||
va4_7b1 = _mm256_permute2f128_pd( vtmpa_4_7b_1, vtmpa_4_7b_3, 0x30 );
|
||||
va4_7b2 = _mm256_permute2f128_pd( vtmpa_4_7b_3, vtmpa_4_7b_1, 0x30 );
|
||||
|
||||
if( rs_c == 1 )
|
||||
{
|
||||
// Calculate address
|
||||
c00 = ( c + 0*rs_c + 0*cs_c );
|
||||
// Load
|
||||
//vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c );
|
||||
vc0_3_0 = _mm256_load_pd( c00 );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va0_3b0);
|
||||
// Scale by beta
|
||||
vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 );
|
||||
// Add gemm result
|
||||
vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
|
||||
// Store back to memory
|
||||
_mm256_store_pd( c00, vc0_3_0 );
|
||||
|
||||
// Calculate address
|
||||
c40 = ( c + 4*rs_c + 0*cs_c );
|
||||
// Load
|
||||
//vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c );
|
||||
vc4_7_0 = _mm256_load_pd( c40 );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va4_7b0);
|
||||
// Scale by beta
|
||||
vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 );
|
||||
// Add gemm result
|
||||
vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
|
||||
// Store back to memory
|
||||
_mm256_store_pd( c40, vc4_7_0 );
|
||||
|
||||
// Calculate address
|
||||
c01 = ( c + 0*rs_c + 1*cs_c );
|
||||
// Load
|
||||
//vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c );
|
||||
vc0_3_1 = _mm256_load_pd( c01 );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va0_3b1);
|
||||
// Scale by beta
|
||||
vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 );
|
||||
// Add gemm result
|
||||
vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
|
||||
// Store back to memory
|
||||
_mm256_store_pd( c01, vc0_3_1 );
|
||||
|
||||
// Calculate address
|
||||
c41 = ( c + 4*rs_c + 1*cs_c );
|
||||
// Load
|
||||
//vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c );
|
||||
vc4_7_1 = _mm256_load_pd( c41 );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va4_7b1);
|
||||
// Scale by beta
|
||||
vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 );
|
||||
// Add gemm result
|
||||
vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
|
||||
// Store back to memory
|
||||
_mm256_store_pd( c41, vc4_7_1 );
|
||||
|
||||
// Calculate address
|
||||
c02 = ( c + 0*rs_c + 2*cs_c );
|
||||
// Load
|
||||
//vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c );
|
||||
vc0_3_2 = _mm256_load_pd( c02 );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va0_3b2);
|
||||
// Scale by beta
|
||||
vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 );
|
||||
// Add gemm result
|
||||
vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
|
||||
// Store back to memory
|
||||
_mm256_store_pd( c02, vc0_3_2 );
|
||||
|
||||
// Calculate address
|
||||
c42 = ( c + 4*rs_c + 2*cs_c );
|
||||
// Load
|
||||
//vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c );
|
||||
vc4_7_2 = _mm256_load_pd( c42 );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va4_7b2);
|
||||
// Scale by beta
|
||||
vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 );
|
||||
// Add gemm result
|
||||
vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
|
||||
// Store back to memory
|
||||
_mm256_store_pd( c42, vc4_7_2 );
|
||||
|
||||
// Calculate address
|
||||
c03 = ( c + 0*rs_c + 3*cs_c );
|
||||
// Load
|
||||
//vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c );
|
||||
vc0_3_3 = _mm256_load_pd( c03 );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va0_3b3);
|
||||
// Scale by beta
|
||||
vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 );
|
||||
// Add gemm result
|
||||
vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
|
||||
// Store back to memory
|
||||
_mm256_store_pd( c03, vc0_3_3 );
|
||||
|
||||
// Calculate address
|
||||
c43 = ( c + 4*rs_c + 3*cs_c );
|
||||
// Load
|
||||
//vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c );
|
||||
vc4_7_3 = _mm256_load_pd( c43 );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va4_7b3);
|
||||
// Scale by beta
|
||||
vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 );
|
||||
// Add gemm result
|
||||
vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
|
||||
// Store back to memory
|
||||
_mm256_store_pd( c43, vc4_7_3 );
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
// Calculate address
|
||||
c00 = ( c + 0*rs_c + 0*cs_c );
|
||||
// Load
|
||||
//vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c );
|
||||
vc0_3_0 = _mm256_set_pd( *(c + 3*rs_c + 0*cs_c ),
|
||||
*(c + 2*rs_c + 0*cs_c ),
|
||||
*(c + 1*rs_c + 0*cs_c ),
|
||||
*(c + 0*rs_c + 0*cs_c ) );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va0_3b0);
|
||||
// Scale by beta
|
||||
vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 );
|
||||
// Add gemm result
|
||||
vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
|
||||
// Store back to memory
|
||||
//_mm256_store_pd( c00, vc0_3_0 );
|
||||
|
||||
aa = _mm256_extractf128_pd( vc0_3_0, 0 ) ;
|
||||
bb = _mm256_extractf128_pd( vc0_3_0, 1 ) ;
|
||||
|
||||
_mm_storel_pd( c + 0*rs_c + 0*cs_c, aa );
|
||||
_mm_storeh_pd( c + 1*rs_c + 0*cs_c, aa );
|
||||
_mm_storel_pd( c + 2*rs_c + 0*cs_c, bb );
|
||||
_mm_storeh_pd( c + 3*rs_c + 0*cs_c, bb );
|
||||
|
||||
// Calculate address
|
||||
c40 = ( c + 4*rs_c + 0*cs_c );
|
||||
// Load
|
||||
//vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c );
|
||||
vc4_7_0 = _mm256_set_pd( *(c + 7*rs_c + 0*cs_c ),
|
||||
*(c + 6*rs_c + 0*cs_c ),
|
||||
*(c + 5*rs_c + 0*cs_c ),
|
||||
*(c + 4*rs_c + 0*cs_c ) );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va4_7b0);
|
||||
// Scale by beta
|
||||
vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 );
|
||||
// Add gemm result
|
||||
vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
|
||||
// Store back to memory
|
||||
//_mm256_store_pd( c40, vc4_7_0 );
|
||||
|
||||
aa = _mm256_extractf128_pd( vc4_7_0, 0 ) ;
|
||||
bb = _mm256_extractf128_pd( vc4_7_0, 1 ) ;
|
||||
|
||||
_mm_storel_pd( c + 4*rs_c + 0*cs_c, aa );
|
||||
_mm_storeh_pd( c + 5*rs_c + 0*cs_c, aa );
|
||||
_mm_storel_pd( c + 6*rs_c + 0*cs_c, bb );
|
||||
_mm_storeh_pd( c + 7*rs_c + 0*cs_c, bb );
|
||||
|
||||
// Calculate address
|
||||
c01 = ( c + 0*rs_c + 1*cs_c );
|
||||
// Load
|
||||
//vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c );
|
||||
vc0_3_1 = _mm256_set_pd( *(c + 3*rs_c + 1*cs_c ),
|
||||
*(c + 2*rs_c + 1*cs_c ),
|
||||
*(c + 1*rs_c + 1*cs_c ),
|
||||
*(c + 0*rs_c + 1*cs_c ) );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va0_3b1);
|
||||
// Scale by beta
|
||||
vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 );
|
||||
// Add gemm result
|
||||
vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
|
||||
// Store back to memory
|
||||
//_mm256_store_pd( c01, vc0_3_1 );
|
||||
|
||||
aa = _mm256_extractf128_pd( vc0_3_1, 0 ) ;
|
||||
bb = _mm256_extractf128_pd( vc0_3_1, 1 ) ;
|
||||
|
||||
_mm_storel_pd( c + 0*rs_c + 1*cs_c, aa );
|
||||
_mm_storeh_pd( c + 1*rs_c + 1*cs_c, aa );
|
||||
_mm_storel_pd( c + 2*rs_c + 1*cs_c, bb );
|
||||
_mm_storeh_pd( c + 3*rs_c + 1*cs_c, bb );
|
||||
|
||||
// Calculate address
|
||||
c41 = ( c + 4*rs_c + 1*cs_c );
|
||||
// Load
|
||||
//vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c );
|
||||
vc4_7_1 = _mm256_set_pd( *(c + 7*rs_c + 1*cs_c ),
|
||||
*(c + 6*rs_c + 1*cs_c ),
|
||||
*(c + 5*rs_c + 1*cs_c ),
|
||||
*(c + 4*rs_c + 1*cs_c ) );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va4_7b1);
|
||||
// Scale by beta
|
||||
vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 );
|
||||
// Add gemm result
|
||||
vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
|
||||
// Store back to memory
|
||||
//_mm256_store_pd( c41, vc4_7_1 );
|
||||
|
||||
aa = _mm256_extractf128_pd( vc4_7_1, 0 ) ;
|
||||
bb = _mm256_extractf128_pd( vc4_7_1, 1 ) ;
|
||||
|
||||
_mm_storel_pd( c + 4*rs_c + 1*cs_c, aa );
|
||||
_mm_storeh_pd( c + 5*rs_c + 1*cs_c, aa );
|
||||
_mm_storel_pd( c + 6*rs_c + 1*cs_c, bb );
|
||||
_mm_storeh_pd( c + 7*rs_c + 1*cs_c, bb );
|
||||
|
||||
// Calculate address
|
||||
c02 = ( c + 0*rs_c + 2*cs_c );
|
||||
// Load
|
||||
//vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c );
|
||||
vc0_3_2 = _mm256_set_pd( *(c + 3*rs_c + 2*cs_c ),
|
||||
*(c + 2*rs_c + 2*cs_c ),
|
||||
*(c + 1*rs_c + 2*cs_c ),
|
||||
*(c + 0*rs_c + 2*cs_c ) );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va0_3b2);
|
||||
// Scale by beta
|
||||
vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 );
|
||||
// Add gemm result
|
||||
vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
|
||||
// Store back to memory
|
||||
//_mm256_store_pd( c02, vc0_3_2 );
|
||||
|
||||
aa = _mm256_extractf128_pd( vc0_3_2, 0 ) ;
|
||||
bb = _mm256_extractf128_pd( vc0_3_2, 1 ) ;
|
||||
|
||||
_mm_storel_pd( c + 0*rs_c + 2*cs_c, aa );
|
||||
_mm_storeh_pd( c + 1*rs_c + 2*cs_c, aa );
|
||||
_mm_storel_pd( c + 2*rs_c + 2*cs_c, bb );
|
||||
_mm_storeh_pd( c + 3*rs_c + 2*cs_c, bb );
|
||||
|
||||
// Calculate address
|
||||
c42 = ( c + 4*rs_c + 2*cs_c );
|
||||
// Load
|
||||
//vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c );
|
||||
vc4_7_2 = _mm256_set_pd( *(c + 7*rs_c + 2*cs_c ),
|
||||
*(c + 6*rs_c + 2*cs_c ),
|
||||
*(c + 5*rs_c + 2*cs_c ),
|
||||
*(c + 4*rs_c + 2*cs_c ) );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va4_7b2);
|
||||
// Scale by beta
|
||||
vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 );
|
||||
// Add gemm result
|
||||
vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
|
||||
// Store back to memory
|
||||
//_mm256_store_pd( c42, vc4_7_2 );
|
||||
|
||||
aa = _mm256_extractf128_pd( vc4_7_2, 0 ) ;
|
||||
bb = _mm256_extractf128_pd( vc4_7_2, 1 ) ;
|
||||
|
||||
_mm_storel_pd( c + 4*rs_c + 2*cs_c, aa );
|
||||
_mm_storeh_pd( c + 5*rs_c + 2*cs_c, aa );
|
||||
_mm_storel_pd( c + 6*rs_c + 2*cs_c, bb );
|
||||
_mm_storeh_pd( c + 7*rs_c + 2*cs_c, bb );
|
||||
|
||||
// Calculate address
|
||||
c03 = ( c + 0*rs_c + 3*cs_c );
|
||||
// Load
|
||||
//vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c );
|
||||
vc0_3_3 = _mm256_set_pd( *(c + 3*rs_c + 3*cs_c ),
|
||||
*(c + 2*rs_c + 3*cs_c ),
|
||||
*(c + 1*rs_c + 3*cs_c ),
|
||||
*(c + 0*rs_c + 3*cs_c ) );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va0_3b3);
|
||||
// Scale by beta
|
||||
vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 );
|
||||
// Add gemm result
|
||||
vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
|
||||
// Store back to memory
|
||||
//_mm256_store_pd( c03, vc0_3_3 );
|
||||
|
||||
aa = _mm256_extractf128_pd( vc0_3_3, 0 ) ;
|
||||
bb = _mm256_extractf128_pd( vc0_3_3, 1 ) ;
|
||||
|
||||
_mm_storel_pd( c + 0*rs_c + 3*cs_c, aa );
|
||||
_mm_storeh_pd( c + 1*rs_c + 3*cs_c, aa );
|
||||
_mm_storel_pd( c + 2*rs_c + 3*cs_c, bb );
|
||||
_mm_storeh_pd( c + 3*rs_c + 3*cs_c, bb );
|
||||
|
||||
// Calculate address
|
||||
c43 = ( c + 4*rs_c + 3*cs_c );
|
||||
// Load
|
||||
//vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c );
|
||||
vc4_7_3 = _mm256_set_pd( *(c + 7*rs_c + 3*cs_c ),
|
||||
*(c + 6*rs_c + 3*cs_c ),
|
||||
*(c + 5*rs_c + 3*cs_c ),
|
||||
*(c + 4*rs_c + 3*cs_c ) );
|
||||
// Scale by alpha
|
||||
vtmp = _mm256_mul_pd( valpha, va4_7b3);
|
||||
// Scale by beta
|
||||
vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 );
|
||||
// Add gemm result
|
||||
vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
|
||||
// Store back to memory
|
||||
//_mm256_store_pd( c43, vc4_7_3 );
|
||||
|
||||
aa = _mm256_extractf128_pd( vc4_7_3, 0 ) ;
|
||||
bb = _mm256_extractf128_pd( vc4_7_3, 1 ) ;
|
||||
|
||||
_mm_storel_pd( c + 4*rs_c + 3*cs_c, aa );
|
||||
_mm_storeh_pd( c + 5*rs_c + 3*cs_c, aa );
|
||||
_mm_storel_pd( c + 6*rs_c + 3*cs_c, bb );
|
||||
_mm_storeh_pd( c + 7*rs_c + 3*cs_c, bb );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cgemm_opt_8x4_ref_u4_nodupl_avx1(
|
||||
dim_t k,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
BLIS_CGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
data );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zgemm_opt_8x4_ref_u4_nodupl_avx1(
|
||||
dim_t k,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
BLIS_ZGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
data );
|
||||
}
|
||||
|
||||
@@ -126,7 +126,7 @@ BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a
|
||||
|
||||
# BLAS library path(s). This is where the BLAS libraries reside.
|
||||
BLAS_LIB_PATH := $(HOME)/flame/lib
|
||||
MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64/
|
||||
MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
|
||||
ESSL_LIB_PATH := $(HOME)/path/to/essl/changeme
|
||||
|
||||
# OpenBLAS
|
||||
@@ -174,7 +174,7 @@ TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \
|
||||
CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
|
||||
|
||||
LINKER := $(CC)
|
||||
LDFLAGS := -L/home/00146/field/gnu/gcc-4.8.2/lib64
|
||||
LDFLAGS := #-L/home/00146/field/gnu/gcc-4.8.2/lib64
|
||||
LDFLAGS += -lgfortran -lm -lpthread
|
||||
|
||||
|
||||
@@ -187,7 +187,7 @@ LDFLAGS += -lgfortran -lm -lpthread
|
||||
#
|
||||
# blis openblas atlas mkl mac essl
|
||||
#
|
||||
all: blis openblas atlas mkl
|
||||
all: blis openblas #mkl
|
||||
|
||||
blis: test_gemv_blis.x \
|
||||
test_ger_blis.x \
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -14,9 +14,9 @@
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
@@ -81,7 +81,11 @@ int main( int argc, char** argv )
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DOUBLE;
|
||||
dt_a = BLIS_DOUBLE;
|
||||
dt_b = BLIS_DOUBLE;
|
||||
dt_c = BLIS_DOUBLE;
|
||||
dt_alpha = BLIS_DOUBLE;
|
||||
dt_beta = BLIS_DOUBLE;
|
||||
#else
|
||||
dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
|
||||
#endif
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
# accepted values.
|
||||
#
|
||||
|
||||
1 # Number of repeats per experiment (best result is reported)
|
||||
3 # Number of repeats per experiment (best result is reported)
|
||||
c # Matrix storage scheme(s) to test:
|
||||
# 'c' = col-major storage; 'g' = general stride storage;
|
||||
# 'r' = row-major storage
|
||||
@@ -17,12 +17,12 @@ c # Vector storage scheme(s) to test:
|
||||
# 'r' = rowvec / unit stride; 'i' = rowvec / non-unit stride
|
||||
0 # Test all combinations of storage schemes?
|
||||
32 # General stride spacing (for cases when testing general stride)
|
||||
sdcz # Datatype(s) to test:
|
||||
s #sdcz # Datatype(s) to test:
|
||||
# 's' = single real; 'c' = single complex;
|
||||
# 'd' = double real; 'z' = double complex
|
||||
100 # Problem size: first to test
|
||||
300 # Problem size: maximum to test
|
||||
100 # Problem size: increment between experiments
|
||||
128 # Problem size: first to test
|
||||
2048 # Problem size: maximum to test
|
||||
128 # Problem size: increment between experiments
|
||||
1 # Error-checking level:
|
||||
# '0' = disable error checking; '1' = full error checking
|
||||
i # Reaction to test failure:
|
||||
|
||||
@@ -78,11 +78,11 @@
|
||||
# --- Section overrides ----------------------------------------------------
|
||||
|
||||
1 # Utility
|
||||
1 # Level-1v
|
||||
1 # Level-1m
|
||||
1 # Level-1f kernels
|
||||
1 # Level-2
|
||||
1 # Level-3 micro-kernels
|
||||
0 # Level-1v
|
||||
0 # Level-1m
|
||||
0 # Level-1f kernels
|
||||
0 # Level-2
|
||||
0 # Level-3 micro-kernels
|
||||
1 # Level-3
|
||||
|
||||
|
||||
@@ -291,49 +291,49 @@
|
||||
1 # gemm
|
||||
1 # test sequential front-end
|
||||
-1 -1 -1 # dimensions: m n k
|
||||
?? # parameters: transa transb
|
||||
nn # parameters: transa transb
|
||||
|
||||
1 # hemm
|
||||
0 # hemm
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m n
|
||||
???? # parameters: side uploa conja transb
|
||||
|
||||
1 # herk
|
||||
0 # herk
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m k
|
||||
?? # parameters: uploc transa
|
||||
|
||||
1 # her2k
|
||||
0 # her2k
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m k
|
||||
??? # parameters: uploc transa transb
|
||||
|
||||
1 # symm
|
||||
0 # symm
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m n
|
||||
???? # parameters: side uploa conja transb
|
||||
|
||||
1 # syrk
|
||||
0 # syrk
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m k
|
||||
?? # parameters: uploc transa
|
||||
|
||||
1 # syr2k
|
||||
0 # syr2k
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m k
|
||||
??? # parameters: uploc transa transb
|
||||
|
||||
1 # trmm
|
||||
0 # trmm
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m n
|
||||
???? # parameters: side uploa transa diaga
|
||||
|
||||
1 # trmm3
|
||||
0 # trmm3
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m n
|
||||
????? # parameters: side uploa transa diaga transb
|
||||
|
||||
1 # trsm
|
||||
0 # trsm
|
||||
1 # test sequential front-end
|
||||
-1 -1 # dimensions: m n
|
||||
???? # parameters: side uploa transa diaga
|
||||
|
||||
@@ -178,8 +178,10 @@ void libblis_test_gemm_experiment( test_params_t* params,
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_setsc( 1.2, 0.8, &alpha );
|
||||
bli_setsc( -1.0, 1.0, &beta );
|
||||
//bli_setsc( 1.2, 0.8, &alpha );
|
||||
//bli_setsc( -1.0, 1.0, &beta );
|
||||
bli_setsc( 1.2, 0.0, &alpha );
|
||||
bli_setsc( -1.0, 0.0, &beta );
|
||||
}
|
||||
|
||||
// Randomize A, B, and C, and save C.
|
||||
|
||||
Reference in New Issue
Block a user