Updated copyright headers of emscripten configuration files.

This commit is contained in:
Field G. Van Zee
2014-08-06 14:13:46 -05:00
parent 30833ed71d
commit 9526ce9881
13 changed files with 68 additions and 730 deletions

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -14,9 +14,9 @@
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -14,9 +14,9 @@
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

View File

@@ -4,7 +4,7 @@
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -14,9 +14,9 @@
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name of The University of Texas nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
# - Neither the name of The University of Texas at Austin nor the names
# of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

View File

@@ -48,7 +48,7 @@
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
#define BLIS_INT_TYPE_SIZE 32
#define BLIS_INT_TYPE_SIZE 64
@@ -99,7 +99,7 @@
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 16
#define BLIS_SIMD_ALIGN_SIZE 32
// Alignment size used to align local stack buffers within macro-kernel
// functions.
@@ -151,7 +151,7 @@
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64
// Fortran-77 name-mangling macros.
#define PASTEF770(name) name ## _

View File

@@ -48,7 +48,7 @@
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
#define BLIS_INT_TYPE_SIZE 32
#define BLIS_INT_TYPE_SIZE 64
@@ -69,7 +69,7 @@
// -- MULTITHREADING -----------------------------------------------------------
// The maximum number of BLIS threads that will run concurrently.
#define BLIS_MAX_NUM_THREADS 24
#define BLIS_MAX_NUM_THREADS 1
@@ -80,7 +80,7 @@
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
// contiguous memory pools.
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_KC_X_NC_BLOCKS 1
#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_MC_X_NC_BLOCKS 0
// The maximum preload byte offset is used to pad the end of the contiguous
@@ -151,7 +151,7 @@
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64
// Fortran-77 name-mangling macros.
#define PASTEF770(name) name ## _

View File

@@ -54,12 +54,12 @@
// (b) NR (for triangular operations such as trmm and trsm).
//
#define BLIS_DEFAULT_MC_S 64
#define BLIS_DEFAULT_KC_S 128
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 384
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MC_D 96
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_MC_D 128
#define BLIS_DEFAULT_KC_D 384
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MC_C 64
@@ -70,10 +70,13 @@
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
//#define BLIS_DEFAULT_4M_MC_Z 128
//#define BLIS_DEFAULT_4M_KC_Z 128
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_NR_S 8
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 4
@@ -152,7 +155,10 @@
// -- gemm --
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4_ref_u4_nodupl_avx1
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8
//#define BLIS_DGEMM_UKERNEL bli_dgemm_int_8x4
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4
// -- trsm-related --

View File

@@ -85,6 +85,7 @@ CPICFLAGS := -fPIC
CDBGFLAGS := #-g
CWARNFLAGS := -Wall
COPTFLAGS := -O3 -march=native
#COPTFLAGS := -O1 -march=native
CKOPTFLAGS := $(COPTFLAGS)
CVECFLAGS := -mavx -mfpmath=sse #-msse3 -march=native # -mfpmath=sse

View File

@@ -1,675 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <immintrin.h>
/*
 * float gemm micro-kernel entry point.
 *
 * There is no hand-written code for this datatype in this file: every
 * argument is forwarded, unchanged and in the same order, to
 * BLIS_SGEMM_UKERNEL_REF.
 */
void bli_sgemm_opt_8x4_ref_u4_nodupl_avx1(
    dim_t k,
    float* restrict alpha,
    float* restrict a,
    float* restrict b,
    float* restrict beta,
    float* restrict c, inc_t rs_c, inc_t cs_c,
    auxinfo_t* data
)
{
    // Delegate entirely to the reference micro-kernel.
    BLIS_SGEMM_UKERNEL_REF( k, alpha, a, b, beta, c, rs_c, cs_c, data );
}
/*
 * double gemm micro-kernel for one 8x4 tile of C, written with AVX
 * intrinsics. It computes
 *
 *     C := beta * C + alpha * (A * B)
 *
 * where A contributes 8 doubles and B contributes 4 doubles per k step.
 *
 * Parameters:
 *   k      Number of k steps (rank-1 updates) to accumulate. The main loop
 *          handles two steps per pass; an odd k is finished by a one-step
 *          tail loop.
 *   alpha  Pointer to the scalar applied to the A*B product.
 *   beta   Pointer to the scalar applied to the existing contents of C.
 *          C is always loaded and multiplied by *beta; there is no special
 *          case for *beta == 0.
 *   a      Panel of A, read 8 doubles per k step with _mm256_load_pd (the
 *          aligned load), so a must be 32-byte aligned.
 *   b      Panel of B, read 4 doubles per k step with _mm256_load_pd, so b
 *          must be 32-byte aligned.
 *   c      Tile of C; element (i,j) lives at c + i*rs_c + j*cs_c for
 *          i in 0..7, j in 0..3.
 *   rs_c   Row stride of C. rs_c == 1 selects the vector load/store path,
 *          which uses the aligned intrinsics on c + j*cs_c and
 *          c + 4 + j*cs_c; any other value selects the element-wise path.
 *   cs_c   Column stride of C.
 *   data   Only queried through bli_auxinfo_next_b() to obtain an address
 *          for a prefetch hint.
 *
 * NOTE(review): a and b are dereferenced before the k loop and again at the
 * end of every main-loop pass (for the following pass), so the kernel reads
 * one k step beyond what it consumes, and reads a/b even when k == 0.
 * Presumably the packed buffers are padded to allow this -- confirm against
 * the packing code.
 */
void bli_dgemm_opt_8x4_ref_u4_nodupl_avx1(
dim_t k,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
)
{
//void* a_next = bli_auxinfo_next_a( data );
// Address used only as the target of a prefetch hint below.
void* b_next = bli_auxinfo_next_b( data );
// The main loop is unrolled by two k steps; k_left is the 0-or-1 step tail.
dim_t k_iter = k / 2;
dim_t k_left = k % 2;
dim_t i;
// Column start addresses within C (rows 0-3 and rows 4-7 of columns 0..3).
double *c00, *c01, *c02, *c03;
double *c40, *c41, *c42, *c43;
// Quad registers.
// va*: current A values (rows 0-3 / 4-7); vA*: A values for the second
// unrolled k step.
__m256d va0_3, va4_7;
__m256d vA0_3, vA4_7;
// vb0..vb3: the four lane orderings of the current B values.
__m256d vb0, vb1, vb2, vb3;
__m256d vb;
__m256d vB0;
// va*b_N: running sums of A times the N-th lane ordering of B.
__m256d va0_3b_0, va4_7b_0;
__m256d va0_3b_1, va4_7b_1;
__m256d va0_3b_2, va4_7b_2;
__m256d va0_3b_3, va4_7b_3;
// va*bN: after the final rearrangement, the A*B values for column N of the
// tile.
__m256d va0_3b0, va4_7b0;
__m256d va0_3b1, va4_7b1;
__m256d va0_3b2, va4_7b2;
__m256d va0_3b3, va4_7b3;
__m256d valpha, vbeta, vtmp;
__m256d vc0_3_0, vc0_3_1, vc0_3_2, vc0_3_3;
__m256d vc4_7_0, vc4_7_1, vc4_7_2, vc4_7_3;
// 128-bit halves used when scattering a result vector to strided C.
__m128d aa, bb;
// Prefetch hints for the start of a, the b_next address, and the start of c.
__asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(a) );
__asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"(b_next) );
__asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(c) );
// Zero every accumulator. The va*bN set is overwritten by the permutes
// after the loops, so only the va*b_N zeroing affects the result.
va0_3b0 = _mm256_setzero_pd();
va0_3b1 = _mm256_setzero_pd();
va0_3b2 = _mm256_setzero_pd();
va0_3b3 = _mm256_setzero_pd();
va4_7b0 = _mm256_setzero_pd();
va4_7b1 = _mm256_setzero_pd();
va4_7b2 = _mm256_setzero_pd();
va4_7b3 = _mm256_setzero_pd();
va0_3b_0 = _mm256_setzero_pd();
va0_3b_1 = _mm256_setzero_pd();
va0_3b_2 = _mm256_setzero_pd();
va0_3b_3 = _mm256_setzero_pd();
va4_7b_0 = _mm256_setzero_pd();
va4_7b_1 = _mm256_setzero_pd();
va4_7b_2 = _mm256_setzero_pd();
va4_7b_3 = _mm256_setzero_pd();
// Preload the operands of the first k step; each main-loop pass ends by
// loading the operands of the next pass in the same way.
// Load va0_3
va0_3 = _mm256_load_pd( a );
// Load va4_7
va4_7 = _mm256_load_pd( a + 4 );
// Load vb (b0,b1,b2,b3)
vb0 = _mm256_load_pd( b );
// Main loop: two k steps per pass, with loads for later steps interleaved
// between the multiply/add pairs.
for( i = 0; i < k_iter; ++i )
{
__asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) );
// Load va0_3 (Prefetch)
// Rows 0-3 of A for the second k step of this pass.
vA0_3 = _mm256_load_pd( a + 8 );
// Iteration 0.
vtmp = _mm256_mul_pd( va0_3, vb0 );
va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb0 );
va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
// Load va4_7 (Prefetch)
// Rows 4-7 of A for the second k step of this pass.
vA4_7 = _mm256_load_pd( a + 12 );
// Shuffle vb (b1,b0,b3,b2)
vb1 = _mm256_shuffle_pd( vb0, vb0, 0x5 );
vtmp = _mm256_mul_pd( va0_3, vb1 );
va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb1 );
va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
// Permute vb (b3,b2,b1,b0)
vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
// Load vb (b0,b1,b2,b3) (Prefetch)
// B values for the second k step of this pass.
vB0 = _mm256_load_pd( b + 4 );
vtmp = _mm256_mul_pd( va0_3, vb2 );
va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb2 );
va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
// Shuffle vb (b2,b3,b0,b1)
vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
vtmp = _mm256_mul_pd( va0_3, vb3 );
va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb3 );
va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
// Iteration 1.
__asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) );
// Load va0_3 (Next iteration)
va0_3 = _mm256_load_pd( a + 16 );
vtmp = _mm256_mul_pd( vA0_3, vB0 );
va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
// Same three lane orderings as in iteration 0, now derived from vB0:
// (b1,b0,b3,b2), then (b3,b2,b1,b0), then (b2,b3,b0,b1).
vb1 = _mm256_shuffle_pd( vB0, vB0, 0x5 );
vtmp = _mm256_mul_pd( vA4_7, vB0 );
va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
vtmp = _mm256_mul_pd( vA0_3, vb1 );
va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
// Load va4_7 (Next iteration)
va4_7 = _mm256_load_pd( a + 20 );
vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
vtmp = _mm256_mul_pd( vA4_7, vb1 );
va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
vtmp = _mm256_mul_pd( vA0_3, vb2 );
va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
vtmp = _mm256_mul_pd( vA4_7, vb2 );
va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
// Load vb0(Next iteration)
vb0 = _mm256_load_pd( b + 8 );
vtmp = _mm256_mul_pd( vA0_3, vb3 );
va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
vtmp = _mm256_mul_pd( vA4_7, vb3 );
va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
// Advance past the two k steps consumed by this pass.
a += 16;
b += 8;
}
// Tail loop: runs at most once (k_left is k % 2). It reloads its operands
// from a and b instead of reusing the values loaded at the end of the main
// loop.
for( i = 0; i < k_left; ++i )
{
// Iteration 0.
// Load va0_3
va0_3 = _mm256_load_pd( a );
// Load va4_7
va4_7 = _mm256_load_pd( a + 4 );
// Load vb (b0,b1,b2,b3)
vb = _mm256_load_pd( b );
vtmp = _mm256_mul_pd( va0_3, vb );
va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb );
va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
// Shuffle vb (b1,b0,b3,b2)
vb = _mm256_shuffle_pd( vb, vb, 0x5 );
vtmp = _mm256_mul_pd( va0_3, vb );
va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb );
va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
// Permute vb (b3,b2,b1,b0)
vb = _mm256_permute2f128_pd( vb, vb, 0x1 );
vtmp = _mm256_mul_pd( va0_3, vb );
va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb );
va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
// Shuffle vb (b2,b3,b0,b1)
vb = _mm256_shuffle_pd( vb, vb, 0x5 );
vtmp = _mm256_mul_pd( va0_3, vb );
va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb );
va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
a += 8;
b += 4;
}
vbeta = _mm256_broadcast_sd( beta );
// Rearrangement, step 1. Because B was multiplied in permuted lane orders,
// each accumulator mixes entries from several columns of the tile:
//   b_0 = (a0*b0, a1*b1, a2*b2, a3*b3)   b_1 = (a0*b1, a1*b0, a2*b3, a3*b2)
//   b_2 = (a0*b3, a1*b2, a2*b1, a3*b0)   b_3 = (a0*b2, a1*b3, a2*b0, a3*b1)
// Blending with mask 0x6 takes lanes 1 and 2 from the second operand, which
// makes each 128-bit half of a result belong to a single column.
__m256d vtmpa_0_3b_0 = _mm256_blend_pd( va0_3b_0, va0_3b_1, 0x6 );
__m256d vtmpa_0_3b_1 = _mm256_blend_pd( va0_3b_1, va0_3b_0, 0x6 );
__m256d vtmpa_0_3b_2 = _mm256_blend_pd( va0_3b_2, va0_3b_3, 0x6 );
__m256d vtmpa_0_3b_3 = _mm256_blend_pd( va0_3b_3, va0_3b_2, 0x6 );
__m256d vtmpa_4_7b_0 = _mm256_blend_pd( va4_7b_0, va4_7b_1, 0x6 );
__m256d vtmpa_4_7b_1 = _mm256_blend_pd( va4_7b_1, va4_7b_0, 0x6 );
__m256d vtmpa_4_7b_2 = _mm256_blend_pd( va4_7b_2, va4_7b_3, 0x6 );
__m256d vtmpa_4_7b_3 = _mm256_blend_pd( va4_7b_3, va4_7b_2, 0x6 );
valpha = _mm256_broadcast_sd( alpha );
// Rearrangement, step 2. Selector 0x30 joins the low half of the first
// operand with the high half of the second, so va*bN ends up holding four
// consecutive rows of column N of A*B.
va0_3b0 = _mm256_permute2f128_pd( vtmpa_0_3b_0, vtmpa_0_3b_2, 0x30 );
va0_3b3 = _mm256_permute2f128_pd( vtmpa_0_3b_2, vtmpa_0_3b_0, 0x30 );
va0_3b1 = _mm256_permute2f128_pd( vtmpa_0_3b_1, vtmpa_0_3b_3, 0x30 );
va0_3b2 = _mm256_permute2f128_pd( vtmpa_0_3b_3, vtmpa_0_3b_1, 0x30 );
va4_7b0 = _mm256_permute2f128_pd( vtmpa_4_7b_0, vtmpa_4_7b_2, 0x30 );
va4_7b3 = _mm256_permute2f128_pd( vtmpa_4_7b_2, vtmpa_4_7b_0, 0x30 );
va4_7b1 = _mm256_permute2f128_pd( vtmpa_4_7b_1, vtmpa_4_7b_3, 0x30 );
va4_7b2 = _mm256_permute2f128_pd( vtmpa_4_7b_3, vtmpa_4_7b_1, 0x30 );
// Update C. With unit row stride, four rows of a column are contiguous and
// are moved with one aligned vector load/store each; the addresses
// c + j*cs_c and c + 4 + j*cs_c must therefore be 32-byte aligned.
if( rs_c == 1 )
{
// -- Column 0, rows 0-3 --
// Calculate address
c00 = ( c + 0*rs_c + 0*cs_c );
// Load
//vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c );
vc0_3_0 = _mm256_load_pd( c00 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b0);
// Scale by beta
vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 );
// Add gemm result
vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
// Store back to memory
_mm256_store_pd( c00, vc0_3_0 );
// -- Column 0, rows 4-7 --
// Calculate address
c40 = ( c + 4*rs_c + 0*cs_c );
// Load
//vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c );
vc4_7_0 = _mm256_load_pd( c40 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b0);
// Scale by beta
vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 );
// Add gemm result
vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
// Store back to memory
_mm256_store_pd( c40, vc4_7_0 );
// -- Column 1, rows 0-3 --
// Calculate address
c01 = ( c + 0*rs_c + 1*cs_c );
// Load
//vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c );
vc0_3_1 = _mm256_load_pd( c01 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b1);
// Scale by beta
vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 );
// Add gemm result
vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
// Store back to memory
_mm256_store_pd( c01, vc0_3_1 );
// -- Column 1, rows 4-7 --
// Calculate address
c41 = ( c + 4*rs_c + 1*cs_c );
// Load
//vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c );
vc4_7_1 = _mm256_load_pd( c41 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b1);
// Scale by beta
vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 );
// Add gemm result
vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
// Store back to memory
_mm256_store_pd( c41, vc4_7_1 );
// -- Column 2, rows 0-3 --
// Calculate address
c02 = ( c + 0*rs_c + 2*cs_c );
// Load
//vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c );
vc0_3_2 = _mm256_load_pd( c02 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b2);
// Scale by beta
vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 );
// Add gemm result
vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
// Store back to memory
_mm256_store_pd( c02, vc0_3_2 );
// -- Column 2, rows 4-7 --
// Calculate address
c42 = ( c + 4*rs_c + 2*cs_c );
// Load
//vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c );
vc4_7_2 = _mm256_load_pd( c42 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b2);
// Scale by beta
vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 );
// Add gemm result
vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
// Store back to memory
_mm256_store_pd( c42, vc4_7_2 );
// -- Column 3, rows 0-3 --
// Calculate address
c03 = ( c + 0*rs_c + 3*cs_c );
// Load
//vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c );
vc0_3_3 = _mm256_load_pd( c03 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b3);
// Scale by beta
vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 );
// Add gemm result
vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
// Store back to memory
_mm256_store_pd( c03, vc0_3_3 );
// -- Column 3, rows 4-7 --
// Calculate address
c43 = ( c + 4*rs_c + 3*cs_c );
// Load
//vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c );
vc4_7_3 = _mm256_load_pd( c43 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b3);
// Scale by beta
vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 );
// Add gemm result
vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
// Store back to memory
_mm256_store_pd( c43, vc4_7_3 );
}
else
{
// Non-unit row stride: the four rows of a column are not contiguous, so
// they are gathered one element at a time with _mm256_set_pd (which takes
// its arguments from the highest lane to the lowest) and written back one
// element at a time from the two 128-bit halves of the result. The c00..c43
// pointers are still assigned in this branch but are not used by it.
// -- Column 0, rows 0-3 --
// Calculate address
c00 = ( c + 0*rs_c + 0*cs_c );
// Load
//vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c );
vc0_3_0 = _mm256_set_pd( *(c + 3*rs_c + 0*cs_c ),
*(c + 2*rs_c + 0*cs_c ),
*(c + 1*rs_c + 0*cs_c ),
*(c + 0*rs_c + 0*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b0);
// Scale by beta
vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 );
// Add gemm result
vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
// Store back to memory
//_mm256_store_pd( c00, vc0_3_0 );
aa = _mm256_extractf128_pd( vc0_3_0, 0 ) ;
bb = _mm256_extractf128_pd( vc0_3_0, 1 ) ;
_mm_storel_pd( c + 0*rs_c + 0*cs_c, aa );
_mm_storeh_pd( c + 1*rs_c + 0*cs_c, aa );
_mm_storel_pd( c + 2*rs_c + 0*cs_c, bb );
_mm_storeh_pd( c + 3*rs_c + 0*cs_c, bb );
// -- Column 0, rows 4-7 --
// Calculate address
c40 = ( c + 4*rs_c + 0*cs_c );
// Load
//vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c );
vc4_7_0 = _mm256_set_pd( *(c + 7*rs_c + 0*cs_c ),
*(c + 6*rs_c + 0*cs_c ),
*(c + 5*rs_c + 0*cs_c ),
*(c + 4*rs_c + 0*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b0);
// Scale by beta
vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 );
// Add gemm result
vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
// Store back to memory
//_mm256_store_pd( c40, vc4_7_0 );
aa = _mm256_extractf128_pd( vc4_7_0, 0 ) ;
bb = _mm256_extractf128_pd( vc4_7_0, 1 ) ;
_mm_storel_pd( c + 4*rs_c + 0*cs_c, aa );
_mm_storeh_pd( c + 5*rs_c + 0*cs_c, aa );
_mm_storel_pd( c + 6*rs_c + 0*cs_c, bb );
_mm_storeh_pd( c + 7*rs_c + 0*cs_c, bb );
// -- Column 1, rows 0-3 --
// Calculate address
c01 = ( c + 0*rs_c + 1*cs_c );
// Load
//vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c );
vc0_3_1 = _mm256_set_pd( *(c + 3*rs_c + 1*cs_c ),
*(c + 2*rs_c + 1*cs_c ),
*(c + 1*rs_c + 1*cs_c ),
*(c + 0*rs_c + 1*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b1);
// Scale by beta
vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 );
// Add gemm result
vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
// Store back to memory
//_mm256_store_pd( c01, vc0_3_1 );
aa = _mm256_extractf128_pd( vc0_3_1, 0 ) ;
bb = _mm256_extractf128_pd( vc0_3_1, 1 ) ;
_mm_storel_pd( c + 0*rs_c + 1*cs_c, aa );
_mm_storeh_pd( c + 1*rs_c + 1*cs_c, aa );
_mm_storel_pd( c + 2*rs_c + 1*cs_c, bb );
_mm_storeh_pd( c + 3*rs_c + 1*cs_c, bb );
// -- Column 1, rows 4-7 --
// Calculate address
c41 = ( c + 4*rs_c + 1*cs_c );
// Load
//vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c );
vc4_7_1 = _mm256_set_pd( *(c + 7*rs_c + 1*cs_c ),
*(c + 6*rs_c + 1*cs_c ),
*(c + 5*rs_c + 1*cs_c ),
*(c + 4*rs_c + 1*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b1);
// Scale by beta
vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 );
// Add gemm result
vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
// Store back to memory
//_mm256_store_pd( c41, vc4_7_1 );
aa = _mm256_extractf128_pd( vc4_7_1, 0 ) ;
bb = _mm256_extractf128_pd( vc4_7_1, 1 ) ;
_mm_storel_pd( c + 4*rs_c + 1*cs_c, aa );
_mm_storeh_pd( c + 5*rs_c + 1*cs_c, aa );
_mm_storel_pd( c + 6*rs_c + 1*cs_c, bb );
_mm_storeh_pd( c + 7*rs_c + 1*cs_c, bb );
// -- Column 2, rows 0-3 --
// Calculate address
c02 = ( c + 0*rs_c + 2*cs_c );
// Load
//vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c );
vc0_3_2 = _mm256_set_pd( *(c + 3*rs_c + 2*cs_c ),
*(c + 2*rs_c + 2*cs_c ),
*(c + 1*rs_c + 2*cs_c ),
*(c + 0*rs_c + 2*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b2);
// Scale by beta
vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 );
// Add gemm result
vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
// Store back to memory
//_mm256_store_pd( c02, vc0_3_2 );
aa = _mm256_extractf128_pd( vc0_3_2, 0 ) ;
bb = _mm256_extractf128_pd( vc0_3_2, 1 ) ;
_mm_storel_pd( c + 0*rs_c + 2*cs_c, aa );
_mm_storeh_pd( c + 1*rs_c + 2*cs_c, aa );
_mm_storel_pd( c + 2*rs_c + 2*cs_c, bb );
_mm_storeh_pd( c + 3*rs_c + 2*cs_c, bb );
// -- Column 2, rows 4-7 --
// Calculate address
c42 = ( c + 4*rs_c + 2*cs_c );
// Load
//vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c );
vc4_7_2 = _mm256_set_pd( *(c + 7*rs_c + 2*cs_c ),
*(c + 6*rs_c + 2*cs_c ),
*(c + 5*rs_c + 2*cs_c ),
*(c + 4*rs_c + 2*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b2);
// Scale by beta
vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 );
// Add gemm result
vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
// Store back to memory
//_mm256_store_pd( c42, vc4_7_2 );
aa = _mm256_extractf128_pd( vc4_7_2, 0 ) ;
bb = _mm256_extractf128_pd( vc4_7_2, 1 ) ;
_mm_storel_pd( c + 4*rs_c + 2*cs_c, aa );
_mm_storeh_pd( c + 5*rs_c + 2*cs_c, aa );
_mm_storel_pd( c + 6*rs_c + 2*cs_c, bb );
_mm_storeh_pd( c + 7*rs_c + 2*cs_c, bb );
// -- Column 3, rows 0-3 --
// Calculate address
c03 = ( c + 0*rs_c + 3*cs_c );
// Load
//vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c );
vc0_3_3 = _mm256_set_pd( *(c + 3*rs_c + 3*cs_c ),
*(c + 2*rs_c + 3*cs_c ),
*(c + 1*rs_c + 3*cs_c ),
*(c + 0*rs_c + 3*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b3);
// Scale by beta
vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 );
// Add gemm result
vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
// Store back to memory
//_mm256_store_pd( c03, vc0_3_3 );
aa = _mm256_extractf128_pd( vc0_3_3, 0 ) ;
bb = _mm256_extractf128_pd( vc0_3_3, 1 ) ;
_mm_storel_pd( c + 0*rs_c + 3*cs_c, aa );
_mm_storeh_pd( c + 1*rs_c + 3*cs_c, aa );
_mm_storel_pd( c + 2*rs_c + 3*cs_c, bb );
_mm_storeh_pd( c + 3*rs_c + 3*cs_c, bb );
// -- Column 3, rows 4-7 --
// Calculate address
c43 = ( c + 4*rs_c + 3*cs_c );
// Load
//vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c );
vc4_7_3 = _mm256_set_pd( *(c + 7*rs_c + 3*cs_c ),
*(c + 6*rs_c + 3*cs_c ),
*(c + 5*rs_c + 3*cs_c ),
*(c + 4*rs_c + 3*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b3);
// Scale by beta
vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 );
// Add gemm result
vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
// Store back to memory
//_mm256_store_pd( c43, vc4_7_3 );
aa = _mm256_extractf128_pd( vc4_7_3, 0 ) ;
bb = _mm256_extractf128_pd( vc4_7_3, 1 ) ;
_mm_storel_pd( c + 4*rs_c + 3*cs_c, aa );
_mm_storeh_pd( c + 5*rs_c + 3*cs_c, aa );
_mm_storel_pd( c + 6*rs_c + 3*cs_c, bb );
_mm_storeh_pd( c + 7*rs_c + 3*cs_c, bb );
}
}
/*
 * scomplex gemm micro-kernel entry point.
 *
 * There is no hand-written code for this datatype in this file: every
 * argument is forwarded, unchanged and in the same order, to
 * BLIS_CGEMM_UKERNEL_REF.
 */
void bli_cgemm_opt_8x4_ref_u4_nodupl_avx1(
    dim_t k,
    scomplex* restrict alpha,
    scomplex* restrict a,
    scomplex* restrict b,
    scomplex* restrict beta,
    scomplex* restrict c, inc_t rs_c, inc_t cs_c,
    auxinfo_t* data
)
{
    // Delegate entirely to the reference micro-kernel.
    BLIS_CGEMM_UKERNEL_REF( k, alpha, a, b, beta, c, rs_c, cs_c, data );
}
/*
 * dcomplex gemm micro-kernel entry point.
 *
 * There is no hand-written code for this datatype in this file: every
 * argument is forwarded, unchanged and in the same order, to
 * BLIS_ZGEMM_UKERNEL_REF.
 */
void bli_zgemm_opt_8x4_ref_u4_nodupl_avx1(
    dim_t k,
    dcomplex* restrict alpha,
    dcomplex* restrict a,
    dcomplex* restrict b,
    dcomplex* restrict beta,
    dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
    auxinfo_t* data
)
{
    // Delegate entirely to the reference micro-kernel.
    BLIS_ZGEMM_UKERNEL_REF( k, alpha, a, b, beta, c, rs_c, cs_c, data );
}

View File

@@ -126,7 +126,7 @@ BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a
# BLAS library path(s). This is where the BLAS libraries reside.
BLAS_LIB_PATH := $(HOME)/flame/lib
MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64/
MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
ESSL_LIB_PATH := $(HOME)/path/to/essl/changeme
# OpenBLAS
@@ -174,7 +174,7 @@ TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \
CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH)
LINKER := $(CC)
LDFLAGS := -L/home/00146/field/gnu/gcc-4.8.2/lib64
LDFLAGS := #-L/home/00146/field/gnu/gcc-4.8.2/lib64
LDFLAGS += -lgfortran -lm -lpthread
@@ -187,7 +187,7 @@ LDFLAGS += -lgfortran -lm -lpthread
#
# blis openblas atlas mkl mac essl
#
all: blis openblas atlas mkl
all: blis openblas #mkl
blis: test_gemv_blis.x \
test_ger_blis.x \

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -14,9 +14,9 @@
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -81,7 +81,11 @@ int main( int argc, char** argv )
#endif
#if 1
dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DOUBLE;
dt_a = BLIS_DOUBLE;
dt_b = BLIS_DOUBLE;
dt_c = BLIS_DOUBLE;
dt_alpha = BLIS_DOUBLE;
dt_beta = BLIS_DOUBLE;
#else
dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
#endif

View File

@@ -8,7 +8,7 @@
# accepted values.
#
1 # Number of repeats per experiment (best result is reported)
3 # Number of repeats per experiment (best result is reported)
c # Matrix storage scheme(s) to test:
# 'c' = col-major storage; 'g' = general stride storage;
# 'r' = row-major storage
@@ -17,12 +17,12 @@ c # Vector storage scheme(s) to test:
# 'r' = rowvec / unit stride; 'i' = rowvec / non-unit stride
0 # Test all combinations of storage schemes?
32 # General stride spacing (for cases when testing general stride)
sdcz # Datatype(s) to test:
s #sdcz # Datatype(s) to test:
# 's' = single real; 'c' = single complex;
# 'd' = double real; 'z' = double complex
100 # Problem size: first to test
300 # Problem size: maximum to test
100 # Problem size: increment between experiments
128 # Problem size: first to test
2048 # Problem size: maximum to test
128 # Problem size: increment between experiments
1 # Error-checking level:
# '0' = disable error checking; '1' = full error checking
i # Reaction to test failure:

View File

@@ -78,11 +78,11 @@
# --- Section overrides ----------------------------------------------------
1 # Utility
1 # Level-1v
1 # Level-1m
1 # Level-1f kernels
1 # Level-2
1 # Level-3 micro-kernels
0 # Level-1v
0 # Level-1m
0 # Level-1f kernels
0 # Level-2
0 # Level-3 micro-kernels
1 # Level-3
@@ -291,49 +291,49 @@
1 # gemm
1 # test sequential front-end
-1 -1 -1 # dimensions: m n k
?? # parameters: transa transb
nn # parameters: transa transb
1 # hemm
0 # hemm
1 # test sequential front-end
-1 -1 # dimensions: m n
???? # parameters: side uploa conja transb
1 # herk
0 # herk
1 # test sequential front-end
-1 -1 # dimensions: m k
?? # parameters: uploc transa
1 # her2k
0 # her2k
1 # test sequential front-end
-1 -1 # dimensions: m k
??? # parameters: uploc transa transb
1 # symm
0 # symm
1 # test sequential front-end
-1 -1 # dimensions: m n
???? # parameters: side uploa conja transb
1 # syrk
0 # syrk
1 # test sequential front-end
-1 -1 # dimensions: m k
?? # parameters: uploc transa
1 # syr2k
0 # syr2k
1 # test sequential front-end
-1 -1 # dimensions: m k
??? # parameters: uploc transa transb
1 # trmm
0 # trmm
1 # test sequential front-end
-1 -1 # dimensions: m n
???? # parameters: side uploa transa diaga
1 # trmm3
0 # trmm3
1 # test sequential front-end
-1 -1 # dimensions: m n
????? # parameters: side uploa transa diaga transb
1 # trsm
0 # trsm
1 # test sequential front-end
-1 -1 # dimensions: m n
???? # parameters: side uploa transa diaga

View File

@@ -178,8 +178,10 @@ void libblis_test_gemm_experiment( test_params_t* params,
}
else
{
bli_setsc( 1.2, 0.8, &alpha );
bli_setsc( -1.0, 1.0, &beta );
//bli_setsc( 1.2, 0.8, &alpha );
//bli_setsc( -1.0, 1.0, &beta );
bli_setsc( 1.2, 0.0, &alpha );
bli_setsc( -1.0, 0.0, &beta );
}
// Randomize A, B, and C, and save C.