Details: - Inserted missing safeguards into most microkernels to ensure that the integers read by the microkernel's assembly instructions are of the appropriate size. In many cases this bug went undetected, likely because the compiler was inserting zero padding before the integers in the calling function, which allowed the assembly code to read 64 bits without corrupting the meaningful lower 32 bits with garbage in the upper bits. Thanks to Francisco Igual and Devangi Parikh for finding this issue.
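The safeguard referred to above is visible in bli_dgemm_sandybridge_int_8x4 below: the dim_t/inc_t arguments are copied into explicitly 64-bit local variables before anything that performs 64-bit reads ever touches them. A minimal sketch of the pattern, with an illustrative (hypothetical) function name:

    // Hypothetical fragment illustrating the safeguard described above.
    void some_microkernel( dim_t k0, inc_t rs_c0, inc_t cs_c0 /* ... */ )
    {
        // Copy the possibly-32-bit integers into explicitly 64-bit locals so
        // that any later 64-bit read sees a well-defined value, regardless of
        // whether dim_t and inc_t were configured as 32- or 64-bit types.
        uint64_t k_iter = ( uint64_t )k0 / 2;
        uint64_t k_left = ( uint64_t )k0 % 2;
        uint64_t rs_c   = ( uint64_t )rs_c0;
        uint64_t cs_c   = ( uint64_t )cs_c0;

        // ... only these 64-bit locals are handed to the assembly/intrinsics.
    }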
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name of The University of Texas at Austin nor the names
      of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"
#include <immintrin.h>


#if 0
void bli_sgemm_sandybridge_int_8x8
     (
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a,
       float*     restrict b,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
}
#endif


void bli_dgemm_sandybridge_int_8x4
     (
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 2;
    uint64_t k_left = k0 % 2;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;
    uint64_t i;

    double *c00, *c01, *c02, *c03;
    double *c40, *c41, *c42, *c43;

    // Quad registers.
    __m256d va0_3, va4_7;
    __m256d vA0_3, vA4_7;
    __m256d vb0, vb1, vb2, vb3;
    __m256d vb;
    __m256d vB0;

    __m256d va0_3b_0, va4_7b_0;
    __m256d va0_3b_1, va4_7b_1;
    __m256d va0_3b_2, va4_7b_2;
    __m256d va0_3b_3, va4_7b_3;

    __m256d va0_3b0, va4_7b0;
    __m256d va0_3b1, va4_7b1;
    __m256d va0_3b2, va4_7b2;
    __m256d va0_3b3, va4_7b3;

    __m256d valpha, vbeta, vtmp;
    __m256d vc0_3_0, vc0_3_1, vc0_3_2, vc0_3_3;
    __m256d vc4_7_0, vc4_7_1, vc4_7_2, vc4_7_3;

    __m128d aa, bb;

    __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(a) );
    __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"(b_next) );
    __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(c) );

    va0_3b0 = _mm256_setzero_pd();
    va0_3b1 = _mm256_setzero_pd();
    va0_3b2 = _mm256_setzero_pd();
    va0_3b3 = _mm256_setzero_pd();

    va4_7b0 = _mm256_setzero_pd();
    va4_7b1 = _mm256_setzero_pd();
    va4_7b2 = _mm256_setzero_pd();
    va4_7b3 = _mm256_setzero_pd();

    va0_3b_0 = _mm256_setzero_pd();
    va0_3b_1 = _mm256_setzero_pd();
    va0_3b_2 = _mm256_setzero_pd();
    va0_3b_3 = _mm256_setzero_pd();

    va4_7b_0 = _mm256_setzero_pd();
    va4_7b_1 = _mm256_setzero_pd();
    va4_7b_2 = _mm256_setzero_pd();
    va4_7b_3 = _mm256_setzero_pd();

    // Load va0_3
    va0_3 = _mm256_load_pd( a );
    // Load va4_7
    va4_7 = _mm256_load_pd( a + 4 );

    // Load vb (b0,b1,b2,b3)
    vb0 = _mm256_load_pd( b );

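    // In the loops below each a-vector is multiplied by four in-register
    // "rotations" of the b-vector (swap within 128-bit lanes, swap the lanes,
    // swap within lanes again), so no broadcasts of b are needed. The
    // accumulators therefore hold the microtile columns in a rotated order
    // that is untangled by the blends/permutes after the loops.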
    for( i = 0; i < k_iter; ++i )
    {
        __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) );

        // Load va0_3 (Prefetch)
        vA0_3 = _mm256_load_pd( a + 8 );

        // Iteration 0.
        vtmp = _mm256_mul_pd( va0_3, vb0 );
        va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );

        vtmp = _mm256_mul_pd( va4_7, vb0 );
        va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );

        // Load va4_7 (Prefetch)
        vA4_7 = _mm256_load_pd( a + 12 );

        // Shuffle vb (b1,b0,b3,b2)
        vb1 = _mm256_shuffle_pd( vb0, vb0, 0x5 );

        vtmp = _mm256_mul_pd( va0_3, vb1 );
        va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );

        vtmp = _mm256_mul_pd( va4_7, vb1 );
        va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );

        // Permute vb (b3,b2,b1,b0)
        vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );

        // Load vb (b0,b1,b2,b3) (Prefetch)
        vB0 = _mm256_load_pd( b + 4 );

        vtmp = _mm256_mul_pd( va0_3, vb2 );
        va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );

        vtmp = _mm256_mul_pd( va4_7, vb2 );
        va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );

        // Shuffle vb (b2,b3,b0,b1)
        vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );

        vtmp = _mm256_mul_pd( va0_3, vb3 );
        va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );

        vtmp = _mm256_mul_pd( va4_7, vb3 );
        va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );

        // Iteration 1.

        __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) );

        // Load va0_3 (Next iteration)
        va0_3 = _mm256_load_pd( a + 16 );

        vtmp = _mm256_mul_pd( vA0_3, vB0 );
        va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );

        vb1 = _mm256_shuffle_pd( vB0, vB0, 0x5 );

        vtmp = _mm256_mul_pd( vA4_7, vB0 );
        va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );

        vtmp = _mm256_mul_pd( vA0_3, vb1 );
        va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );

        // Load va4_7 (Next iteration)
        va4_7 = _mm256_load_pd( a + 20 );

        vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );

        vtmp = _mm256_mul_pd( vA4_7, vb1 );
        va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );

        vtmp = _mm256_mul_pd( vA0_3, vb2 );
        va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );

        vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );

        vtmp = _mm256_mul_pd( vA4_7, vb2 );
        va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );

        // Load vb0 (Next iteration)
        vb0 = _mm256_load_pd( b + 8 );

        vtmp = _mm256_mul_pd( vA0_3, vb3 );
        va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );

        vtmp = _mm256_mul_pd( vA4_7, vb3 );
        va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );

        a += 16;
        b += 8;
    }

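    // Handle the final rank-1 update when k0 is odd (k_left = k0 % 2).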
    for( i = 0; i < k_left; ++i )
    {
        // Iteration 0.

        // Load va0_3
        va0_3 = _mm256_load_pd( a );
        // Load va4_7
        va4_7 = _mm256_load_pd( a + 4 );

        // Load vb (b0,b1,b2,b3)
        vb = _mm256_load_pd( b );

        vtmp = _mm256_mul_pd( va0_3, vb );
        va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );

        vtmp = _mm256_mul_pd( va4_7, vb );
        va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );

        // Shuffle vb (b1,b0,b3,b2)
        vb = _mm256_shuffle_pd( vb, vb, 0x5 );

        vtmp = _mm256_mul_pd( va0_3, vb );
        va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );

        vtmp = _mm256_mul_pd( va4_7, vb );
        va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );

        // Permute vb (b3,b2,b1,b0)
        vb = _mm256_permute2f128_pd( vb, vb, 0x1 );

        vtmp = _mm256_mul_pd( va0_3, vb );
        va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );

        vtmp = _mm256_mul_pd( va4_7, vb );
        va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );

        // Shuffle vb (b2,b3,b0,b1)
        vb = _mm256_shuffle_pd( vb, vb, 0x5 );

        vtmp = _mm256_mul_pd( va0_3, vb );
        va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );

        vtmp = _mm256_mul_pd( va4_7, vb );
        va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );

        a += 8;
        b += 4;
    }

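    // The accumulators va0_3b_0..3 and va4_7b_0..3 hold the microtile columns
    // in the rotated order produced by the shuffles above; the blends and
    // 128-bit permutes below rearrange them into true column vectors
    // va0_3b0..3 and va4_7b0..3 before the alpha/beta update of c.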
    vbeta = _mm256_broadcast_sd( beta );

    __m256d vtmpa_0_3b_0 = _mm256_blend_pd( va0_3b_0, va0_3b_1, 0x6 );
    __m256d vtmpa_0_3b_1 = _mm256_blend_pd( va0_3b_1, va0_3b_0, 0x6 );

    __m256d vtmpa_0_3b_2 = _mm256_blend_pd( va0_3b_2, va0_3b_3, 0x6 );
    __m256d vtmpa_0_3b_3 = _mm256_blend_pd( va0_3b_3, va0_3b_2, 0x6 );

    __m256d vtmpa_4_7b_0 = _mm256_blend_pd( va4_7b_0, va4_7b_1, 0x6 );
    __m256d vtmpa_4_7b_1 = _mm256_blend_pd( va4_7b_1, va4_7b_0, 0x6 );

    __m256d vtmpa_4_7b_2 = _mm256_blend_pd( va4_7b_2, va4_7b_3, 0x6 );
    __m256d vtmpa_4_7b_3 = _mm256_blend_pd( va4_7b_3, va4_7b_2, 0x6 );

    valpha = _mm256_broadcast_sd( alpha );

    va0_3b0 = _mm256_permute2f128_pd( vtmpa_0_3b_0, vtmpa_0_3b_2, 0x30 );
    va0_3b3 = _mm256_permute2f128_pd( vtmpa_0_3b_2, vtmpa_0_3b_0, 0x30 );

    va0_3b1 = _mm256_permute2f128_pd( vtmpa_0_3b_1, vtmpa_0_3b_3, 0x30 );
    va0_3b2 = _mm256_permute2f128_pd( vtmpa_0_3b_3, vtmpa_0_3b_1, 0x30 );

    va4_7b0 = _mm256_permute2f128_pd( vtmpa_4_7b_0, vtmpa_4_7b_2, 0x30 );
    va4_7b3 = _mm256_permute2f128_pd( vtmpa_4_7b_2, vtmpa_4_7b_0, 0x30 );

    va4_7b1 = _mm256_permute2f128_pd( vtmpa_4_7b_1, vtmpa_4_7b_3, 0x30 );
    va4_7b2 = _mm256_permute2f128_pd( vtmpa_4_7b_3, vtmpa_4_7b_1, 0x30 );

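    // Two write-back paths: when rows of c are contiguous (rs_c == 1), each
    // 4x1 piece of a column can be loaded, scaled by alpha/beta, and stored
    // with full 256-bit vector accesses; otherwise the elements of c are
    // gathered with _mm256_set_pd and written back with scalar 64-bit stores.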
    if( rs_c == 1 )
    {
        // Calculate address
        c00 = ( c + 0*rs_c + 0*cs_c );
        // Load
        //vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c );
        vc0_3_0 = _mm256_load_pd( c00 );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va0_3b0 );
        // Scale by beta
        vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 );
        // Add gemm result
        vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
        // Store back to memory
        _mm256_store_pd( c00, vc0_3_0 );

        // Calculate address
        c40 = ( c + 4*rs_c + 0*cs_c );
        // Load
        //vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c );
        vc4_7_0 = _mm256_load_pd( c40 );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va4_7b0 );
        // Scale by beta
        vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 );
        // Add gemm result
        vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
        // Store back to memory
        _mm256_store_pd( c40, vc4_7_0 );

        // Calculate address
        c01 = ( c + 0*rs_c + 1*cs_c );
        // Load
        //vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c );
        vc0_3_1 = _mm256_load_pd( c01 );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va0_3b1 );
        // Scale by beta
        vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 );
        // Add gemm result
        vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
        // Store back to memory
        _mm256_store_pd( c01, vc0_3_1 );

        // Calculate address
        c41 = ( c + 4*rs_c + 1*cs_c );
        // Load
        //vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c );
        vc4_7_1 = _mm256_load_pd( c41 );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va4_7b1 );
        // Scale by beta
        vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 );
        // Add gemm result
        vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
        // Store back to memory
        _mm256_store_pd( c41, vc4_7_1 );

        // Calculate address
        c02 = ( c + 0*rs_c + 2*cs_c );
        // Load
        //vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c );
        vc0_3_2 = _mm256_load_pd( c02 );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va0_3b2 );
        // Scale by beta
        vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 );
        // Add gemm result
        vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
        // Store back to memory
        _mm256_store_pd( c02, vc0_3_2 );

        // Calculate address
        c42 = ( c + 4*rs_c + 2*cs_c );
        // Load
        //vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c );
        vc4_7_2 = _mm256_load_pd( c42 );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va4_7b2 );
        // Scale by beta
        vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 );
        // Add gemm result
        vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
        // Store back to memory
        _mm256_store_pd( c42, vc4_7_2 );

        // Calculate address
        c03 = ( c + 0*rs_c + 3*cs_c );
        // Load
        //vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c );
        vc0_3_3 = _mm256_load_pd( c03 );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va0_3b3 );
        // Scale by beta
        vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 );
        // Add gemm result
        vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
        // Store back to memory
        _mm256_store_pd( c03, vc0_3_3 );

        // Calculate address
        c43 = ( c + 4*rs_c + 3*cs_c );
        // Load
        //vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c );
        vc4_7_3 = _mm256_load_pd( c43 );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va4_7b3 );
        // Scale by beta
        vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 );
        // Add gemm result
        vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
        // Store back to memory
        _mm256_store_pd( c43, vc4_7_3 );
    }
    else
    {
        // Calculate address
        c00 = ( c + 0*rs_c + 0*cs_c );
        // Load
        //vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c );
        vc0_3_0 = _mm256_set_pd( *(c + 3*rs_c + 0*cs_c ),
                                 *(c + 2*rs_c + 0*cs_c ),
                                 *(c + 1*rs_c + 0*cs_c ),
                                 *(c + 0*rs_c + 0*cs_c ) );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va0_3b0 );
        // Scale by beta
        vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 );
        // Add gemm result
        vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
        // Store back to memory
        //_mm256_store_pd( c00, vc0_3_0 );

        aa = _mm256_extractf128_pd( vc0_3_0, 0 );
        bb = _mm256_extractf128_pd( vc0_3_0, 1 );

        _mm_storel_pd( c + 0*rs_c + 0*cs_c, aa );
        _mm_storeh_pd( c + 1*rs_c + 0*cs_c, aa );
        _mm_storel_pd( c + 2*rs_c + 0*cs_c, bb );
        _mm_storeh_pd( c + 3*rs_c + 0*cs_c, bb );

        // Calculate address
        c40 = ( c + 4*rs_c + 0*cs_c );
        // Load
        //vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c );
        vc4_7_0 = _mm256_set_pd( *(c + 7*rs_c + 0*cs_c ),
                                 *(c + 6*rs_c + 0*cs_c ),
                                 *(c + 5*rs_c + 0*cs_c ),
                                 *(c + 4*rs_c + 0*cs_c ) );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va4_7b0 );
        // Scale by beta
        vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 );
        // Add gemm result
        vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
        // Store back to memory
        //_mm256_store_pd( c40, vc4_7_0 );

        aa = _mm256_extractf128_pd( vc4_7_0, 0 );
        bb = _mm256_extractf128_pd( vc4_7_0, 1 );

        _mm_storel_pd( c + 4*rs_c + 0*cs_c, aa );
        _mm_storeh_pd( c + 5*rs_c + 0*cs_c, aa );
        _mm_storel_pd( c + 6*rs_c + 0*cs_c, bb );
        _mm_storeh_pd( c + 7*rs_c + 0*cs_c, bb );

        // Calculate address
        c01 = ( c + 0*rs_c + 1*cs_c );
        // Load
        //vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c );
        vc0_3_1 = _mm256_set_pd( *(c + 3*rs_c + 1*cs_c ),
                                 *(c + 2*rs_c + 1*cs_c ),
                                 *(c + 1*rs_c + 1*cs_c ),
                                 *(c + 0*rs_c + 1*cs_c ) );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va0_3b1 );
        // Scale by beta
        vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 );
        // Add gemm result
        vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
        // Store back to memory
        //_mm256_store_pd( c01, vc0_3_1 );

        aa = _mm256_extractf128_pd( vc0_3_1, 0 );
        bb = _mm256_extractf128_pd( vc0_3_1, 1 );

        _mm_storel_pd( c + 0*rs_c + 1*cs_c, aa );
        _mm_storeh_pd( c + 1*rs_c + 1*cs_c, aa );
        _mm_storel_pd( c + 2*rs_c + 1*cs_c, bb );
        _mm_storeh_pd( c + 3*rs_c + 1*cs_c, bb );

        // Calculate address
        c41 = ( c + 4*rs_c + 1*cs_c );
        // Load
        //vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c );
        vc4_7_1 = _mm256_set_pd( *(c + 7*rs_c + 1*cs_c ),
                                 *(c + 6*rs_c + 1*cs_c ),
                                 *(c + 5*rs_c + 1*cs_c ),
                                 *(c + 4*rs_c + 1*cs_c ) );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va4_7b1 );
        // Scale by beta
        vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 );
        // Add gemm result
        vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
        // Store back to memory
        //_mm256_store_pd( c41, vc4_7_1 );

        aa = _mm256_extractf128_pd( vc4_7_1, 0 );
        bb = _mm256_extractf128_pd( vc4_7_1, 1 );

        _mm_storel_pd( c + 4*rs_c + 1*cs_c, aa );
        _mm_storeh_pd( c + 5*rs_c + 1*cs_c, aa );
        _mm_storel_pd( c + 6*rs_c + 1*cs_c, bb );
        _mm_storeh_pd( c + 7*rs_c + 1*cs_c, bb );

        // Calculate address
        c02 = ( c + 0*rs_c + 2*cs_c );
        // Load
        //vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c );
        vc0_3_2 = _mm256_set_pd( *(c + 3*rs_c + 2*cs_c ),
                                 *(c + 2*rs_c + 2*cs_c ),
                                 *(c + 1*rs_c + 2*cs_c ),
                                 *(c + 0*rs_c + 2*cs_c ) );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va0_3b2 );
        // Scale by beta
        vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 );
        // Add gemm result
        vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
        // Store back to memory
        //_mm256_store_pd( c02, vc0_3_2 );

        aa = _mm256_extractf128_pd( vc0_3_2, 0 );
        bb = _mm256_extractf128_pd( vc0_3_2, 1 );

        _mm_storel_pd( c + 0*rs_c + 2*cs_c, aa );
        _mm_storeh_pd( c + 1*rs_c + 2*cs_c, aa );
        _mm_storel_pd( c + 2*rs_c + 2*cs_c, bb );
        _mm_storeh_pd( c + 3*rs_c + 2*cs_c, bb );

        // Calculate address
        c42 = ( c + 4*rs_c + 2*cs_c );
        // Load
        //vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c );
        vc4_7_2 = _mm256_set_pd( *(c + 7*rs_c + 2*cs_c ),
                                 *(c + 6*rs_c + 2*cs_c ),
                                 *(c + 5*rs_c + 2*cs_c ),
                                 *(c + 4*rs_c + 2*cs_c ) );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va4_7b2 );
        // Scale by beta
        vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 );
        // Add gemm result
        vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
        // Store back to memory
        //_mm256_store_pd( c42, vc4_7_2 );

        aa = _mm256_extractf128_pd( vc4_7_2, 0 );
        bb = _mm256_extractf128_pd( vc4_7_2, 1 );

        _mm_storel_pd( c + 4*rs_c + 2*cs_c, aa );
        _mm_storeh_pd( c + 5*rs_c + 2*cs_c, aa );
        _mm_storel_pd( c + 6*rs_c + 2*cs_c, bb );
        _mm_storeh_pd( c + 7*rs_c + 2*cs_c, bb );

        // Calculate address
        c03 = ( c + 0*rs_c + 3*cs_c );
        // Load
        //vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c );
        vc0_3_3 = _mm256_set_pd( *(c + 3*rs_c + 3*cs_c ),
                                 *(c + 2*rs_c + 3*cs_c ),
                                 *(c + 1*rs_c + 3*cs_c ),
                                 *(c + 0*rs_c + 3*cs_c ) );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va0_3b3 );
        // Scale by beta
        vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 );
        // Add gemm result
        vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
        // Store back to memory
        //_mm256_store_pd( c03, vc0_3_3 );

        aa = _mm256_extractf128_pd( vc0_3_3, 0 );
        bb = _mm256_extractf128_pd( vc0_3_3, 1 );

        _mm_storel_pd( c + 0*rs_c + 3*cs_c, aa );
        _mm_storeh_pd( c + 1*rs_c + 3*cs_c, aa );
        _mm_storel_pd( c + 2*rs_c + 3*cs_c, bb );
        _mm_storeh_pd( c + 3*rs_c + 3*cs_c, bb );

        // Calculate address
        c43 = ( c + 4*rs_c + 3*cs_c );
        // Load
        //vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c );
        vc4_7_3 = _mm256_set_pd( *(c + 7*rs_c + 3*cs_c ),
                                 *(c + 6*rs_c + 3*cs_c ),
                                 *(c + 5*rs_c + 3*cs_c ),
                                 *(c + 4*rs_c + 3*cs_c ) );
        // Scale by alpha
        vtmp = _mm256_mul_pd( valpha, va4_7b3 );
        // Scale by beta
        vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 );
        // Add gemm result
        vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
        // Store back to memory
        //_mm256_store_pd( c43, vc4_7_3 );

        aa = _mm256_extractf128_pd( vc4_7_3, 0 );
        bb = _mm256_extractf128_pd( vc4_7_3, 1 );

        _mm_storel_pd( c + 4*rs_c + 3*cs_c, aa );
        _mm_storeh_pd( c + 5*rs_c + 3*cs_c, aa );
        _mm_storel_pd( c + 6*rs_c + 3*cs_c, bb );
        _mm_storeh_pd( c + 7*rs_c + 3*cs_c, bb );
    }
}


#if 0
void bli_cgemm_sandybridge_int_8x4
     (
       dim_t               k0,
       scomplex*  restrict alpha,
       scomplex*  restrict a,
       scomplex*  restrict b,
       scomplex*  restrict beta,
       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
}
#endif


#if 0
void bli_zgemm_sandybridge_int_4x4
     (
       dim_t               k0,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a,
       dcomplex*  restrict b,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
}
#endif