blis/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/
#include <immintrin.h>
#include "blis.h"
#if 0
void bli_sgemm_sandybridge_int_8x8
     (
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a,
       float*     restrict b,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
}
#endif
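/*
   bli_dgemm_sandybridge_int_8x4

   Computes an 8x4 block of C as C := beta * C + alpha * A * B, where A is
   a packed 8 x k0 micro-panel, B is a packed k0 x 4 micro-panel, and C is
   stored with row stride rs_c0 and column stride cs_c0. The k dimension is
   traversed as a sequence of rank-1 updates using 256-bit AVX intrinsics;
   as the pointer updates in the loops below show, A is assumed to advance
   by 8 doubles and B by 4 doubles per rank-1 update.
*/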
void bli_dgemm_sandybridge_int_8x4
     (
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
//void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
uint64_t k_iter = k0 / 2;
uint64_t k_left = k0 % 2;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
uint64_t i;
double *c00, *c01, *c02, *c03;
double *c40, *c41, *c42, *c43;
// Quad registers.
__m256d va0_3, va4_7;
__m256d vA0_3, vA4_7;
__m256d vb0, vb1, vb2, vb3;
__m256d vb;
__m256d vB0;
__m256d va0_3b_0, va4_7b_0;
__m256d va0_3b_1, va4_7b_1;
__m256d va0_3b_2, va4_7b_2;
__m256d va0_3b_3, va4_7b_3;
__m256d va0_3b0, va4_7b0;
__m256d va0_3b1, va4_7b1;
__m256d va0_3b2, va4_7b2;
__m256d va0_3b3, va4_7b3;
__m256d valpha, vbeta, vtmp;
__m256d vc0_3_0, vc0_3_1, vc0_3_2, vc0_3_3;
__m256d vc4_7_0, vc4_7_1, vc4_7_2, vc4_7_3;
__m128d aa, bb;
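// Prefetch the start of the current A micro-panel and of the C micro-tile
// into cache, and the start of the next micro-panel of B (b_next) with a
// lower-priority hint (prefetcht2), before entering the k loop.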
__asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(a) );
__asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"(b_next) );
__asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(c) );
va0_3b0 = _mm256_setzero_pd();
va0_3b1 = _mm256_setzero_pd();
va0_3b2 = _mm256_setzero_pd();
va0_3b3 = _mm256_setzero_pd();
va4_7b0 = _mm256_setzero_pd();
va4_7b1 = _mm256_setzero_pd();
va4_7b2 = _mm256_setzero_pd();
va4_7b3 = _mm256_setzero_pd();
va0_3b_0 = _mm256_setzero_pd();
va0_3b_1 = _mm256_setzero_pd();
va0_3b_2 = _mm256_setzero_pd();
va0_3b_3 = _mm256_setzero_pd();
va4_7b_0 = _mm256_setzero_pd();
va4_7b_1 = _mm256_setzero_pd();
va4_7b_2 = _mm256_setzero_pd();
va4_7b_3 = _mm256_setzero_pd();
// Load va0_3
va0_3 = _mm256_load_pd( a );
// Load va4_7
va4_7 = _mm256_load_pd( a + 4 );
// Load vb (b0,b1,b2,b3)
vb0 = _mm256_load_pd( b );
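// Main loop over k: va0_3, va4_7, and vb0 were loaded above for the first
// rank-1 update. Each pass performs two updates (k unrolled by 2); the
// loads for the next update (vA0_3, vA4_7, vB0) are issued early so they
// overlap with the multiplies of the current one, and each update
// multiplies A by four permutations of the B vector. The products are
// unscrambled into true columns after the loop.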
for( i = 0; i < k_iter; ++i )
{
__asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) );
// Load va0_3 (Prefetch)
vA0_3 = _mm256_load_pd( a + 8 );
// Iteration 0.
vtmp = _mm256_mul_pd( va0_3, vb0 );
va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb0 );
va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
// Load va4_7 (Prefetch)
vA4_7 = _mm256_load_pd( a + 12 );
// Shuffle vb (b1,b0,b3,b2)
vb1 = _mm256_shuffle_pd( vb0, vb0, 0x5 );
vtmp = _mm256_mul_pd( va0_3, vb1 );
va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb1 );
va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
// Permute vb (b3,b2,b1,b0)
vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
// Load vb (b0,b1,b2,b3) (Prefetch)
vB0 = _mm256_load_pd( b + 4 );
vtmp = _mm256_mul_pd( va0_3, vb2 );
va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb2 );
va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
// Shuffle vb (b2,b3,b0,b1)
vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
vtmp = _mm256_mul_pd( va0_3, vb3 );
va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb3 );
va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
// Iteration 1.
__asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) );
// Load va0_3 (Next iteration)
va0_3 = _mm256_load_pd( a + 16 );
vtmp = _mm256_mul_pd( vA0_3, vB0 );
va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
vb1 = _mm256_shuffle_pd( vB0, vB0, 0x5 );
vtmp = _mm256_mul_pd( vA4_7, vB0 );
va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
vtmp = _mm256_mul_pd( vA0_3, vb1 );
va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
// Load va4_7 (Next iteration)
va4_7 = _mm256_load_pd( a + 20 );
vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 );
vtmp = _mm256_mul_pd( vA4_7, vb1 );
va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
vtmp = _mm256_mul_pd( vA0_3, vb2 );
va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 );
vtmp = _mm256_mul_pd( vA4_7, vb2 );
va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
// Load vb0 (Next iteration)
vb0 = _mm256_load_pd( b + 8 );
vtmp = _mm256_mul_pd( vA0_3, vb3 );
va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
vtmp = _mm256_mul_pd( vA4_7, vb3 );
va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
a += 16;
b += 8;
}
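// Leftover loop: performs the single remaining rank-1 update when k0 is
// odd, using the same shuffle/permute pattern on one b register.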
for( i = 0; i < k_left; ++i )
{
// Iteration 0.
// Load va0_3
va0_3 = _mm256_load_pd( a );
// Load va4_7
va4_7 = _mm256_load_pd( a + 4 );
// Load vb (b0,b1,b2,b3)
vb = _mm256_load_pd( b );
vtmp = _mm256_mul_pd( va0_3, vb );
va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb );
va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp );
// Shuffle vb (b1,b0,b3,b2)
vb = _mm256_shuffle_pd( vb, vb, 0x5 );
vtmp = _mm256_mul_pd( va0_3, vb );
va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb );
va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp );
// Permute vb (b3,b2,b1,b0)
vb = _mm256_permute2f128_pd( vb, vb, 0x1 );
vtmp = _mm256_mul_pd( va0_3, vb );
va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb );
va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp );
// Shuffle vb (b2,b3,b0,b1)
vb = _mm256_shuffle_pd( vb, vb, 0x5 );
vtmp = _mm256_mul_pd( va0_3, vb );
va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp );
vtmp = _mm256_mul_pd( va4_7, vb );
va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp );
a += 8;
b += 4;
}
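// Broadcast beta and alpha for the C update. Because each accumulator was
// formed from a rotated copy of b, its elements belong to different
// columns of the product; the blends and 128-bit permutes below rearrange
// the accumulators so that va0_3b{0..3} and va4_7b{0..3} each hold one
// full column of A*B (rows 0-3 and rows 4-7, respectively), still
// unscaled by alpha.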
vbeta = _mm256_broadcast_sd( beta );
__m256d vtmpa_0_3b_0 = _mm256_blend_pd( va0_3b_0, va0_3b_1, 0x6 );
__m256d vtmpa_0_3b_1 = _mm256_blend_pd( va0_3b_1, va0_3b_0, 0x6 );
__m256d vtmpa_0_3b_2 = _mm256_blend_pd( va0_3b_2, va0_3b_3, 0x6 );
__m256d vtmpa_0_3b_3 = _mm256_blend_pd( va0_3b_3, va0_3b_2, 0x6 );
__m256d vtmpa_4_7b_0 = _mm256_blend_pd( va4_7b_0, va4_7b_1, 0x6 );
__m256d vtmpa_4_7b_1 = _mm256_blend_pd( va4_7b_1, va4_7b_0, 0x6 );
__m256d vtmpa_4_7b_2 = _mm256_blend_pd( va4_7b_2, va4_7b_3, 0x6 );
__m256d vtmpa_4_7b_3 = _mm256_blend_pd( va4_7b_3, va4_7b_2, 0x6 );
valpha = _mm256_broadcast_sd( alpha );
va0_3b0 = _mm256_permute2f128_pd( vtmpa_0_3b_0, vtmpa_0_3b_2, 0x30 );
va0_3b3 = _mm256_permute2f128_pd( vtmpa_0_3b_2, vtmpa_0_3b_0, 0x30 );
va0_3b1 = _mm256_permute2f128_pd( vtmpa_0_3b_1, vtmpa_0_3b_3, 0x30 );
va0_3b2 = _mm256_permute2f128_pd( vtmpa_0_3b_3, vtmpa_0_3b_1, 0x30 );
va4_7b0 = _mm256_permute2f128_pd( vtmpa_4_7b_0, vtmpa_4_7b_2, 0x30 );
va4_7b3 = _mm256_permute2f128_pd( vtmpa_4_7b_2, vtmpa_4_7b_0, 0x30 );
va4_7b1 = _mm256_permute2f128_pd( vtmpa_4_7b_1, vtmpa_4_7b_3, 0x30 );
va4_7b2 = _mm256_permute2f128_pd( vtmpa_4_7b_3, vtmpa_4_7b_1, 0x30 );
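// Update C via one of two paths. When rs_c == 1, each column of the 8x4
// block of C is contiguous in memory, so it is read, scaled by beta,
// combined with alpha times the corresponding product column, and written
// back with 256-bit loads/stores (note that the aligned variants
// _mm256_load_pd/_mm256_store_pd are used, which assume 32-byte-aligned
// column addresses). Otherwise, elements of C are gathered and scattered
// individually.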
if( rs_c == 1 )
{
// Calculate address
c00 = ( c + 0*rs_c + 0*cs_c );
// Load
//vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c );
vc0_3_0 = _mm256_load_pd( c00 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b0);
// Scale by beta
vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 );
// Add gemm result
vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
// Store back to memory
_mm256_store_pd( c00, vc0_3_0 );
// Calculate address
c40 = ( c + 4*rs_c + 0*cs_c );
// Load
//vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c );
vc4_7_0 = _mm256_load_pd( c40 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b0);
// Scale by beta
vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 );
// Add gemm result
vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
// Store back to memory
_mm256_store_pd( c40, vc4_7_0 );
// Calculate address
c01 = ( c + 0*rs_c + 1*cs_c );
// Load
//vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c );
vc0_3_1 = _mm256_load_pd( c01 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b1);
// Scale by beta
vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 );
// Add gemm result
vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
// Store back to memory
_mm256_store_pd( c01, vc0_3_1 );
// Calculate address
c41 = ( c + 4*rs_c + 1*cs_c );
// Load
//vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c );
vc4_7_1 = _mm256_load_pd( c41 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b1);
// Scale by beta
vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 );
// Add gemm result
vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
// Store back to memory
_mm256_store_pd( c41, vc4_7_1 );
// Calculate address
c02 = ( c + 0*rs_c + 2*cs_c );
// Load
//vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c );
vc0_3_2 = _mm256_load_pd( c02 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b2);
// Scale by beta
vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 );
// Add gemm result
vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
// Store back to memory
_mm256_store_pd( c02, vc0_3_2 );
// Calculate address
c42 = ( c + 4*rs_c + 2*cs_c );
// Load
//vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c );
vc4_7_2 = _mm256_load_pd( c42 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b2);
// Scale by beta
vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 );
// Add gemm result
vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
// Store back to memory
_mm256_store_pd( c42, vc4_7_2 );
// Calculate address
c03 = ( c + 0*rs_c + 3*cs_c );
// Load
//vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c );
vc0_3_3 = _mm256_load_pd( c03 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b3);
// Scale by beta
vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 );
// Add gemm result
vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
// Store back to memory
_mm256_store_pd( c03, vc0_3_3 );
// Calculate address
c43 = ( c + 4*rs_c + 3*cs_c );
// Load
//vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c );
vc4_7_3 = _mm256_load_pd( c43 );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b3);
// Scale by beta
vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 );
// Add gemm result
vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
// Store back to memory
_mm256_store_pd( c43, vc4_7_3 );
}
else
{
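// General-stride path: gather each group of four C elements with
// _mm256_set_pd, apply the beta/alpha update, then scatter the result
// back with _mm_storel_pd/_mm_storeh_pd on the two 128-bit halves.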
// Calculate address
c00 = ( c + 0*rs_c + 0*cs_c );
// Load
//vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c );
vc0_3_0 = _mm256_set_pd( *(c + 3*rs_c + 0*cs_c ),
*(c + 2*rs_c + 0*cs_c ),
*(c + 1*rs_c + 0*cs_c ),
*(c + 0*rs_c + 0*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b0);
// Scale by beta
vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 );
// Add gemm result
vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp );
// Store back to memory
//_mm256_store_pd( c00, vc0_3_0 );
aa = _mm256_extractf128_pd( vc0_3_0, 0 ) ;
bb = _mm256_extractf128_pd( vc0_3_0, 1 ) ;
_mm_storel_pd( c + 0*rs_c + 0*cs_c, aa );
_mm_storeh_pd( c + 1*rs_c + 0*cs_c, aa );
_mm_storel_pd( c + 2*rs_c + 0*cs_c, bb );
_mm_storeh_pd( c + 3*rs_c + 0*cs_c, bb );
// Calculate address
c40 = ( c + 4*rs_c + 0*cs_c );
// Load
//vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c );
vc4_7_0 = _mm256_set_pd( *(c + 7*rs_c + 0*cs_c ),
*(c + 6*rs_c + 0*cs_c ),
*(c + 5*rs_c + 0*cs_c ),
*(c + 4*rs_c + 0*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b0);
// Scale by beta
vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 );
// Add gemm result
vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp );
// Store back to memory
//_mm256_store_pd( c40, vc4_7_0 );
aa = _mm256_extractf128_pd( vc4_7_0, 0 ) ;
bb = _mm256_extractf128_pd( vc4_7_0, 1 ) ;
_mm_storel_pd( c + 4*rs_c + 0*cs_c, aa );
_mm_storeh_pd( c + 5*rs_c + 0*cs_c, aa );
_mm_storel_pd( c + 6*rs_c + 0*cs_c, bb );
_mm_storeh_pd( c + 7*rs_c + 0*cs_c, bb );
// Calculate address
c01 = ( c + 0*rs_c + 1*cs_c );
// Load
//vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c );
vc0_3_1 = _mm256_set_pd( *(c + 3*rs_c + 1*cs_c ),
*(c + 2*rs_c + 1*cs_c ),
*(c + 1*rs_c + 1*cs_c ),
*(c + 0*rs_c + 1*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b1);
// Scale by beta
vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 );
// Add gemm result
vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp );
// Store back to memory
//_mm256_store_pd( c01, vc0_3_1 );
aa = _mm256_extractf128_pd( vc0_3_1, 0 ) ;
bb = _mm256_extractf128_pd( vc0_3_1, 1 ) ;
_mm_storel_pd( c + 0*rs_c + 1*cs_c, aa );
_mm_storeh_pd( c + 1*rs_c + 1*cs_c, aa );
_mm_storel_pd( c + 2*rs_c + 1*cs_c, bb );
_mm_storeh_pd( c + 3*rs_c + 1*cs_c, bb );
// Calculate address
c41 = ( c + 4*rs_c + 1*cs_c );
// Load
//vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c );
vc4_7_1 = _mm256_set_pd( *(c + 7*rs_c + 1*cs_c ),
*(c + 6*rs_c + 1*cs_c ),
*(c + 5*rs_c + 1*cs_c ),
*(c + 4*rs_c + 1*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b1);
// Scale by beta
vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 );
// Add gemm result
vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp );
// Store back to memory
//_mm256_store_pd( c41, vc4_7_1 );
aa = _mm256_extractf128_pd( vc4_7_1, 0 ) ;
bb = _mm256_extractf128_pd( vc4_7_1, 1 ) ;
_mm_storel_pd( c + 4*rs_c + 1*cs_c, aa );
_mm_storeh_pd( c + 5*rs_c + 1*cs_c, aa );
_mm_storel_pd( c + 6*rs_c + 1*cs_c, bb );
_mm_storeh_pd( c + 7*rs_c + 1*cs_c, bb );
// Calculate address
c02 = ( c + 0*rs_c + 2*cs_c );
// Load
//vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c );
vc0_3_2 = _mm256_set_pd( *(c + 3*rs_c + 2*cs_c ),
*(c + 2*rs_c + 2*cs_c ),
*(c + 1*rs_c + 2*cs_c ),
*(c + 0*rs_c + 2*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b2);
// Scale by beta
vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 );
// Add gemm result
vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp );
// Store back to memory
//_mm256_store_pd( c02, vc0_3_2 );
aa = _mm256_extractf128_pd( vc0_3_2, 0 ) ;
bb = _mm256_extractf128_pd( vc0_3_2, 1 ) ;
_mm_storel_pd( c + 0*rs_c + 2*cs_c, aa );
_mm_storeh_pd( c + 1*rs_c + 2*cs_c, aa );
_mm_storel_pd( c + 2*rs_c + 2*cs_c, bb );
_mm_storeh_pd( c + 3*rs_c + 2*cs_c, bb );
// Calculate address
c42 = ( c + 4*rs_c + 2*cs_c );
// Load
//vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c );
vc4_7_2 = _mm256_set_pd( *(c + 7*rs_c + 2*cs_c ),
*(c + 6*rs_c + 2*cs_c ),
*(c + 5*rs_c + 2*cs_c ),
*(c + 4*rs_c + 2*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b2);
// Scale by beta
vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 );
// Add gemm result
vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp );
// Store back to memory
//_mm256_store_pd( c42, vc4_7_2 );
aa = _mm256_extractf128_pd( vc4_7_2, 0 ) ;
bb = _mm256_extractf128_pd( vc4_7_2, 1 ) ;
_mm_storel_pd( c + 4*rs_c + 2*cs_c, aa );
_mm_storeh_pd( c + 5*rs_c + 2*cs_c, aa );
_mm_storel_pd( c + 6*rs_c + 2*cs_c, bb );
_mm_storeh_pd( c + 7*rs_c + 2*cs_c, bb );
// Calculate address
c03 = ( c + 0*rs_c + 3*cs_c );
// Load
//vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c );
vc0_3_3 = _mm256_set_pd( *(c + 3*rs_c + 3*cs_c ),
*(c + 2*rs_c + 3*cs_c ),
*(c + 1*rs_c + 3*cs_c ),
*(c + 0*rs_c + 3*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va0_3b3);
// Scale by beta
vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 );
// Add gemm result
vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp );
// Store back to memory
//_mm256_store_pd( c03, vc0_3_3 );
aa = _mm256_extractf128_pd( vc0_3_3, 0 ) ;
bb = _mm256_extractf128_pd( vc0_3_3, 1 ) ;
_mm_storel_pd( c + 0*rs_c + 3*cs_c, aa );
_mm_storeh_pd( c + 1*rs_c + 3*cs_c, aa );
_mm_storel_pd( c + 2*rs_c + 3*cs_c, bb );
_mm_storeh_pd( c + 3*rs_c + 3*cs_c, bb );
// Calculate address
c43 = ( c + 4*rs_c + 3*cs_c );
// Load
//vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c );
vc4_7_3 = _mm256_set_pd( *(c + 7*rs_c + 3*cs_c ),
*(c + 6*rs_c + 3*cs_c ),
*(c + 5*rs_c + 3*cs_c ),
*(c + 4*rs_c + 3*cs_c ) );
// Scale by alpha
vtmp = _mm256_mul_pd( valpha, va4_7b3);
// Scale by beta
vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 );
// Add gemm result
vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp );
// Store back to memory
//_mm256_store_pd( c43, vc4_7_3 );
aa = _mm256_extractf128_pd( vc4_7_3, 0 ) ;
bb = _mm256_extractf128_pd( vc4_7_3, 1 ) ;
_mm_storel_pd( c + 4*rs_c + 3*cs_c, aa );
_mm_storeh_pd( c + 5*rs_c + 3*cs_c, aa );
_mm_storel_pd( c + 6*rs_c + 3*cs_c, bb );
_mm_storeh_pd( c + 7*rs_c + 3*cs_c, bb );
}
}
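/*
   The single-precision complex and double-precision complex microkernels
   below are unimplemented placeholders, disabled via '#if 0'.
*/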
#if 0
void bli_cgemm_sandybridge_int_8x4
     (
       dim_t               k0,
       scomplex*  restrict alpha,
       scomplex*  restrict a,
       scomplex*  restrict b,
       scomplex*  restrict beta,
       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
}
#endif
#if 0
void bli_zgemm_sandybridge_int_4x4
     (
       dim_t               k0,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a,
       dcomplex*  restrict b,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
}
#endif