mirror of
https://github.com/amd/blis.git
synced 2026-05-13 18:52:14 +00:00
Armv8-A Introduced s/d Packing Kernels
Sizes according to the 2014 kernels.
This commit is contained in:
323
kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
Normal file
323
kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c
Normal file
@@ -0,0 +1,323 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Linaro Limited
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
#if defined(__clang__)
|
||||
#define PRAGMA_NOUNROLL _Pragma("nounroll")
|
||||
#define PRAGMA_UNROLL_2 _Pragma("unroll 2")
|
||||
#elif define(__GNUC__)
|
||||
#define PRAGMA_NOUNROLL _Pragma("GCC nounroll")
|
||||
#define PRAGMA_UNROLL_2 _Pragma("GCC unroll 2")
|
||||
#else
|
||||
#define PRAGMA_NOUNROLL
|
||||
#define PRAGMA_UNROLL_2
|
||||
#endif
|
||||
|
||||
void bli_dpackm_armv8a_int_6xk
|
||||
(
|
||||
conj_t conja,
|
||||
pack_t schema,
|
||||
dim_t cdim0,
|
||||
dim_t k0,
|
||||
dim_t k0_max,
|
||||
double* restrict kappa,
|
||||
double* restrict a, inc_t inca0, inc_t lda0,
|
||||
double* restrict p, inc_t ldp0,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
// This is the panel dimension assumed by the packm kernel.
|
||||
const dim_t mnr = 6;
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_iter = k0 / 2;
|
||||
uint64_t k_left = k0 % 2;
|
||||
double* a_loc = a;
|
||||
double* p_loc = p;
|
||||
|
||||
// NOTE: For the purposes of the comments in this packm kernel, we
|
||||
// interpret inca and lda as rs_a and cs_a, respectively, and similarly
|
||||
// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
|
||||
// this packm kernel, you should think of the operation as packing an
|
||||
// m x n micropanel, where m and n are tiny and large, respectively, and
|
||||
// where elements of each column of the packed matrix P are contiguous.
|
||||
// (This packm kernel can still be used to pack micropanels of matrix B
|
||||
// in a gemm operation.)
|
||||
const uint64_t inca = inca0;
|
||||
const uint64_t lda = lda0;
|
||||
const uint64_t ldp = ldp0;
|
||||
|
||||
const bool gs = ( inca0 != 1 && lda0 != 1 );
|
||||
|
||||
// NOTE: If/when this kernel ever supports scaling by kappa within the
|
||||
// assembly region, this constraint should be lifted.
|
||||
const bool unitk = bli_deq1( *kappa );
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
if ( cdim0 == mnr && !gs )
|
||||
{
|
||||
if ( unitk )
|
||||
{
|
||||
if ( inca == 1 )
|
||||
{
|
||||
// No need to use k-loops here.
|
||||
// Simply let compiler to expand loops.
|
||||
PRAGMA_UNROLL_2
|
||||
for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik )
|
||||
{
|
||||
poly128_t v0 = vldrq_p128( a_loc + 0 );
|
||||
poly128_t v1 = vldrq_p128( a_loc + 2 );
|
||||
poly128_t v2 = vldrq_p128( a_loc + 4 );
|
||||
|
||||
vstrq_p128( p_loc + 0, v0 );
|
||||
vstrq_p128( p_loc + 2, v1 );
|
||||
vstrq_p128( p_loc + 4, v2 );
|
||||
|
||||
a_loc += lda;
|
||||
p_loc += ldp;
|
||||
}
|
||||
}
|
||||
else // if ( lda == 1 )
|
||||
{
|
||||
float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
|
||||
PRAGMA_NOUNROLL
|
||||
for ( ; k_iter > 0; --k_iter )
|
||||
{
|
||||
v0 = (float64x2_t)vldrq_p128( a_loc + inca * 0 );
|
||||
v1 = (float64x2_t)vldrq_p128( a_loc + inca * 1 );
|
||||
v2 = (float64x2_t)vldrq_p128( a_loc + inca * 2 );
|
||||
v3 = (float64x2_t)vldrq_p128( a_loc + inca * 3 );
|
||||
v4 = (float64x2_t)vldrq_p128( a_loc + inca * 4 );
|
||||
v5 = (float64x2_t)vldrq_p128( a_loc + inca * 5 );
|
||||
|
||||
// In-register transpose.
|
||||
float64x2_t vd0_1 = vtrn1q_f64( v0, v1 );
|
||||
float64x2_t vd1_1 = vtrn1q_f64( v2, v3 );
|
||||
float64x2_t vd2_1 = vtrn1q_f64( v4, v5 );
|
||||
float64x2_t vd0_2 = vtrn2q_f64( v0, v1 );
|
||||
float64x2_t vd1_2 = vtrn2q_f64( v2, v3 );
|
||||
float64x2_t vd2_2 = vtrn2q_f64( v4, v5 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)vd0_1 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)vd1_1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)vd2_1 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)vd0_2 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)vd1_2 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)vd2_2 );
|
||||
p_loc += ldp;
|
||||
a_loc += 2 * lda; // 2;
|
||||
}
|
||||
for ( ; k_left > 0; --k_left )
|
||||
{
|
||||
v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 );
|
||||
v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 );
|
||||
v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 );
|
||||
v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 );
|
||||
v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 );
|
||||
v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v2 );
|
||||
p_loc += ldp;
|
||||
a_loc += lda; // 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if ( !unitk )
|
||||
{
|
||||
float64x2_t vkappa = vld1q_dup_f64( kappa );
|
||||
|
||||
if ( inca == 1 )
|
||||
{
|
||||
// No need to use k-loops here.
|
||||
// Simply let compiler to expand loops.
|
||||
PRAGMA_UNROLL_2
|
||||
for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik )
|
||||
{
|
||||
float64x2_t v0 = (float64x2_t)vldrq_p128( a_loc + 0 );
|
||||
float64x2_t v1 = (float64x2_t)vldrq_p128( a_loc + 2 );
|
||||
float64x2_t v2 = (float64x2_t)vldrq_p128( a_loc + 4 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f64( v0, vkappa );
|
||||
v1 = vmulq_f64( v1, vkappa );
|
||||
v2 = vmulq_f64( v2, vkappa );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v2 );
|
||||
|
||||
a_loc += lda;
|
||||
p_loc += ldp;
|
||||
}
|
||||
}
|
||||
else // if ( lda == 1 )
|
||||
{
|
||||
float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
|
||||
PRAGMA_NOUNROLL
|
||||
for ( ; k_iter > 0; --k_iter )
|
||||
{
|
||||
v0 = (float64x2_t)vldrq_p128( a_loc + inca * 0 );
|
||||
v1 = (float64x2_t)vldrq_p128( a_loc + inca * 1 );
|
||||
v2 = (float64x2_t)vldrq_p128( a_loc + inca * 2 );
|
||||
v3 = (float64x2_t)vldrq_p128( a_loc + inca * 3 );
|
||||
v4 = (float64x2_t)vldrq_p128( a_loc + inca * 4 );
|
||||
v5 = (float64x2_t)vldrq_p128( a_loc + inca * 5 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f64( v0, vkappa );
|
||||
v1 = vmulq_f64( v1, vkappa );
|
||||
v2 = vmulq_f64( v2, vkappa );
|
||||
v3 = vmulq_f64( v3, vkappa );
|
||||
v4 = vmulq_f64( v4, vkappa );
|
||||
v5 = vmulq_f64( v5, vkappa );
|
||||
|
||||
// In-register transpose.
|
||||
float64x2_t vd0_1 = vtrn1q_f64( v0, v1 );
|
||||
float64x2_t vd1_1 = vtrn1q_f64( v2, v3 );
|
||||
float64x2_t vd2_1 = vtrn1q_f64( v4, v5 );
|
||||
float64x2_t vd0_2 = vtrn2q_f64( v0, v1 );
|
||||
float64x2_t vd1_2 = vtrn2q_f64( v2, v3 );
|
||||
float64x2_t vd2_2 = vtrn2q_f64( v4, v5 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)vd0_1 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)vd1_1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)vd2_1 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)vd0_2 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)vd1_2 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)vd2_2 );
|
||||
p_loc += ldp;
|
||||
a_loc += 2 * lda; // 2;
|
||||
}
|
||||
for ( ; k_left > 0; --k_left )
|
||||
{
|
||||
v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 );
|
||||
v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 );
|
||||
v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 );
|
||||
v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 );
|
||||
v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 );
|
||||
v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f64( v0, vkappa );
|
||||
v1 = vmulq_f64( v1, vkappa );
|
||||
v2 = vmulq_f64( v2, vkappa );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v2 );
|
||||
p_loc += ldp;
|
||||
a_loc += lda; // 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if ( cdim0 < mnr || gs )
|
||||
{
|
||||
PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
0,
|
||||
BLIS_NONUNIT_DIAG,
|
||||
BLIS_DENSE,
|
||||
( trans_t )conja,
|
||||
cdim0,
|
||||
k0,
|
||||
kappa,
|
||||
a, inca0, lda0,
|
||||
p, 1, ldp0,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if ( cdim0 < mnr )
|
||||
{
|
||||
// Handle zero-filling along the "long" edge of the micropanel.
|
||||
|
||||
const dim_t i = cdim0;
|
||||
const dim_t m_edge = mnr - cdim0;
|
||||
const dim_t n_edge = k0_max;
|
||||
double* restrict p_edge = p + (i )*1;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
//bli_dfprintm( stdout, "packm 6xk ker: a_packed", cdim0, k0_max, p, 1, ldp0, "%5.2f", "" );
|
||||
|
||||
if ( k0 < k0_max )
|
||||
{
|
||||
// Handle zero-filling along the "short" (far) edge of the micropanel.
|
||||
|
||||
const dim_t j = k0;
|
||||
const dim_t m_edge = mnr;
|
||||
const dim_t n_edge = k0_max - k0;
|
||||
double* restrict p_edge = p + (j )*ldp;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
353
kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
Normal file
353
kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c
Normal file
@@ -0,0 +1,353 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Linaro Limited
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
#if defined(__clang__)
|
||||
#define PRAGMA_NOUNROLL _Pragma("nounroll")
|
||||
#define PRAGMA_UNROLL_2 _Pragma("unroll 2")
|
||||
#elif define(__GNUC__)
|
||||
#define PRAGMA_NOUNROLL _Pragma("GCC nounroll")
|
||||
#define PRAGMA_UNROLL_2 _Pragma("GCC unroll 2")
|
||||
#else
|
||||
#define PRAGMA_NOUNROLL
|
||||
#define PRAGMA_UNROLL_2
|
||||
#endif
|
||||
|
||||
void bli_dpackm_armv8a_int_8xk
|
||||
(
|
||||
conj_t conja,
|
||||
pack_t schema,
|
||||
dim_t cdim0,
|
||||
dim_t k0,
|
||||
dim_t k0_max,
|
||||
double* restrict kappa,
|
||||
double* restrict a, inc_t inca0, inc_t lda0,
|
||||
double* restrict p, inc_t ldp0,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
// This is the panel dimension assumed by the packm kernel.
|
||||
const dim_t mnr = 8;
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_iter = k0 / 2;
|
||||
uint64_t k_left = k0 % 2;
|
||||
double* a_loc = a;
|
||||
double* p_loc = p;
|
||||
|
||||
// NOTE: For the purposes of the comments in this packm kernel, we
|
||||
// interpret inca and lda as rs_a and cs_a, respectively, and similarly
|
||||
// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
|
||||
// this packm kernel, you should think of the operation as packing an
|
||||
// m x n micropanel, where m and n are tiny and large, respectively, and
|
||||
// where elements of each column of the packed matrix P are contiguous.
|
||||
// (This packm kernel can still be used to pack micropanels of matrix B
|
||||
// in a gemm operation.)
|
||||
const uint64_t inca = inca0;
|
||||
const uint64_t lda = lda0;
|
||||
const uint64_t ldp = ldp0;
|
||||
|
||||
const bool gs = ( inca0 != 1 && lda0 != 1 );
|
||||
|
||||
// NOTE: If/when this kernel ever supports scaling by kappa within the
|
||||
// assembly region, this constraint should be lifted.
|
||||
const bool unitk = bli_deq1( *kappa );
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
if ( cdim0 == mnr && !gs )
|
||||
{
|
||||
if ( unitk )
|
||||
{
|
||||
if ( inca == 1 )
|
||||
{
|
||||
// No need to use k-loops here.
|
||||
// Simply let compiler to expand loops.
|
||||
PRAGMA_UNROLL_2
|
||||
for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik )
|
||||
{
|
||||
poly128_t v0 = vldrq_p128( a_loc + 0 );
|
||||
poly128_t v1 = vldrq_p128( a_loc + 2 );
|
||||
poly128_t v2 = vldrq_p128( a_loc + 4 );
|
||||
poly128_t v3 = vldrq_p128( a_loc + 6 );
|
||||
|
||||
vstrq_p128( p_loc + 0, v0 );
|
||||
vstrq_p128( p_loc + 2, v1 );
|
||||
vstrq_p128( p_loc + 4, v2 );
|
||||
vstrq_p128( p_loc + 6, v3 );
|
||||
|
||||
a_loc += lda;
|
||||
p_loc += ldp;
|
||||
}
|
||||
}
|
||||
else // if ( lda == 1 )
|
||||
{
|
||||
float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v6 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v7 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
|
||||
PRAGMA_NOUNROLL
|
||||
for ( ; k_iter > 0; --k_iter )
|
||||
{
|
||||
v0 = (float64x2_t)vldrq_p128( a_loc + inca * 0 );
|
||||
v1 = (float64x2_t)vldrq_p128( a_loc + inca * 1 );
|
||||
v2 = (float64x2_t)vldrq_p128( a_loc + inca * 2 );
|
||||
v3 = (float64x2_t)vldrq_p128( a_loc + inca * 3 );
|
||||
v4 = (float64x2_t)vldrq_p128( a_loc + inca * 4 );
|
||||
v5 = (float64x2_t)vldrq_p128( a_loc + inca * 5 );
|
||||
v6 = (float64x2_t)vldrq_p128( a_loc + inca * 6 );
|
||||
v7 = (float64x2_t)vldrq_p128( a_loc + inca * 7 );
|
||||
|
||||
// In-register transpose.
|
||||
float64x2_t vd0_1 = vtrn1q_f64( v0, v1 );
|
||||
float64x2_t vd1_1 = vtrn1q_f64( v2, v3 );
|
||||
float64x2_t vd2_1 = vtrn1q_f64( v4, v5 );
|
||||
float64x2_t vd3_1 = vtrn1q_f64( v6, v7 );
|
||||
float64x2_t vd0_2 = vtrn2q_f64( v0, v1 );
|
||||
float64x2_t vd1_2 = vtrn2q_f64( v2, v3 );
|
||||
float64x2_t vd2_2 = vtrn2q_f64( v4, v5 );
|
||||
float64x2_t vd3_2 = vtrn2q_f64( v6, v7 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)vd0_1 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)vd1_1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)vd2_1 );
|
||||
vstrq_p128( p_loc + 6, (poly128_t)vd3_1 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)vd0_2 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)vd1_2 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)vd2_2 );
|
||||
vstrq_p128( p_loc + 6, (poly128_t)vd3_2 );
|
||||
p_loc += ldp;
|
||||
a_loc += 2 * lda; // 2;
|
||||
}
|
||||
for ( ; k_left > 0; --k_left )
|
||||
{
|
||||
v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 );
|
||||
v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 );
|
||||
v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 );
|
||||
v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 );
|
||||
v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 );
|
||||
v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 );
|
||||
v3 = vld1q_lane_f64( a_loc + inca * 6, v3, 0 );
|
||||
v3 = vld1q_lane_f64( a_loc + inca * 7, v3, 1 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v2 );
|
||||
vstrq_p128( p_loc + 6, (poly128_t)v3 );
|
||||
p_loc += ldp;
|
||||
a_loc += lda; // 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if ( !unitk )
|
||||
{
|
||||
float64x2_t vkappa = vld1q_dup_f64( kappa );
|
||||
|
||||
if ( inca == 1 )
|
||||
{
|
||||
// No need to use k-loops here.
|
||||
// Simply let compiler to expand loops.
|
||||
PRAGMA_UNROLL_2
|
||||
for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik )
|
||||
{
|
||||
float64x2_t v0 = (float64x2_t)vldrq_p128( a_loc + 0 );
|
||||
float64x2_t v1 = (float64x2_t)vldrq_p128( a_loc + 2 );
|
||||
float64x2_t v2 = (float64x2_t)vldrq_p128( a_loc + 4 );
|
||||
float64x2_t v3 = (float64x2_t)vldrq_p128( a_loc + 6 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f64( v0, vkappa );
|
||||
v1 = vmulq_f64( v1, vkappa );
|
||||
v2 = vmulq_f64( v2, vkappa );
|
||||
v3 = vmulq_f64( v3, vkappa );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v2 );
|
||||
vstrq_p128( p_loc + 6, (poly128_t)v3 );
|
||||
|
||||
a_loc += lda;
|
||||
p_loc += ldp;
|
||||
}
|
||||
}
|
||||
else // if ( lda == 1 )
|
||||
{
|
||||
float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v6 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
float64x2_t v7 = (float64x2_t)vdupq_n_u64( 0 );
|
||||
|
||||
PRAGMA_NOUNROLL
|
||||
for ( ; k_iter > 0; --k_iter )
|
||||
{
|
||||
v0 = (float64x2_t)vldrq_p128( a_loc + inca * 0 );
|
||||
v1 = (float64x2_t)vldrq_p128( a_loc + inca * 1 );
|
||||
v2 = (float64x2_t)vldrq_p128( a_loc + inca * 2 );
|
||||
v3 = (float64x2_t)vldrq_p128( a_loc + inca * 3 );
|
||||
v4 = (float64x2_t)vldrq_p128( a_loc + inca * 4 );
|
||||
v5 = (float64x2_t)vldrq_p128( a_loc + inca * 5 );
|
||||
v6 = (float64x2_t)vldrq_p128( a_loc + inca * 6 );
|
||||
v7 = (float64x2_t)vldrq_p128( a_loc + inca * 7 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f64( v0, vkappa );
|
||||
v1 = vmulq_f64( v1, vkappa );
|
||||
v2 = vmulq_f64( v2, vkappa );
|
||||
v3 = vmulq_f64( v3, vkappa );
|
||||
v4 = vmulq_f64( v4, vkappa );
|
||||
v5 = vmulq_f64( v5, vkappa );
|
||||
v6 = vmulq_f64( v6, vkappa );
|
||||
v7 = vmulq_f64( v7, vkappa );
|
||||
|
||||
// In-register transpose.
|
||||
float64x2_t vd0_1 = vtrn1q_f64( v0, v1 );
|
||||
float64x2_t vd1_1 = vtrn1q_f64( v2, v3 );
|
||||
float64x2_t vd2_1 = vtrn1q_f64( v4, v5 );
|
||||
float64x2_t vd3_1 = vtrn1q_f64( v6, v7 );
|
||||
float64x2_t vd0_2 = vtrn2q_f64( v0, v1 );
|
||||
float64x2_t vd1_2 = vtrn2q_f64( v2, v3 );
|
||||
float64x2_t vd2_2 = vtrn2q_f64( v4, v5 );
|
||||
float64x2_t vd3_2 = vtrn2q_f64( v6, v7 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)vd0_1 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)vd1_1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)vd2_1 );
|
||||
vstrq_p128( p_loc + 6, (poly128_t)vd3_1 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)vd0_2 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)vd1_2 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)vd2_2 );
|
||||
vstrq_p128( p_loc + 6, (poly128_t)vd3_2 );
|
||||
p_loc += ldp;
|
||||
a_loc += 2 * lda; // 2;
|
||||
}
|
||||
for ( ; k_left > 0; --k_left )
|
||||
{
|
||||
v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 );
|
||||
v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 );
|
||||
v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 );
|
||||
v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 );
|
||||
v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 );
|
||||
v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 );
|
||||
v3 = vld1q_lane_f64( a_loc + inca * 6, v3, 0 );
|
||||
v3 = vld1q_lane_f64( a_loc + inca * 7, v3, 1 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f64( v0, vkappa );
|
||||
v1 = vmulq_f64( v1, vkappa );
|
||||
v2 = vmulq_f64( v2, vkappa );
|
||||
v3 = vmulq_f64( v3, vkappa );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 2, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v2 );
|
||||
vstrq_p128( p_loc + 6, (poly128_t)v3 );
|
||||
p_loc += ldp;
|
||||
a_loc += lda; // 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if ( cdim0 < mnr || gs )
|
||||
{
|
||||
PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
0,
|
||||
BLIS_NONUNIT_DIAG,
|
||||
BLIS_DENSE,
|
||||
( trans_t )conja,
|
||||
cdim0,
|
||||
k0,
|
||||
kappa,
|
||||
a, inca0, lda0,
|
||||
p, 1, ldp0,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if ( cdim0 < mnr )
|
||||
{
|
||||
// Handle zero-filling along the "long" edge of the micropanel.
|
||||
|
||||
const dim_t i = cdim0;
|
||||
const dim_t m_edge = mnr - cdim0;
|
||||
const dim_t n_edge = k0_max;
|
||||
double* restrict p_edge = p + (i )*1;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
//bli_dfprintm( stdout, "packm 8xk ker: a_packed", cdim0, k0_max, p, 1, ldp0, "%5.2f", "" );
|
||||
|
||||
if ( k0 < k0_max )
|
||||
{
|
||||
// Handle zero-filling along the "short" (far) edge of the micropanel.
|
||||
|
||||
const dim_t j = k0;
|
||||
const dim_t m_edge = mnr;
|
||||
const dim_t n_edge = k0_max - k0;
|
||||
double* restrict p_edge = p + (j )*ldp;
|
||||
|
||||
bli_dset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
435
kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
Normal file
435
kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c
Normal file
@@ -0,0 +1,435 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Linaro Limited
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
#if defined(__clang__)
|
||||
#define PRAGMA_NOUNROLL _Pragma("nounroll")
|
||||
#define PRAGMA_UNROLL_2 _Pragma("unroll 2")
|
||||
#elif define(__GNUC__)
|
||||
#define PRAGMA_NOUNROLL _Pragma("GCC nounroll")
|
||||
#define PRAGMA_UNROLL_2 _Pragma("GCC unroll 2")
|
||||
#else
|
||||
#define PRAGMA_NOUNROLL
|
||||
#define PRAGMA_UNROLL_2
|
||||
#endif
|
||||
|
||||
void bli_spackm_armv8a_int_12xk
|
||||
(
|
||||
conj_t conja,
|
||||
pack_t schema,
|
||||
dim_t cdim0,
|
||||
dim_t k0,
|
||||
dim_t k0_max,
|
||||
float* restrict kappa,
|
||||
float* restrict a, inc_t inca0, inc_t lda0,
|
||||
float* restrict p, inc_t ldp0,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
// This is the panel dimension assumed by the packm kernel.
|
||||
const dim_t mnr = 12;
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_iter = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
float* a_loc = a;
|
||||
float* p_loc = p;
|
||||
|
||||
// NOTE: For the purposes of the comments in this packm kernel, we
|
||||
// interpret inca and lda as rs_a and cs_a, respectively, and similarly
|
||||
// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
|
||||
// this packm kernel, you should think of the operation as packing an
|
||||
// m x n micropanel, where m and n are tiny and large, respectively, and
|
||||
// where elements of each column of the packed matrix P are contiguous.
|
||||
// (This packm kernel can still be used to pack micropanels of matrix B
|
||||
// in a gemm operation.)
|
||||
const uint64_t inca = inca0;
|
||||
const uint64_t lda = lda0;
|
||||
const uint64_t ldp = ldp0;
|
||||
|
||||
const bool gs = ( inca0 != 1 && lda0 != 1 );
|
||||
|
||||
// NOTE: If/when this kernel ever supports scaling by kappa within the
|
||||
// assembly region, this constraint should be lifted.
|
||||
const bool unitk = bli_seq1( *kappa );
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
if ( cdim0 == mnr && !gs )
|
||||
{
|
||||
if ( unitk )
|
||||
{
|
||||
if ( inca == 1 )
|
||||
{
|
||||
// No need to use k-loops here.
|
||||
// Simply let compiler to expand loops.
|
||||
PRAGMA_UNROLL_2
|
||||
for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik )
|
||||
{
|
||||
poly128_t v0 = vldrq_p128( a_loc + 0 );
|
||||
poly128_t v1 = vldrq_p128( a_loc + 4 );
|
||||
poly128_t v2 = vldrq_p128( a_loc + 8 );
|
||||
|
||||
vstrq_p128( p_loc + 0, v0 );
|
||||
vstrq_p128( p_loc + 4, v1 );
|
||||
vstrq_p128( p_loc + 8, v2 );
|
||||
|
||||
a_loc += lda;
|
||||
p_loc += ldp;
|
||||
}
|
||||
}
|
||||
else // if ( lda == 1 )
|
||||
{
|
||||
float32x4_t v0 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v1 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v2 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v3 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v4 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v5 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v6 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v7 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v8 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v9 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v10 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v11 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t vt0;
|
||||
float32x4_t vt1;
|
||||
float32x4_t vt2;
|
||||
float32x4_t vt3;
|
||||
|
||||
PRAGMA_NOUNROLL
|
||||
for ( ; k_iter > 0; --k_iter )
|
||||
{
|
||||
v0 = (float32x4_t)vldrq_p128( a_loc + inca * 0 );
|
||||
v1 = (float32x4_t)vldrq_p128( a_loc + inca * 1 );
|
||||
v2 = (float32x4_t)vldrq_p128( a_loc + inca * 2 );
|
||||
v3 = (float32x4_t)vldrq_p128( a_loc + inca * 3 );
|
||||
v4 = (float32x4_t)vldrq_p128( a_loc + inca * 4 );
|
||||
v5 = (float32x4_t)vldrq_p128( a_loc + inca * 5 );
|
||||
v6 = (float32x4_t)vldrq_p128( a_loc + inca * 6 );
|
||||
v7 = (float32x4_t)vldrq_p128( a_loc + inca * 7 );
|
||||
v8 = (float32x4_t)vldrq_p128( a_loc + inca * 8 );
|
||||
v9 = (float32x4_t)vldrq_p128( a_loc + inca * 9 );
|
||||
v10 = (float32x4_t)vldrq_p128( a_loc + inca * 10 );
|
||||
v11 = (float32x4_t)vldrq_p128( a_loc + inca * 11 );
|
||||
|
||||
// In-register transpose.
|
||||
//
|
||||
// Column 0-3
|
||||
vt0 = vtrn1q_f32( v0, v1 );
|
||||
vt1 = vtrn2q_f32( v0, v1 );
|
||||
vt2 = vtrn1q_f32( v2, v3 );
|
||||
vt3 = vtrn2q_f32( v2, v3 );
|
||||
v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
// Column 4-7
|
||||
vt0 = vtrn1q_f32( v4, v5 );
|
||||
vt1 = vtrn2q_f32( v4, v5 );
|
||||
vt2 = vtrn1q_f32( v6, v7 );
|
||||
vt3 = vtrn2q_f32( v6, v7 );
|
||||
v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
// Column 8-11
|
||||
vt0 = vtrn1q_f32( v8, v9 );
|
||||
vt1 = vtrn2q_f32( v8, v9 );
|
||||
vt2 = vtrn1q_f32( v10, v11 );
|
||||
vt3 = vtrn2q_f32( v10, v11 );
|
||||
v8 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v9 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
v10 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v11 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v4 );
|
||||
vstrq_p128( p_loc + 8, (poly128_t)v8 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v5 );
|
||||
vstrq_p128( p_loc + 8, (poly128_t)v9 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v2 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v6 );
|
||||
vstrq_p128( p_loc + 8, (poly128_t)v10 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v3 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v7 );
|
||||
vstrq_p128( p_loc + 8, (poly128_t)v11 );
|
||||
p_loc += ldp;
|
||||
a_loc += 4 * lda; // 4;
|
||||
}
|
||||
for ( ; k_left > 0; --k_left )
|
||||
{
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 );
|
||||
v2 = vld1q_lane_f32( a_loc + inca * 8 , v2, 0 );
|
||||
v2 = vld1q_lane_f32( a_loc + inca * 9 , v2, 1 );
|
||||
v2 = vld1q_lane_f32( a_loc + inca * 10, v2, 2 );
|
||||
v2 = vld1q_lane_f32( a_loc + inca * 11, v2, 3 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 8, (poly128_t)v2 );
|
||||
p_loc += ldp;
|
||||
a_loc += lda; // 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if ( !unitk )
|
||||
{
|
||||
float32x4_t vkappa = vld1q_dup_f32( kappa );
|
||||
|
||||
if ( inca == 1 )
|
||||
{
|
||||
// No need to use k-loops here.
|
||||
// Simply let compiler to expand loops.
|
||||
PRAGMA_UNROLL_2
|
||||
for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik )
|
||||
{
|
||||
float32x4_t v0 = (float32x4_t)vldrq_p128( a_loc + 0 );
|
||||
float32x4_t v1 = (float32x4_t)vldrq_p128( a_loc + 4 );
|
||||
float32x4_t v2 = (float32x4_t)vldrq_p128( a_loc + 8 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f32( v0, vkappa );
|
||||
v1 = vmulq_f32( v1, vkappa );
|
||||
v2 = vmulq_f32( v2, vkappa );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 8, (poly128_t)v2 );
|
||||
|
||||
a_loc += lda;
|
||||
p_loc += ldp;
|
||||
}
|
||||
}
|
||||
else // if ( lda == 1 )
|
||||
{
|
||||
float32x4_t v0 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v1 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v2 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v3 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v4 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v5 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v6 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v7 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v8 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v9 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v10 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v11 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t vt0;
|
||||
float32x4_t vt1;
|
||||
float32x4_t vt2;
|
||||
float32x4_t vt3;
|
||||
|
||||
PRAGMA_NOUNROLL
|
||||
for ( ; k_iter > 0; --k_iter )
|
||||
{
|
||||
v0 = (float32x4_t)vldrq_p128( a_loc + inca * 0 );
|
||||
v1 = (float32x4_t)vldrq_p128( a_loc + inca * 1 );
|
||||
v2 = (float32x4_t)vldrq_p128( a_loc + inca * 2 );
|
||||
v3 = (float32x4_t)vldrq_p128( a_loc + inca * 3 );
|
||||
v4 = (float32x4_t)vldrq_p128( a_loc + inca * 4 );
|
||||
v5 = (float32x4_t)vldrq_p128( a_loc + inca * 5 );
|
||||
v6 = (float32x4_t)vldrq_p128( a_loc + inca * 6 );
|
||||
v7 = (float32x4_t)vldrq_p128( a_loc + inca * 7 );
|
||||
v8 = (float32x4_t)vldrq_p128( a_loc + inca * 8 );
|
||||
v9 = (float32x4_t)vldrq_p128( a_loc + inca * 9 );
|
||||
v10 = (float32x4_t)vldrq_p128( a_loc + inca * 10 );
|
||||
v11 = (float32x4_t)vldrq_p128( a_loc + inca * 11 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f32( v0, vkappa );
|
||||
v1 = vmulq_f32( v1, vkappa );
|
||||
v2 = vmulq_f32( v2, vkappa );
|
||||
v3 = vmulq_f32( v3, vkappa );
|
||||
v4 = vmulq_f32( v4, vkappa );
|
||||
v5 = vmulq_f32( v5, vkappa );
|
||||
v6 = vmulq_f32( v6, vkappa );
|
||||
v7 = vmulq_f32( v7, vkappa );
|
||||
v8 = vmulq_f32( v8, vkappa );
|
||||
v9 = vmulq_f32( v9, vkappa );
|
||||
v10 = vmulq_f32( v10, vkappa );
|
||||
v11 = vmulq_f32( v11, vkappa );
|
||||
|
||||
// In-register transpose.
|
||||
//
|
||||
// Column 0-3
|
||||
vt0 = vtrn1q_f32( v0, v1 );
|
||||
vt1 = vtrn2q_f32( v0, v1 );
|
||||
vt2 = vtrn1q_f32( v2, v3 );
|
||||
vt3 = vtrn2q_f32( v2, v3 );
|
||||
v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
// Column 4-7
|
||||
vt0 = vtrn1q_f32( v4, v5 );
|
||||
vt1 = vtrn2q_f32( v4, v5 );
|
||||
vt2 = vtrn1q_f32( v6, v7 );
|
||||
vt3 = vtrn2q_f32( v6, v7 );
|
||||
v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
// Column 8-11
|
||||
vt0 = vtrn1q_f32( v8, v9 );
|
||||
vt1 = vtrn2q_f32( v8, v9 );
|
||||
vt2 = vtrn1q_f32( v10, v11 );
|
||||
vt3 = vtrn2q_f32( v10, v11 );
|
||||
v8 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v9 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
v10 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v11 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v4 );
|
||||
vstrq_p128( p_loc + 8, (poly128_t)v8 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v5 );
|
||||
vstrq_p128( p_loc + 8, (poly128_t)v9 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v2 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v6 );
|
||||
vstrq_p128( p_loc + 8, (poly128_t)v10 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v3 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v7 );
|
||||
vstrq_p128( p_loc + 8, (poly128_t)v11 );
|
||||
p_loc += ldp;
|
||||
a_loc += 4 * lda; // 4;
|
||||
}
|
||||
for ( ; k_left > 0; --k_left )
|
||||
{
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 );
|
||||
v2 = vld1q_lane_f32( a_loc + inca * 8 , v2, 0 );
|
||||
v2 = vld1q_lane_f32( a_loc + inca * 9 , v2, 1 );
|
||||
v2 = vld1q_lane_f32( a_loc + inca * 10, v2, 2 );
|
||||
v2 = vld1q_lane_f32( a_loc + inca * 11, v2, 3 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f32( v0, vkappa );
|
||||
v1 = vmulq_f32( v1, vkappa );
|
||||
v2 = vmulq_f32( v2, vkappa );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 8, (poly128_t)v2 );
|
||||
p_loc += ldp;
|
||||
a_loc += lda; // 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if ( cdim0 < mnr || gs )
|
||||
{
|
||||
PASTEMAC(sscal2m,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
0,
|
||||
BLIS_NONUNIT_DIAG,
|
||||
BLIS_DENSE,
|
||||
( trans_t )conja,
|
||||
cdim0,
|
||||
k0,
|
||||
kappa,
|
||||
a, inca0, lda0,
|
||||
p, 1, ldp0,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if ( cdim0 < mnr )
|
||||
{
|
||||
// Handle zero-filling along the "long" edge of the micropanel.
|
||||
|
||||
const dim_t i = cdim0;
|
||||
const dim_t m_edge = mnr - cdim0;
|
||||
const dim_t n_edge = k0_max;
|
||||
float* restrict p_edge = p + (i )*1;
|
||||
|
||||
bli_sset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if ( k0 < k0_max )
|
||||
{
|
||||
// Handle zero-filling along the "short" (far) edge of the micropanel.
|
||||
|
||||
const dim_t j = k0;
|
||||
const dim_t m_edge = mnr;
|
||||
const dim_t n_edge = k0_max - k0;
|
||||
float* restrict p_edge = p + (j )*ldp;
|
||||
|
||||
bli_sset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
373
kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
Normal file
373
kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c
Normal file
@@ -0,0 +1,373 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Linaro Limited
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
#if defined(__clang__)
|
||||
#define PRAGMA_NOUNROLL _Pragma("nounroll")
|
||||
#define PRAGMA_UNROLL_4 _Pragma("unroll 4")
|
||||
#elif define(__GNUC__)
|
||||
#define PRAGMA_NOUNROLL _Pragma("GCC nounroll")
|
||||
#define PRAGMA_UNROLL_4 _Pragma("GCC unroll 4")
|
||||
#else
|
||||
#define PRAGMA_NOUNROLL
|
||||
#define PRAGMA_UNROLL_4
|
||||
#endif
|
||||
|
||||
void bli_spackm_armv8a_int_8xk
|
||||
(
|
||||
conj_t conja,
|
||||
pack_t schema,
|
||||
dim_t cdim0,
|
||||
dim_t k0,
|
||||
dim_t k0_max,
|
||||
float* restrict kappa,
|
||||
float* restrict a, inc_t inca0, inc_t lda0,
|
||||
float* restrict p, inc_t ldp0,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
// This is the panel dimension assumed by the packm kernel.
|
||||
const dim_t mnr = 8;
|
||||
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
uint64_t k_iter = k0 / 4;
|
||||
uint64_t k_left = k0 % 4;
|
||||
float* a_loc = a;
|
||||
float* p_loc = p;
|
||||
|
||||
// NOTE: For the purposes of the comments in this packm kernel, we
|
||||
// interpret inca and lda as rs_a and cs_a, respectively, and similarly
|
||||
// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
|
||||
// this packm kernel, you should think of the operation as packing an
|
||||
// m x n micropanel, where m and n are tiny and large, respectively, and
|
||||
// where elements of each column of the packed matrix P are contiguous.
|
||||
// (This packm kernel can still be used to pack micropanels of matrix B
|
||||
// in a gemm operation.)
|
||||
const uint64_t inca = inca0;
|
||||
const uint64_t lda = lda0;
|
||||
const uint64_t ldp = ldp0;
|
||||
|
||||
const bool gs = ( inca0 != 1 && lda0 != 1 );
|
||||
|
||||
// NOTE: If/when this kernel ever supports scaling by kappa within the
|
||||
// assembly region, this constraint should be lifted.
|
||||
const bool unitk = bli_seq1( *kappa );
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
if ( cdim0 == mnr && !gs )
|
||||
{
|
||||
if ( unitk )
|
||||
{
|
||||
if ( inca == 1 )
|
||||
{
|
||||
// No need to use k-loops here.
|
||||
// Simply let compiler to expand loops.
|
||||
PRAGMA_UNROLL_4
|
||||
for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik )
|
||||
{
|
||||
poly128_t v0 = vldrq_p128( a_loc + 0 );
|
||||
poly128_t v1 = vldrq_p128( a_loc + 4 );
|
||||
|
||||
vstrq_p128( p_loc + 0, v0 );
|
||||
vstrq_p128( p_loc + 4, v1 );
|
||||
|
||||
a_loc += lda;
|
||||
p_loc += ldp;
|
||||
}
|
||||
}
|
||||
else // if ( lda == 1 )
|
||||
{
|
||||
float32x4_t v0 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v1 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v2 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v3 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v4 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v5 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v6 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v7 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t vt0;
|
||||
float32x4_t vt1;
|
||||
float32x4_t vt2;
|
||||
float32x4_t vt3;
|
||||
|
||||
PRAGMA_NOUNROLL
|
||||
for ( ; k_iter > 0; --k_iter )
|
||||
{
|
||||
v0 = (float32x4_t)vldrq_p128( a_loc + inca * 0 );
|
||||
v1 = (float32x4_t)vldrq_p128( a_loc + inca * 1 );
|
||||
v2 = (float32x4_t)vldrq_p128( a_loc + inca * 2 );
|
||||
v3 = (float32x4_t)vldrq_p128( a_loc + inca * 3 );
|
||||
v4 = (float32x4_t)vldrq_p128( a_loc + inca * 4 );
|
||||
v5 = (float32x4_t)vldrq_p128( a_loc + inca * 5 );
|
||||
v6 = (float32x4_t)vldrq_p128( a_loc + inca * 6 );
|
||||
v7 = (float32x4_t)vldrq_p128( a_loc + inca * 7 );
|
||||
|
||||
// In-register transpose.
|
||||
//
|
||||
// Column 0-3
|
||||
vt0 = vtrn1q_f32( v0, v1 );
|
||||
vt1 = vtrn2q_f32( v0, v1 );
|
||||
vt2 = vtrn1q_f32( v2, v3 );
|
||||
vt3 = vtrn2q_f32( v2, v3 );
|
||||
v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
// Column 4-7
|
||||
vt0 = vtrn1q_f32( v4, v5 );
|
||||
vt1 = vtrn2q_f32( v4, v5 );
|
||||
vt2 = vtrn1q_f32( v6, v7 );
|
||||
vt3 = vtrn2q_f32( v6, v7 );
|
||||
v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v4 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v5 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v2 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v6 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v3 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v7 );
|
||||
p_loc += ldp;
|
||||
a_loc += 4 * lda; // 4;
|
||||
}
|
||||
for ( ; k_left > 0; --k_left )
|
||||
{
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v1 );
|
||||
p_loc += ldp;
|
||||
a_loc += lda; // 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if ( !unitk )
|
||||
{
|
||||
float32x4_t vkappa = vld1q_dup_f32( kappa );
|
||||
|
||||
if ( inca == 1 )
|
||||
{
|
||||
// No need to use k-loops here.
|
||||
// Simply let compiler to expand loops.
|
||||
PRAGMA_UNROLL_4
|
||||
for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik )
|
||||
{
|
||||
float32x4_t v0 = (float32x4_t)vldrq_p128( a_loc + 0 );
|
||||
float32x4_t v1 = (float32x4_t)vldrq_p128( a_loc + 4 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f32( v0, vkappa );
|
||||
v1 = vmulq_f32( v1, vkappa );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v1 );
|
||||
|
||||
a_loc += lda;
|
||||
p_loc += ldp;
|
||||
}
|
||||
}
|
||||
else // if ( lda == 1 )
|
||||
{
|
||||
float32x4_t v0 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v1 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v2 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v3 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v4 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v5 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v6 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t v7 = (float32x4_t)vdupq_n_u32( 0 );
|
||||
float32x4_t vt0;
|
||||
float32x4_t vt1;
|
||||
float32x4_t vt2;
|
||||
float32x4_t vt3;
|
||||
|
||||
PRAGMA_NOUNROLL
|
||||
for ( ; k_iter > 0; --k_iter )
|
||||
{
|
||||
v0 = (float32x4_t)vldrq_p128( a_loc + inca * 0 );
|
||||
v1 = (float32x4_t)vldrq_p128( a_loc + inca * 1 );
|
||||
v2 = (float32x4_t)vldrq_p128( a_loc + inca * 2 );
|
||||
v3 = (float32x4_t)vldrq_p128( a_loc + inca * 3 );
|
||||
v4 = (float32x4_t)vldrq_p128( a_loc + inca * 4 );
|
||||
v5 = (float32x4_t)vldrq_p128( a_loc + inca * 5 );
|
||||
v6 = (float32x4_t)vldrq_p128( a_loc + inca * 6 );
|
||||
v7 = (float32x4_t)vldrq_p128( a_loc + inca * 7 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f32( v0, vkappa );
|
||||
v1 = vmulq_f32( v1, vkappa );
|
||||
v2 = vmulq_f32( v2, vkappa );
|
||||
v3 = vmulq_f32( v3, vkappa );
|
||||
v4 = vmulq_f32( v4, vkappa );
|
||||
v5 = vmulq_f32( v5, vkappa );
|
||||
v6 = vmulq_f32( v6, vkappa );
|
||||
v7 = vmulq_f32( v7, vkappa );
|
||||
|
||||
// In-register transpose.
|
||||
//
|
||||
// Column 0-3
|
||||
vt0 = vtrn1q_f32( v0, v1 );
|
||||
vt1 = vtrn2q_f32( v0, v1 );
|
||||
vt2 = vtrn1q_f32( v2, v3 );
|
||||
vt3 = vtrn2q_f32( v2, v3 );
|
||||
v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
// Column 4-7
|
||||
vt0 = vtrn1q_f32( v4, v5 );
|
||||
vt1 = vtrn2q_f32( v4, v5 );
|
||||
vt2 = vtrn1q_f32( v6, v7 );
|
||||
vt3 = vtrn2q_f32( v6, v7 );
|
||||
v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 );
|
||||
v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v4 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v1 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v5 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v2 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v6 );
|
||||
p_loc += ldp;
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v3 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v7 );
|
||||
p_loc += ldp;
|
||||
a_loc += 4 * lda; // 4;
|
||||
}
|
||||
for ( ; k_left > 0; --k_left )
|
||||
{
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 );
|
||||
v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 );
|
||||
v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 );
|
||||
|
||||
// Scale by kappa.
|
||||
v0 = vmulq_f32( v0, vkappa );
|
||||
v1 = vmulq_f32( v1, vkappa );
|
||||
|
||||
vstrq_p128( p_loc + 0, (poly128_t)v0 );
|
||||
vstrq_p128( p_loc + 4, (poly128_t)v1 );
|
||||
p_loc += ldp;
|
||||
a_loc += lda; // 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if ( cdim0 < mnr || gs )
|
||||
{
|
||||
PASTEMAC(sscal2m,BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
0,
|
||||
BLIS_NONUNIT_DIAG,
|
||||
BLIS_DENSE,
|
||||
( trans_t )conja,
|
||||
cdim0,
|
||||
k0,
|
||||
kappa,
|
||||
a, inca0, lda0,
|
||||
p, 1, ldp0,
|
||||
cntx,
|
||||
NULL
|
||||
);
|
||||
|
||||
if ( cdim0 < mnr )
|
||||
{
|
||||
// Handle zero-filling along the "long" edge of the micropanel.
|
||||
|
||||
const dim_t i = cdim0;
|
||||
const dim_t m_edge = mnr - cdim0;
|
||||
const dim_t n_edge = k0_max;
|
||||
float* restrict p_edge = p + (i )*1;
|
||||
|
||||
bli_sset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if ( k0 < k0_max )
|
||||
{
|
||||
// Handle zero-filling along the "short" (far) edge of the micropanel.
|
||||
|
||||
const dim_t j = k0;
|
||||
const dim_t m_edge = mnr;
|
||||
const dim_t n_edge = k0_max - k0;
|
||||
float* restrict p_edge = p + (j )*ldp;
|
||||
|
||||
bli_sset0s_mxn
|
||||
(
|
||||
m_edge,
|
||||
n_edge,
|
||||
p_edge, 1, ldp
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,6 +32,11 @@
|
||||
|
||||
*/
|
||||
|
||||
PACKM_KER_PROT( float, s, packm_armv8a_int_8xk )
|
||||
PACKM_KER_PROT( float, s, packm_armv8a_int_12xk )
|
||||
PACKM_KER_PROT( double, d, packm_armv8a_int_6xk )
|
||||
PACKM_KER_PROT( double, d, packm_armv8a_int_8xk )
|
||||
|
||||
GEMM_UKR_PROT( float, s, gemm_armv8a_asm_8x12 )
|
||||
GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8 )
|
||||
GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8r )
|
||||
|
||||
Reference in New Issue
Block a user