Generalized ref kernels' pragma omp simd usage.

Details:
- Replaced direct usage of _Pragma( "omp simd" ) in reference kernels
  with PRAGMA_SIMD, a macro defined in a new bli_pragma_macro_defs.h
  file according to the compiler being used. That definition is cleared
  (i.e., expands to nothing) when BLIS detects that the -fopenmp-simd
  command line option is unsupported. Thanks to Devin Matthews and Jeff
  Hammond for suggestions that guided this commit.
- Updated configure and bli_config.h.in so that the appropriate anchor
  is substituted in (when the corresponding pragma omp simd support is
  present).
This commit is contained in:
Field G. Van Zee
2019-02-12 16:01:28 -06:00
parent b1f5ce8622
commit 6b83273126
25 changed files with 150 additions and 54 deletions

View File

@@ -141,6 +141,12 @@
#define BLIS_DISABLE_MEMKIND
#endif
// Pragma omp simd support: configure replaces the @enable_pragma_omp_simd@
// anchor with 1 or 0, so exactly one of the two macros below gets defined.
#if @enable_pragma_omp_simd@
#define BLIS_ENABLE_PRAGMA_OMP_SIMD
#else
#define BLIS_DISABLE_PRAGMA_OMP_SIMD
#endif
#if @enable_sandbox@
#define BLIS_ENABLE_SANDBOX
#else

3
configure vendored
View File

@@ -2668,8 +2668,10 @@ main()
fi
if [ "x${pragma_omp_simd}" = "xyes" ]; then
echo "${script_name}: compiler appears to support #pragma omp simd."
enable_pragma_omp_simd_01=1
else
echo "${script_name}: compiler appears to not support #pragma omp simd."
enable_pragma_omp_simd_01=0
fi
if [ "x${enable_blas}" = "xyes" ]; then
echo "${script_name}: the BLAS compatibility layer is enabled."
@@ -2909,6 +2911,7 @@ main()
| sed -e "s/@enable_mixed_dt@/${enable_mixed_dt_01}/g" \
| sed -e "s/@enable_mixed_dt_extra_mem@/${enable_mixed_dt_extra_mem_01}/g" \
| sed -e "s/@enable_memkind@/${enable_memkind_01}/g" \
| sed -e "s/@enable_pragma_omp_simd@/${enable_pragma_omp_simd_01}/g" \
| sed -e "s/@enable_sandbox@/${enable_sandbox_01}/g" \
| sed -e "s/@enable_shared@/${enable_shared_01}/g" \
> "${bli_config_h_out_path}"

View File

@@ -0,0 +1,82 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
NOTE: The following code is based on [1].
[1] https://github.com/jeffhammond/nwchem-tce-triples-kernels/blob/master/src/pragma_vendor.h
*/
#ifndef BLIS_PRAGMA_MACRO_DEFS_H
#define BLIS_PRAGMA_MACRO_DEFS_H

// PRAGMA_OMP_SIMD expands to _Pragma("omp simd") when
// BLIS_ENABLE_PRAGMA_OMP_SIMD is defined, and to nothing otherwise.
#ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD
  #define PRAGMA_OMP_SIMD _Pragma("omp simd")
#else
  #define PRAGMA_OMP_SIMD
#endif

// PRAGMA_SIMD is the macro that code places in front of loops it wants
// vectorized. The _Pragma() operator requires ISO C99 or later, so the
// compiler-specific definitions are only considered in that case. The
// defined() test keeps the check quiet under -Wundef when the macro
// __STDC_VERSION__ is not provided at all.
#if defined(__STDC_VERSION__) && ( __STDC_VERSION__ >= 199901L )

  // Stringizes its argument and hands it to the _Pragma() operator.
  #define GEN_PRAGMA(x) _Pragma(#x)

  #if defined(__ICC) || defined(__INTEL_COMPILER)

    // Intel icc. (The vendor-specific alternative is kept for reference.)
    //#define PRAGMA_SIMD GEN_PRAGMA(simd)
    #define PRAGMA_SIMD PRAGMA_OMP_SIMD

  #elif defined(__clang__)

    // clang/llvm.
    #define PRAGMA_SIMD PRAGMA_OMP_SIMD

  #elif defined(__GNUC__)

    // GNU gcc.
    #define PRAGMA_SIMD PRAGMA_OMP_SIMD

  #else

    // Unknown compiler: request nothing.
    #define PRAGMA_SIMD

  #endif

#else

  // No C99 support detected: _Pragma() cannot be relied upon, so make
  // PRAGMA_SIMD an empty macro. Leaving it undefined would turn every use
  // of PRAGMA_SIMD into a syntax error instead of a harmless no-op.
  #define PRAGMA_SIMD

#endif

#endif

View File

@@ -66,6 +66,11 @@ extern "C" {
#include "bli_macro_defs.h"
// -- pragma definitions --
#include "bli_pragma_macro_defs.h"
// -- Threading definitions --
#include "bli_thread.h"

View File

@@ -55,7 +55,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,addjs)( chi1[i], psi1[i] ); \
@@ -76,7 +76,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,adds)( chi1[i], psi1[i] ); \

View File

@@ -203,7 +203,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpbyjs)( *alpha, x[i], *beta, y[i] ); \
@@ -224,7 +224,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpbys)( *alpha, x[i], *beta, y[i] ); \

View File

@@ -78,7 +78,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
/*PASTEMAC(ch,axpyjs)( *alpha, chi1[i], psi1[i] );*/ \
@@ -100,7 +100,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
/*PASTEMAC(ch,axpys)( *alpha, chi1[i], psi1[i] );*/ \
@@ -165,7 +165,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpyjs)( *alpha, x[i], y[i] ); \
@@ -186,7 +186,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpys)( *alpha, x[i], y[i] ); \

View File

@@ -52,7 +52,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copyjs)( x[i], y[i] ); \
@@ -73,7 +73,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copys)( x[i], y[i] ); \

View File

@@ -70,7 +70,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \
@@ -91,7 +91,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \

View File

@@ -79,7 +79,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \
@@ -100,7 +100,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \

View File

@@ -48,7 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
if ( incx == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,inverts)( x[i] ); \

View File

@@ -92,7 +92,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,scal2js)( *alpha, x[i], y[i] ); \
@@ -113,7 +113,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, x[i], y[i] ); \

View File

@@ -77,7 +77,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
if ( incx == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,scals)( alpha_conj, x[i] ); \

View File

@@ -52,7 +52,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,set0s)( x[i] ); \
@@ -76,7 +76,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
if ( incx == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copys)( alpha_conj, x[i] ); \

View File

@@ -52,7 +52,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,subjs)( x[i], y[i] ); \
@@ -73,7 +73,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,subs)( x[i], y[i] ); \

View File

@@ -49,7 +49,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,swaps)( x[i], y[i] ); \

View File

@@ -88,7 +88,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,xpbyjs)( x[i], *beta, y[i] ); \
@@ -109,7 +109,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,xpbys)( x[i], *beta, y[i] ); \

View File

@@ -61,7 +61,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( bli_is_noconj( conjy ) ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpys)( *alphax, x[i], z[i] ); \
@@ -70,7 +70,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else /* if ( bli_is_conj( conjy ) ) */ \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpys)( *alphax, x[i], z[i] ); \
@@ -83,7 +83,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( bli_is_noconj( conjy ) ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copyjs)( x[i], chic ); \
@@ -93,7 +93,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else /* if ( bli_is_conj( conjy ) ) */ \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copyjs)( x[i], chic ); \

View File

@@ -60,13 +60,13 @@ void PASTEMAC3(ch,opname,arch,suf) \
/* Scale x by alpha, storing to a temporary array ax. */ \
if ( bli_is_conj( conjx ) ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t j = 0; j < ff; ++j ) \
PASTEMAC(ch,scal2js)( *alpha, x[j], ax[j] ); \
} \
else \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t j = 0; j < ff; ++j ) \
PASTEMAC(ch,scal2s)( *alpha, x[j], ax[j] ); \
} \
@@ -74,7 +74,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
/* Accumulate ff separate axpyv's into y. */ \
if ( bli_is_noconj( conja ) ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < m; ++i ) \
for ( dim_t j = 0; j < ff; ++j ) \
{ \
@@ -83,7 +83,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < m; ++i ) \
for ( dim_t j = 0; j < ff; ++j ) \
{ \

View File

@@ -68,7 +68,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
if ( bli_is_noconj( conjxt_use ) ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \
@@ -77,7 +77,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else /* bli_is_conj( conjxt_use ) ) */ \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \
@@ -102,7 +102,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
if ( bli_is_noconj( conjxt_use ) ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \
@@ -111,7 +111,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else /* bli_is_conj( conjxt_use ) ) */ \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \

View File

@@ -85,13 +85,13 @@ void PASTEMAC3(ch,opname,arch,suf) \
/* Scale x by alpha, storing to a temporary array ax. */ \
if ( bli_is_conj( conjx ) ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < ff; ++i ) \
PASTEMAC(ch,scal2js)( *alpha, x[i], ax[i] ); \
} \
else \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < ff; ++i ) \
PASTEMAC(ch,scal2s)( *alpha, x[i], ax[i] ); \
} \
@@ -108,7 +108,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( bli_is_noconj( conja ) ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \
@@ -118,7 +118,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \
@@ -131,7 +131,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( bli_is_noconj( conja ) ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \
@@ -141,7 +141,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \

View File

@@ -82,7 +82,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
if ( bli_is_noconj( conjx_use ) ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \
@@ -91,7 +91,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \

View File

@@ -65,7 +65,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
\
/* Initialize the accumulator elements in ab to zero. */ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < mr * nr; ++i ) \
{ \
PASTEMAC(ch,set0s)( ab[ i ] ); \
@@ -76,7 +76,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
for ( dim_t i = 0; i < mr; ++i ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t j = 0; j < nr; ++j ) \
{ \
PASTEMAC(ch,dots) \
@@ -93,7 +93,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
\
/* Scale the result in ab by alpha. */ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < mr * nr; ++i ) \
{ \
PASTEMAC(ch,scals)( *alpha, ab[ i ] ); \

View File

@@ -57,7 +57,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
const inc_t rs_b = nr; \
const inc_t cs_b = 1; \
\
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < mr; ++i ) \
{ \
/* b1 = b1 - a10t * B0; */ \
@@ -117,7 +117,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
const inc_t rs_b = nr; \
const inc_t cs_b = 1; \
\
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t iter = 0; iter < mr; ++iter ) \
{ \
dim_t i = mr - iter - 1; \

View File

@@ -60,7 +60,7 @@ void PASTEMAC4(ch,opname,arch,_simd,suf) \
\
\
/* Initialize the accumulator elements in ab to zero. */ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < mr * nr; ++i ) \
{ \
PASTEMAC(ch,set0s)( ab[ i ] ); \
@@ -76,7 +76,7 @@ void PASTEMAC4(ch,opname,arch,_simd,suf) \
{ \
for ( dim_t i = 0; i < mr; ++i ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t j = 0; j < nr; ++j ) \
{ \
PASTEMAC(ch,dots) \
@@ -103,7 +103,7 @@ void PASTEMAC4(ch,opname,arch,_simd,suf) \
{ \
for ( dim_t i = 0; i < mr; ++i ) \
{ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t j = 0; j < nr; ++j ) \
{ \
PASTEMAC(ch,dots) \
@@ -120,7 +120,7 @@ void PASTEMAC4(ch,opname,arch,_simd,suf) \
} \
\
/* Scale the result in ab by alpha. */ \
_Pragma( "omp simd" ) \
PRAGMA_SIMD \
for ( dim_t i = 0; i < mr * nr; ++i ) \
{ \
PASTEMAC(ch,scals)( *alpha, ab[ i ] ); \