Arm micro-architecture dispatch (#344)

Details:
- Reworked support for ARM hardware detection in bli_cpuid.c to parse 
  the result of a CPUID-like instruction.
- Added a64fx support to bli_gks.c.
- #include arm64 and arm32 family headers from bli_arch_config.h.
- Fix the ordering of the "armsve" and "a64fx" strings in the 
  config_name string array in bli_arch.c. The ordering did not match
  the ordering of the corresponding arch_t values in bli_type_defs.h,
  as it should have all along.
- Added clang support to make_defs.mk in arm64, cortexa53, cortexa57 
  subconfigs.
- Updated arm64 and arm32 families in config_registry.
- Updated docs/HardwareSupport.md to reflect added ARM support.
- Thanks to Dave Love, RuQing Xu, and Devin Matthews for their
  contributions in this PR (#344).
This commit is contained in:
Dave Love
2021-10-04 18:03:04 +00:00
committed by GitHub
parent 0a45bc0fbc
commit d0a0b4b841
10 changed files with 277 additions and 112 deletions

View File

@@ -65,7 +65,11 @@ CKOPTFLAGS := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -march=armv8-a
else
$(error gcc is required for this configuration.)
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -march=armv8-a
else
$(error gcc or clang is required for this configuration.)
endif
endif
# Flags specific to reference kernels.

View File

@@ -65,7 +65,11 @@ CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mcpu=cortex-a53
else
$(error gcc is required for this configuration.)
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mcpu=cortex-a53
else
$(error gcc or clang is required for this configuration.)
endif
endif
# Flags specific to reference kernels.

View File

@@ -65,7 +65,11 @@ CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mcpu=cortex-a57
else
$(error gcc is required for this configuration.)
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mcpu=cortex-a57
else
$(error gcc or clang is required for this configuration.)
endif
endif
# Flags specific to reference kernels.

View File

@@ -11,10 +11,8 @@
x86_64: intel64 amd64
intel64: skx knl haswell sandybridge penryn generic
amd64: zen2 zen excavator steamroller piledriver bulldozer generic
# NOTE: ARM families will remain disabled until runtime hardware detection
# logic is added to BLIS.
#arm64: cortexa57 generic
#arm32: cortexa15 cortexa9 generic
arm64: thunderx2 cortexa57 cortexa53 generic
arm32: cortexa15 cortexa9 generic
# Intel architectures.
skx: skx/skx/haswell/zen

2
configure vendored
View File

@@ -1505,6 +1505,8 @@ check_compiler()
echo "${script_name}: checking for blacklisted configurations due to ${cc} ${cc_version}."
# Fixme: check on a64fx, neoverse, and others
# gcc
if [ "x${cc_vendor}" = "xgcc" ]; then

View File

@@ -24,7 +24,7 @@ A few remarks / reminders:
| AMD Steamroller (AVX/FMA3) | `steamroller` | `sdcz` | |
| AMD Excavator (AVX/FMA3) | `excavator` | `sdcz` | |
| AMD Zen (AVX/FMA3) | `zen` | `sdcz` | `sd` |
| Intel Core2 (SSE3) | `penryn` | `sd` | `d` |
| Intel Core2 (SSE3) | `penryn` | `sd` | `d` |
| Intel Sandy/Ivy Bridge (AVX/FMA3) | `sandybridge` | `sdcz` | |
| Intel Haswell, Broadwell (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
| Intel Sky/Kaby/CoffeeLake (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
@@ -35,6 +35,8 @@ A few remarks / reminders:
| ARMv7 Cortex-A15 (NEON) | `cortex-a15` | `sd` | |
| ARMv8 Cortex-A53 (NEON) | `cortex-a53` | `sd` | |
| ARMv8 Cortex-A57 (NEON) | `cortex-a57` | `sd` | |
| ARMv8.1 ThunderX2 (NEON) | `thunderx2` | `sd` | |
| ARMv8.1 A64FX (SVE) | `a64fx` | `d` | |
| IBM Blue Gene/Q (QPX int) | `bgq` | `d` | |
| IBM Power7 (QPX int) | `power7` | `d` | |
| template (C99) | `template` | `sdcz` | `sdcz` |

View File

@@ -263,11 +263,11 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
"piledriver",
"bulldozer",
"armsve",
"a64fx",
"thunderx2",
"cortexa57",
"cortexa53",
"armsve",
"a64fx",
"cortexa15",
"cortexa9",

View File

@@ -6,7 +6,6 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
Copyright (C) 2019, Dave Love, University of Manchester
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -454,9 +453,6 @@ arch_t bli_cpuid_query_id( void )
{
uint32_t vendor, model, part, features;
// Call the CPUID instruction and parse its results into a model id,
// part id, and a feature bit field. The return value encodes the
// vendor.
vendor = bli_cpuid_query( &model, &part, &features );
#if 0
@@ -472,24 +468,9 @@ arch_t bli_cpuid_query_id( void )
{
if ( model == MODEL_ARMV8 )
{
return part;
// Check for each ARMv8 configuration that is enabled, check for that
// microarchitecture. We check from most recent to most dated.
#ifdef BLIS_CONFIG_ARMSVE
if ( bli_cpuid_is_armsve( model, part, features ) )
return BLIS_ARCH_ARMSVE;
#endif
#ifdef BLIS_CONFIG_A64FX
if ( bli_cpuid_is_a64fx( model, part, features ) )
return BLIS_ARCH_A64FX;
#endif
#ifdef BLIS_CONFIG_THUNDERX2
if ( bli_cpuid_is_thunderx2( model, part, features ) )
return BLIS_ARCH_THUNDERX2;
#endif
#ifdef BLIS_CONFIG_CORTEXA57
if ( bli_cpuid_is_cortexa57( model, part, features ) )
return BLIS_ARCH_CORTEXA57;
#endif
// If none of the other sub-configurations were detected, return
// the 'generic' arch_t id value.
return BLIS_ARCH_GENERIC;
@@ -519,81 +500,6 @@ arch_t bli_cpuid_query_id( void )
return BLIS_ARCH_GENERIC;
}
// Feature test used when probing for the thunderx2 sub-configuration.
// Only `features` is inspected; `family` and `model` are accepted but not
// read. Returns TRUE when bli_cpuid_has_features() accepts `features`
// against the required mask (FEATURE_NEON), and FALSE otherwise.
bool bli_cpuid_is_thunderx2
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
	// NEON is the only capability this check asks for.
	const uint32_t required = FEATURE_NEON;

	return ( bli_cpuid_has_features( features, required ) ? TRUE : FALSE );
}
// Feature test used when probing for the cortexa57 sub-configuration.
// Only `features` is inspected; `family` and `model` are accepted but not
// read. Returns TRUE when bli_cpuid_has_features() accepts `features`
// against the required mask (FEATURE_NEON), and FALSE otherwise.
bool bli_cpuid_is_cortexa57
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
	// NEON is the only capability this check asks for.
	const uint32_t required = FEATURE_NEON;

	return ( bli_cpuid_has_features( features, required ) ? TRUE : FALSE );
}
// Feature test for the cortexa53 sub-configuration.
// Only `features` is inspected; `family` and `model` are accepted but not
// read. Returns TRUE when bli_cpuid_has_features() accepts `features`
// against the required mask (FEATURE_NEON), and FALSE otherwise.
bool bli_cpuid_is_cortexa53
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
	// NEON is the only capability this check asks for.
	const uint32_t required = FEATURE_NEON;

	return ( bli_cpuid_has_features( features, required ) ? TRUE : FALSE );
}
// Feature test used when probing for the armsve sub-configuration.
// Only `features` is inspected; `family` and `model` are accepted but not
// read. Returns TRUE when bli_cpuid_has_features() accepts `features`
// against the required mask (FEATURE_SVE), and FALSE otherwise.
bool bli_cpuid_is_armsve
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
	// SVE is the only capability this check asks for.
	const uint32_t required = FEATURE_SVE;

	return ( bli_cpuid_has_features( features, required ) ? TRUE : FALSE );
}
// Feature test used when probing for the a64fx sub-configuration.
// Only `features` is inspected; `family` and `model` are accepted but not
// read. Returns TRUE when bli_cpuid_has_features() accepts `features`
// against the required mask (FEATURE_SVE), and FALSE otherwise.
bool bli_cpuid_is_a64fx
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
	// SVE is the only capability this check asks for.
	const uint32_t required = FEATURE_SVE;

	return ( bli_cpuid_has_features( features, required ) ? TRUE : FALSE );
}
bool bli_cpuid_is_cortexa15
(
uint32_t family,
@@ -604,9 +510,7 @@ bool bli_cpuid_is_cortexa15
// Check for expected CPU features.
const uint32_t expected = FEATURE_NEON;
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
return TRUE;
return bli_cpuid_has_features( features, expected ) && model == 0xc0f;
}
bool bli_cpuid_is_cortexa9
@@ -619,9 +523,7 @@ bool bli_cpuid_is_cortexa9
// Check for expected CPU features.
const uint32_t expected = FEATURE_NEON;
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
return TRUE;
return bli_cpuid_has_features( features, expected ) && model == 0xc09;
}
#endif
@@ -1042,7 +944,243 @@ int vpu_count( void )
}
}
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
#elif defined(__aarch64__)
#ifdef __linux__
// This is adapted from OpenBLAS. See
// https://www.kernel.org/doc/html/latest/arm64/cpu-feature-registers.html
// for the mechanism, but not the magic numbers.
// Fixme: Could these be missing in older Linux?
#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11)
#endif
/* From https://www.kernel.org/doc/html/latest/arm64/sve.html and the
aarch64 hwcap.h */
#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)
#endif
/* Maybe also for AT_HWCAP2:
#define HWCAP2_SVE2 (1 << 1)
et al.
*/
#endif //__linux__
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
// Identify the running AArch64 core and map it to a BLIS arch id.
//
// features: output, always written. It starts as FEATURE_NEON and gains
//           FEATURE_SVE when the Linux AT_HWCAP word has HWCAP_SVE set.
//
// Returns a BLIS_ARCH_* value, chosen in this order:
//   1. a match on (implementer, part) whose sub-configuration is enabled;
//   2. BLIS_ARCH_ARMSVE, if SVE was detected and that sub-configuration
//      is enabled;
//   3. the compile-time fallback: cortexa57, else cortexa53, else generic.
static uint32_t get_coretype
     (
       uint32_t* features
     )
{
	int implementer = 0x00, part = 0x000;

	// Declared outside the OS-specific sections: it is read again in the
	// BLIS_CONFIG_ARMSVE fallback near the end, which is compiled on every
	// OS. Only the Linux section can currently set it.
	bool has_sve = false;

	// Avoids an unused-variable warning in builds where no section below
	// ends up reading it.
	( void )has_sve;

	*features = FEATURE_NEON;

#ifdef __linux__
	// Read the hardware capability word once and test both bits on it.
	const unsigned long hwcap = getauxval( AT_HWCAP );

	if ( hwcap & HWCAP_CPUID )
	{
		// Also available from
		// /sys/devices/system/cpu/cpu0/regs/identification/midr_el1
		// and split out in /proc/cpuinfo (with a tab before the colon):
		// CPU part	: 0x0a1

		uint64_t midr_el1;
		__asm("mrs %0, MIDR_EL1" : "=r" (midr_el1));
		/*
		 * MIDR_EL1
		 *
		 * 31          24 23     20 19          16 15          4 3        0
		 * -----------------------------------------------------------------
		 * | Implementer | Variant | Architecture | Part Number | Revision |
		 * -----------------------------------------------------------------
		 */
		implementer = (midr_el1 >> 24) & 0xFF;
		part        = (midr_el1 >> 4)  & 0xFFF;
	}

	has_sve = ( hwcap & HWCAP_SVE ) != 0;
	if ( has_sve )
		*features |= FEATURE_SVE;
#endif //__linux__

#ifdef __APPLE__
	// Better values could be obtained from sysctlbyname()
	implementer = 0x61; //Apple
	part        = 0x023; //Firestorm
#endif //__APPLE__

	// From Linux arch/arm64/include/asm/cputype.h
	// ARM_CPU_IMP_ARM 0x41
	// ARM_CPU_IMP_APM 0x50
	// ARM_CPU_IMP_CAVIUM 0x43
	// ARM_CPU_IMP_BRCM 0x42
	// ARM_CPU_IMP_QCOM 0x51
	// ARM_CPU_IMP_NVIDIA 0x4E
	// ARM_CPU_IMP_FUJITSU 0x46
	// ARM_CPU_IMP_HISI 0x48
	// ARM_CPU_IMP_APPLE 0x61
	//
	// ARM_CPU_PART_AEM_V8 0xD0F
	// ARM_CPU_PART_FOUNDATION 0xD00
	// ARM_CPU_PART_CORTEX_A57 0xD07
	// ARM_CPU_PART_CORTEX_A72 0xD08
	// ARM_CPU_PART_CORTEX_A53 0xD03
	// ARM_CPU_PART_CORTEX_A73 0xD09
	// ARM_CPU_PART_CORTEX_A75 0xD0A
	// ARM_CPU_PART_CORTEX_A35 0xD04
	// ARM_CPU_PART_CORTEX_A55 0xD05
	// ARM_CPU_PART_CORTEX_A76 0xD0B
	// ARM_CPU_PART_NEOVERSE_N1 0xD0C
	// ARM_CPU_PART_CORTEX_A77 0xD0D
	// from GCC:
	// ARM_CPU_PART_CORTEX_A78 0xd41
	// ARM_CPU_PART_CORTEX_X1 0xd44
	// ARM_CPU_PART_CORTEX_V1 0xd40
	// ARM_CPU_PART_CORTEX_N2 0xd49
	// ARM_CPU_PART_CORTEX_R82 0xd15
	//
	// APM_CPU_PART_POTENZA 0x000
	//
	// CAVIUM_CPU_PART_THUNDERX 0x0A1
	// CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2
	// CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3
	// CAVIUM_CPU_PART_THUNDERX2 0x0AF
	// CAVIUM_CPU_PART_THUNDERX3 0x0B8 // taken from OpenBLAS
	//
	// BRCM_CPU_PART_BRAHMA_B53 0x100
	// BRCM_CPU_PART_VULCAN 0x516
	//
	// QCOM_CPU_PART_FALKOR_V1 0x800
	// QCOM_CPU_PART_FALKOR 0xC00
	// QCOM_CPU_PART_KRYO 0x200
	// QCOM_CPU_PART_KRYO_3XX_SILVER 0x803
	// QCOM_CPU_PART_KRYO_4XX_GOLD 0x804
	// QCOM_CPU_PART_KRYO_4XX_SILVER 0x805
	//
	// NVIDIA_CPU_PART_DENVER 0x003
	// NVIDIA_CPU_PART_CARMEL 0x004
	//
	// FUJITSU_CPU_PART_A64FX 0x001
	//
	// HISI_CPU_PART_TSV110 0xD01
	// APPLE_CPU_PART_M1_ICESTORM 0x022
	// APPLE_CPU_PART_M1_FIRESTORM 0x023

	// Fixme: After merging the vpu_count branch we could report the
	// part here with bli_dolog.

	// Each case below is compiled only when its sub-configuration is
	// enabled, so an inner switch may have no matching case; control then
	// falls through to the generic fallbacks after the outer switch.
	switch(implementer)
	{
		case 0x41: // ARM
			switch (part)
			{
#ifdef BLIS_CONFIG_CORTEXA57
				case 0xd07: // Cortex A57
					return BLIS_ARCH_CORTEXA57;
#endif
#ifdef BLIS_CONFIG_CORTEXA53
				case 0xd03: // Cortex A53
					return BLIS_ARCH_CORTEXA53;
#endif
#ifdef BLIS_CONFIG_THUNDERX2
				case 0xd0c: // Neoverse N1 (and Graviton G2?)
					return BLIS_ARCH_THUNDERX2; //placeholder for N1
#endif
				default:
					break;
			}
			break;
		case 0x42: // Broadcom
			switch (part)
			{
#ifdef BLIS_CONFIG_THUNDERX2
				case 0x516: // Vulcan
					return BLIS_ARCH_THUNDERX2;
#endif
				default:
					break;
			}
			break;
		case 0x43: // Cavium
			switch (part)
			{
#ifdef BLIS_CONFIG_THUNDERX2
				case 0x0af: // ThunderX2
				case 0x0b8: // ThunderX3
					return BLIS_ARCH_THUNDERX2;
#endif
				default:
					break;
			}
			break;
		case 0x46: // Fujitsu
			switch (part)
			{
#ifdef BLIS_CONFIG_A64FX
				case 0x001: // A64FX
					return BLIS_ARCH_A64FX;
#endif
				default:
					break;
			}
			break;
		case 0x61: // Apple
			switch (part)
			{
#ifdef BLIS_CONFIG_THUNDERX2
				case 0x022: // Icestorm (M1.LITTLE)
				case 0x023: // Firestorm (M1.big)
					return BLIS_ARCH_THUNDERX2; //placeholder for M1
#endif
				default:
					break;
			}
			break;
		default:
			break;
	}

#ifdef BLIS_CONFIG_ARMSVE
	// No specific core matched; any SVE-capable core can use armsve.
	if (has_sve)
		return BLIS_ARCH_ARMSVE;
#endif

// Can't use #if defined(...) here because of parsing done for autoconfiguration
#ifdef BLIS_CONFIG_CORTEXA57
	return BLIS_ARCH_CORTEXA57;
#else
#ifdef BLIS_CONFIG_CORTEXA53
	return BLIS_ARCH_CORTEXA53;
#else
	return BLIS_ARCH_GENERIC;
#endif
#endif
}
// AArch64 implementation of the CPUID-style query.
//
// model:    output; always set to MODEL_ARMV8.
// part:     output; receives the value returned by get_coretype(). That
//           function returns a BLIS_ARCH_* id, not a raw hardware part
//           number.
// features: output; filled in by get_coretype().
//
// Always returns VENDOR_ARM.
uint32_t bli_cpuid_query
     (
       uint32_t* model,
       uint32_t* part,
       uint32_t* features
     )
{
	*model = MODEL_ARMV8;

	// get_coretype() also writes the feature bit field through `features`.
	const uint32_t core_id = get_coretype( features );
	*part = core_id;

	return VENDOR_ARM;
}
#elif defined(__arm__) || defined(_M_ARM)
/*
I can't easily find documentation to do this as for aarch64, though
it presumably could be unearthed from Linux code. However, on
Linux 5.2 (and Android's 3.4), /proc/cpuinfo has this sort of
thing, used below:
CPU implementer : 0x41
CPU architecture: 7
CPU variant : 0x3
CPU part : 0xc09
The complication for family selection is that Neon is optional for
CortexA9, for instance. That's tested in bli_cpuid_is_cortexa9.
*/
#define TEMP_BUFFER_SIZE 200

View File

@@ -129,6 +129,11 @@ void bli_gks_init( void )
#endif
// ARM architectures
#ifdef BLIS_CONFIG_A64FX
bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx,
bli_cntx_init_a64fx_ref,
bli_cntx_init_a64fx_ind );
#endif
#ifdef BLIS_CONFIG_THUNDERX2
bli_gks_register_cntx( BLIS_ARCH_THUNDERX2, bli_cntx_init_thunderx2,
bli_cntx_init_thunderx2_ref,

View File

@@ -187,6 +187,14 @@ CNTX_INIT_PROTS( generic )
#include "bli_family_bulldozer.h"
#endif
// -- ARM families --
#ifdef BLIS_FAMILY_ARM64
#include "bli_family_arm64.h"
#endif
#ifdef BLIS_FAMILY_ARM32
#include "bli_family_arm32.h"
#endif
// -- ARM architectures --
#ifdef BLIS_FAMILY_ARMSVE