mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Arm micro-architecture dispatch (#344)
Details: - Reworked support for ARM hardware detection in bli_cpuid.c to parse the result of a CPUID-like instruction. - Added a64fx support to bli_gks.c. - #include arm64 and arm32 family headers from bli_arch_config.h. - Fix the ordering of the "armsve" and "a64fx" strings in the config_name string array in bli_arch.c. The ordering did not match the ordering of the corresponding arch_t values in bli_type_defs.h, as it should have all along. - Added clang support to make_defs.mk in arm64, cortexa53, cortexa57 subconfigs. - Updated arm64 and arm32 families in config_registry. - Updated docs/HardwareSupport.md to reflect added ARM support. - Thanks to Dave Love, RuQing Xu, and Devin Matthews for their contributions in this PR (#344).
This commit is contained in:
@@ -65,7 +65,11 @@ CKOPTFLAGS := $(COPTFLAGS) -O3
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -march=armv8-a
|
||||
else
|
||||
$(error gcc is required for this configuration.)
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CKVECFLAGS := -march=armv8-a
|
||||
else
|
||||
$(error gcc or clang is required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
|
||||
@@ -65,7 +65,11 @@ CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -mcpu=cortex-a53
|
||||
else
|
||||
$(error gcc is required for this configuration.)
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CKVECFLAGS := -mcpu=cortex-a53
|
||||
else
|
||||
$(error gcc or clang is required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
|
||||
@@ -65,7 +65,11 @@ CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -mcpu=cortex-a57
|
||||
else
|
||||
$(error gcc is required for this configuration.)
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CKVECFLAGS := -mcpu=cortex-a57
|
||||
else
|
||||
$(error gcc or clang is required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
|
||||
@@ -11,10 +11,8 @@
|
||||
x86_64: intel64 amd64
|
||||
intel64: skx knl haswell sandybridge penryn generic
|
||||
amd64: zen2 zen excavator steamroller piledriver bulldozer generic
|
||||
# NOTE: ARM families will remain disabled until runtime hardware detection
|
||||
# logic is added to BLIS.
|
||||
#arm64: cortexa57 generic
|
||||
#arm32: cortexa15 cortexa9 generic
|
||||
arm64: thunderx2 cortexa57 cortexa53 generic
|
||||
arm32: cortexa15 cortexa9 generic
|
||||
|
||||
# Intel architectures.
|
||||
skx: skx/skx/haswell/zen
|
||||
|
||||
2
configure
vendored
2
configure
vendored
@@ -1505,6 +1505,8 @@ check_compiler()
|
||||
|
||||
echo "${script_name}: checking for blacklisted configurations due to ${cc} ${cc_version}."
|
||||
|
||||
# Fixme: check on a64fx, neoverse, and others
|
||||
|
||||
# gcc
|
||||
if [ "x${cc_vendor}" = "xgcc" ]; then
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ A few remarks / reminders:
|
||||
| AMD Steamroller (AVX/FMA3) | `steamroller` | `sdcz` | |
|
||||
| AMD Excavator (AVX/FMA3) | `excavator` | `sdcz` | |
|
||||
| AMD Zen (AVX/FMA3) | `zen` | `sdcz` | `sd` |
|
||||
| Intel Core2 (SSE3) | `penryn` | `sd` | `d` |
|
||||
| Intel Core2 (SSE3) | `penryn` | `sd` | `d` |
|
||||
| Intel Sandy/Ivy Bridge (AVX/FMA3) | `sandybridge` | `sdcz` | |
|
||||
| Intel Haswell, Broadwell (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
|
||||
| Intel Sky/Kaby/CoffeeLake (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
|
||||
@@ -35,6 +35,8 @@ A few remarks / reminders:
|
||||
| ARMv7 Cortex-A15 (NEON) | `cortex-a15` | `sd` | |
|
||||
| ARMv8 Cortex-A53 (NEON) | `cortex-a53` | `sd` | |
|
||||
| ARMv8 Cortex-A57 (NEON) | `cortex-a57` | `sd` | |
|
||||
| ARMv8.1 ThunderX2 (NEON) | `thunderx2` | `sd` | |
|
||||
| ARMv8.1 A64FX (SVE) | `a64fx` | `d` | |
|
||||
| IBM Blue Gene/Q (QPX int) | `bgq` | `d` | |
|
||||
| IBM Power7 (QPX int) | `power7` | `d` | |
|
||||
| template (C99) | `template` | `sdcz` | `sdcz` |
|
||||
|
||||
@@ -263,11 +263,11 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
|
||||
"piledriver",
|
||||
"bulldozer",
|
||||
|
||||
"armsve",
|
||||
"a64fx",
|
||||
"thunderx2",
|
||||
"cortexa57",
|
||||
"cortexa53",
|
||||
"armsve",
|
||||
"a64fx",
|
||||
"cortexa15",
|
||||
"cortexa9",
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019, Dave Love, University of Manchester
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -454,9 +453,6 @@ arch_t bli_cpuid_query_id( void )
|
||||
{
|
||||
uint32_t vendor, model, part, features;
|
||||
|
||||
// Call the CPUID instruction and parse its results into a model id,
|
||||
// part id, and a feature bit field. The return value encodes the
|
||||
// vendor.
|
||||
vendor = bli_cpuid_query( &model, &part, &features );
|
||||
|
||||
#if 0
|
||||
@@ -472,24 +468,9 @@ arch_t bli_cpuid_query_id( void )
|
||||
{
|
||||
if ( model == MODEL_ARMV8 )
|
||||
{
|
||||
return part;
|
||||
// Check for each ARMv8 configuration that is enabled, check for that
|
||||
// microarchitecture. We check from most recent to most dated.
|
||||
#ifdef BLIS_CONFIG_ARMSVE
|
||||
if ( bli_cpuid_is_armsve( model, part, features ) )
|
||||
return BLIS_ARCH_ARMSVE;
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_A64FX
|
||||
if ( bli_cpuid_is_a64fx( model, part, features ) )
|
||||
return BLIS_ARCH_A64FX;
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_THUNDERX2
|
||||
if ( bli_cpuid_is_thunderx2( model, part, features ) )
|
||||
return BLIS_ARCH_THUNDERX2;
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_CORTEXA57
|
||||
if ( bli_cpuid_is_cortexa57( model, part, features ) )
|
||||
return BLIS_ARCH_CORTEXA57;
|
||||
#endif
|
||||
// If none of the other sub-configurations were detected, return
|
||||
// the 'generic' arch_t id value.
|
||||
return BLIS_ARCH_GENERIC;
|
||||
@@ -519,81 +500,6 @@ arch_t bli_cpuid_query_id( void )
|
||||
return BLIS_ARCH_GENERIC;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_thunderx2
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
uint32_t features
|
||||
)
|
||||
{
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_NEON;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_cortexa57
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
uint32_t features
|
||||
)
|
||||
{
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_NEON;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_cortexa53
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
uint32_t features
|
||||
)
|
||||
{
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_NEON;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_armsve
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
uint32_t features
|
||||
)
|
||||
{
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_SVE;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_a64fx
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
uint32_t features
|
||||
)
|
||||
{
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_SVE;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_cortexa15
|
||||
(
|
||||
uint32_t family,
|
||||
@@ -604,9 +510,7 @@ bool bli_cpuid_is_cortexa15
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_NEON;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
return bli_cpuid_has_features( features, expected ) && model == 0xc0f;
|
||||
}
|
||||
|
||||
bool bli_cpuid_is_cortexa9
|
||||
@@ -619,9 +523,7 @@ bool bli_cpuid_is_cortexa9
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_NEON;
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
|
||||
|
||||
return TRUE;
|
||||
return bli_cpuid_has_features( features, expected ) && model == 0xc09;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1042,7 +944,243 @@ int vpu_count( void )
|
||||
}
|
||||
}
|
||||
|
||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
|
||||
#elif defined(__aarch64__)
|
||||
|
||||
#ifdef __linux__
|
||||
// This is adapted from OpenBLAS. See
|
||||
// https://www.kernel.org/doc/html/latest/arm64/cpu-feature-registers.html
|
||||
// for the mechanism, but not the magic numbers.
|
||||
|
||||
// Fixme: Could these be missing in older Linux?
|
||||
#include <asm/hwcap.h>
|
||||
#include <sys/auxv.h>
|
||||
|
||||
#ifndef HWCAP_CPUID
|
||||
#define HWCAP_CPUID (1 << 11)
|
||||
#endif
|
||||
/* From https://www.kernel.org/doc/html/latest/arm64/sve.html and the
|
||||
aarch64 hwcap.h */
|
||||
#ifndef HWCAP_SVE
|
||||
#define HWCAP_SVE (1 << 22)
|
||||
#endif
|
||||
/* Maybe also for AT_HWCAP2
|
||||
#define HWCAP2_SVE2(1 << 1)
|
||||
et al
|
||||
) */
|
||||
|
||||
#endif //__linux__
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
||||
static uint32_t get_coretype
|
||||
(
|
||||
uint32_t* features
|
||||
)
|
||||
{
|
||||
int implementer = 0x00, part = 0x000;
|
||||
*features = FEATURE_NEON;
|
||||
|
||||
#ifdef __linux__
|
||||
if ( getauxval( AT_HWCAP ) & HWCAP_CPUID )
|
||||
{
|
||||
// Also available from
|
||||
// /sys/devices/system/cpu/cpu0/regs/identification/midr_el1
|
||||
// and split out in /proc/cpuinfo (with a tab before the colon):
|
||||
// CPU part : 0x0a1
|
||||
|
||||
uint64_t midr_el1;
|
||||
__asm("mrs %0, MIDR_EL1" : "=r" (midr_el1));
|
||||
/*
|
||||
* MIDR_EL1
|
||||
*
|
||||
* 31 24 23 20 19 16 15 4 3 0
|
||||
* -----------------------------------------------------------------
|
||||
* | Implementer | Variant | Architecture | Part Number | Revision |
|
||||
* -----------------------------------------------------------------
|
||||
*/
|
||||
implementer = (midr_el1 >> 24) & 0xFF;
|
||||
part = (midr_el1 >> 4) & 0xFFF;
|
||||
}
|
||||
|
||||
bool has_sve = getauxval( AT_HWCAP ) & HWCAP_SVE;
|
||||
if (has_sve)
|
||||
*features |= FEATURE_SVE;
|
||||
#endif //__linux__
|
||||
|
||||
#ifdef __APPLE__
|
||||
// Better values could be obtained from sysctlbyname()
|
||||
implementer = 0x61; //Apple
|
||||
part = 0x023; //Firestorm
|
||||
#endif //__APPLE__
|
||||
|
||||
// From Linux arch/arm64/include/asm/cputype.h
|
||||
// ARM_CPU_IMP_ARM 0x41
|
||||
// ARM_CPU_IMP_APM 0x50
|
||||
// ARM_CPU_IMP_CAVIUM 0x43
|
||||
// ARM_CPU_IMP_BRCM 0x42
|
||||
// ARM_CPU_IMP_QCOM 0x51
|
||||
// ARM_CPU_IMP_NVIDIA 0x4E
|
||||
// ARM_CPU_IMP_FUJITSU 0x46
|
||||
// ARM_CPU_IMP_HISI 0x48
|
||||
// ARM_CPU_IMP_APPLE 0x61
|
||||
//
|
||||
// ARM_CPU_PART_AEM_V8 0xD0F
|
||||
// ARM_CPU_PART_FOUNDATION 0xD00
|
||||
// ARM_CPU_PART_CORTEX_A57 0xD07
|
||||
// ARM_CPU_PART_CORTEX_A72 0xD08
|
||||
// ARM_CPU_PART_CORTEX_A53 0xD03
|
||||
// ARM_CPU_PART_CORTEX_A73 0xD09
|
||||
// ARM_CPU_PART_CORTEX_A75 0xD0A
|
||||
// ARM_CPU_PART_CORTEX_A35 0xD04
|
||||
// ARM_CPU_PART_CORTEX_A55 0xD05
|
||||
// ARM_CPU_PART_CORTEX_A76 0xD0B
|
||||
// ARM_CPU_PART_NEOVERSE_N1 0xD0C
|
||||
// ARM_CPU_PART_CORTEX_A77 0xD0D
|
||||
// from GCC:
|
||||
// ARM_CPU_PART_CORTEX_A78 0xd41
|
||||
// ARM_CPU_PART_CORTEX_X1 0xd44
|
||||
// ARM_CPU_PART_CORTEX_V1 0xd40
|
||||
// ARM_CPU_PART_CORTEX_N2 0xd49
|
||||
// ARM_CPU_PART_CORTEX_R82 0xd15
|
||||
//
|
||||
// APM_CPU_PART_POTENZA 0x000
|
||||
//
|
||||
// CAVIUM_CPU_PART_THUNDERX 0x0A1
|
||||
// CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2
|
||||
// CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3
|
||||
// CAVIUM_CPU_PART_THUNDERX2 0x0AF
|
||||
// CAVIUM_CPU_PART_THUNDERX3 0x0B8 // taken from OpenBLAS
|
||||
//
|
||||
// BRCM_CPU_PART_BRAHMA_B53 0x100
|
||||
// BRCM_CPU_PART_VULCAN 0x516
|
||||
//
|
||||
// QCOM_CPU_PART_FALKOR_V1 0x800
|
||||
// QCOM_CPU_PART_FALKOR 0xC00
|
||||
// QCOM_CPU_PART_KRYO 0x200
|
||||
// QCOM_CPU_PART_KRYO_3XX_SILVER 0x803
|
||||
// QCOM_CPU_PART_KRYO_4XX_GOLD 0x804
|
||||
// QCOM_CPU_PART_KRYO_4XX_SILVER 0x805
|
||||
//
|
||||
// NVIDIA_CPU_PART_DENVER 0x003
|
||||
// NVIDIA_CPU_PART_CARMEL 0x004
|
||||
//
|
||||
// FUJITSU_CPU_PART_A64FX 0x001
|
||||
//
|
||||
// HISI_CPU_PART_TSV110 0xD01
|
||||
|
||||
// APPLE_CPU_PART_M1_ICESTORM 0x022
|
||||
// APPLE_CPU_PART_M1_FIRESTORM 0x023
|
||||
|
||||
// Fixme: After merging the vpu_count branch we could report the
|
||||
// part here with bli_dolog.
|
||||
switch(implementer)
|
||||
{
|
||||
case 0x41: // ARM
|
||||
switch (part)
|
||||
{
|
||||
#ifdef BLIS_CONFIG_CORTEXA57
|
||||
case 0xd07: // Cortex A57
|
||||
return BLIS_ARCH_CORTEXA57;
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_CORTEXA53
|
||||
case 0xd03: // Cortex A53
|
||||
return BLIS_ARCH_CORTEXA53;
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_THUNDERX2
|
||||
case 0xd0c: // Neoverse N1 (and Graviton G2?)
|
||||
return BLIS_ARCH_THUNDERX2; //placeholder for N1
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
case 0x42: // Broadcom
|
||||
switch (part)
|
||||
{
|
||||
#ifdef BLIS_CONFIG_THUNDERX2
|
||||
case 0x516: // Vulcan
|
||||
return BLIS_ARCH_THUNDERX2;
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
case 0x43: // Cavium
|
||||
switch (part)
|
||||
{
|
||||
#ifdef BLIS_CONFIG_THUNDERX2
|
||||
case 0x0af: // ThunderX2
|
||||
case 0x0b8: // ThunderX3
|
||||
return BLIS_ARCH_THUNDERX2;
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
case 0x46: // Fujitsu
|
||||
switch (part)
|
||||
{
|
||||
#ifdef BLIS_CONFIG_A64FX
|
||||
case 0x001: // A64FX
|
||||
return BLIS_ARCH_A64FX;
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
case 0x61: // Apple
|
||||
switch (part)
|
||||
{
|
||||
#ifdef BLIS_CONFIG_THUNDERX2
|
||||
case 0x022: // Icestorm (M1.LITTLE)
|
||||
case 0x023: // Firestorm (M1.big)
|
||||
return BLIS_ARCH_THUNDERX2; //placeholder for M1
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
#ifdef BLIS_CONFIG_ARMSVE
|
||||
if (has_sve)
|
||||
return BLIS_ARCH_ARMSVE;
|
||||
#endif
|
||||
|
||||
// Can't use #if defined(...) here because of parsing done for autoconfiguration
|
||||
#ifdef BLIS_CONFIG_CORTEXA57
|
||||
return BLIS_ARCH_CORTEXA57;
|
||||
#else
|
||||
#ifdef BLIS_CONFIG_CORTEXA53
|
||||
return BLIS_ARCH_CORTEXA53;
|
||||
#else
|
||||
return BLIS_ARCH_GENERIC;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
uint32_t bli_cpuid_query
|
||||
(
|
||||
uint32_t* model,
|
||||
uint32_t* part,
|
||||
uint32_t* features
|
||||
)
|
||||
{
|
||||
*model = MODEL_ARMV8;
|
||||
*part = get_coretype(features);
|
||||
|
||||
return VENDOR_ARM;
|
||||
}
|
||||
|
||||
#elif defined(__arm__) || defined(_M_ARM)
|
||||
|
||||
/*
|
||||
I can't easily find documentation to do this as for aarch64, though
|
||||
it presumably could be unearthed from Linux code. However, on
|
||||
Linux 5.2 (and Androids's 3.4), /proc/cpuinfo has this sort of
|
||||
thing, used below:
|
||||
|
||||
CPU implementer : 0x41
|
||||
CPU architecture: 7
|
||||
CPU variant : 0x3
|
||||
CPU part : 0xc09
|
||||
|
||||
The complication for family selection is that Neon is optional for
|
||||
CortexA9, for instance. That's tested in bli_cpuid_is_cortexa9.
|
||||
*/
|
||||
|
||||
#define TEMP_BUFFER_SIZE 200
|
||||
|
||||
|
||||
@@ -129,6 +129,11 @@ void bli_gks_init( void )
|
||||
#endif
|
||||
|
||||
// ARM architectures
|
||||
#ifdef BLIS_CONFIG_A64FX
|
||||
bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx,
|
||||
bli_cntx_init_a64fx_ref,
|
||||
bli_cntx_init_a64fx_ind );
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_THUNDERX2
|
||||
bli_gks_register_cntx( BLIS_ARCH_THUNDERX2, bli_cntx_init_thunderx2,
|
||||
bli_cntx_init_thunderx2_ref,
|
||||
|
||||
@@ -187,6 +187,14 @@ CNTX_INIT_PROTS( generic )
|
||||
#include "bli_family_bulldozer.h"
|
||||
#endif
|
||||
|
||||
// -- ARM families --
|
||||
#ifdef BLIS_FAMILY_ARM64
|
||||
#include "bli_family_arm64.h"
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_ARM32
|
||||
#include "bli_family_arm32.h"
|
||||
#endif
|
||||
|
||||
// -- ARM architectures --
|
||||
|
||||
#ifdef BLIS_FAMILY_ARMSVE
|
||||
|
||||
Reference in New Issue
Block a user