mirror of
https://github.com/amd/blis.git
synced 2026-06-08 14:58:42 +00:00
Fix parsing in vpu_count on workstation SKX (#351)
* Fix parsing in vpu_count on workstation SKX * Document Skylake-X as Haswell for single FMA * Update vpu_count for Skylake and Cascade Lake models * Support printing the configuration selected, controlled by the environment Intended particularly for diagnosing mis-selection of SKX through unknown, or incorrect, number of VPUs. * Move bli_log outside the cpp condition, and use it where intended * Add Fixme comment (Skylake D) * Mostly superficial edits to commits towards #351. Details: - Moved architecture/sub-config logging-related code from bli_cpuid.c to bli_arch.c, tweaked names, and added more set/get layering. - Tweaked log messages output from bli_cpuid_is_skx() in bli_cpuid.c. - Content, whitespace changes to new bullet in HardwareSupport.md that relates to single-VPU Skylake-Xs. * Fix comment typos Co-authored-by: Field G. Van Zee <field@cs.utexas.edu>
This commit is contained in:
committed by
Devrajegowda, Kiran
parent
99da76fd64
commit
291ee5f748
@@ -15,6 +15,7 @@ A few remarks / reminders:
|
||||
* Induced complex (1m) implementations are employed in all situations where the real domain [gemm microkernel](KernelsHowTo.md#gemm-microkernel) of the corresponding precision is available, but the "native" complex domain gemm microkernel is unavailable. Note that the table below lists native kernels, so if a microarchitecture lists only `sd`, support for both `c` and `z` datatypes will be provided via the 1m method. (Note: most people cannot tell the difference between native and 1m-based performance.) Please see our [ACM TOMS article on the 1m method](https://github.com/flame/blis#citations) for more info on this topic.
|
||||
* Some microarchitectures use the same sub-configuration. *This is not a typo.* For example, Haswell and Broadwell systems as well as "desktop" (non-server) versions of Skylake, Kaby Lake, and Coffee Lake all use the `haswell` sub-configuration and the kernels registered therein. Microkernels can be recycled in this manner because the key detail that determines level-3 performance outcomes is actually the vector ISA, not the microarchitecture. In the previous example, all of the microarchitectures listed support AVX2 (but not AVX-512), and therefore they can reuse the same microkernels.
|
||||
* Remember that you (usually) don't have to choose your sub-configuration manually! Instead, you can always request configure-time hardware detection via `./configure auto`. This will defer to internal logic (based on CPUID for x86_64 systems) that will attempt to choose the appropriate sub-configuration automatically.
|
||||
* There is a difficulty in automatically choosing the ideal sub-configuration for use on Skylake-X systems, which may have one or two FMA units. The `skx` sub-configuration is only beneficial when used on hardware with two FMA units. Otherwise the hardware is treated as a "desktop" Skylake system, which uses the `haswell` sub-configuration. Furthermore, the number of units can't be queried directly; instead, we rely on a manually maintained list of CPU models (via logic in `frame/base/bli_cpuid.c`), which may be incorrect for new processors, particularly Gold models. In that case, you can either fix the code (and please raise an issue!) or manually target the `skx` sub-configuration at configure-time (i.e., `./configure [options] skx`). If your performance seems low, you can set `export BLIS_ARCH_DEBUG=1`, which will cause BLIS to output some basic debugging info to `stderr` that will reveal whether your system was detected as having one or two VPUs (FMA units).
|
||||
|
||||
| Vendor/Microarchitecture | BLIS sub-configuration | `gemm` | `gemmtrsm` |
|
||||
|:-------------------------------------|:-----------------------|:-------|:-----------|
|
||||
@@ -28,7 +29,8 @@ A few remarks / reminders:
|
||||
| Intel Haswell, Broadwell (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
|
||||
| Intel Sky/Kaby/CoffeeLake (AVX/FMA3) | `haswell` | `sdcz` | `sd` |
|
||||
| Intel Knights Landing (AVX-512/FMA3) | `knl` | `sd` | |
|
||||
| Intel SkylakeX (AVX-512/FMA3) | `skx` | `sd` | |
|
||||
| Intel SkylakeX (AVX-512/2×FMA3) | `skx` | `sd` | |
|
||||
| Intel SkylakeX (AVX-512/1×FMA3) | `haswell` | `sdcz` | `sd` |
|
||||
| ARMv7 Cortex-A9 (NEON) | `cortex-a9` | `sd` | |
|
||||
| ARMv7 Cortex-A15 (NEON) | `cortex-a15` | `sd` | |
|
||||
| ARMv8 Cortex-A53 (NEON) | `cortex-a53` | `sd` | |
|
||||
|
||||
@@ -74,6 +74,12 @@ void bli_arch_set_id_once( void )
|
||||
|
||||
void bli_arch_set_id( void )
|
||||
{
|
||||
// NOTE: Change this usage of getenv() to bli_env_get_var() after
|
||||
// merging #351.
|
||||
//bool_t do_logging = bli_env_get_var( "BLIS_ARCH_DEBUG", 0 );
|
||||
bool_t do_logging = getenv( "BLIS_ARCH_DEBUG" ) != NULL;
|
||||
bli_arch_set_logging( do_logging );
|
||||
|
||||
// Architecture families.
|
||||
#if defined BLIS_FAMILY_INTEL64 || \
|
||||
defined BLIS_FAMILY_AMD64 || \
|
||||
@@ -156,6 +162,10 @@ void bli_arch_set_id( void )
|
||||
id = BLIS_ARCH_GENERIC;
|
||||
#endif
|
||||
|
||||
if ( bli_arch_get_logging() )
|
||||
fprintf( stderr, "libblis: selecting sub-configuration '%s'.\n",
|
||||
bli_arch_string( id ) );
|
||||
|
||||
//printf( "blis_arch_query_id(): id = %u\n", id );
|
||||
//exit(1);
|
||||
}
|
||||
@@ -200,3 +210,37 @@ char* bli_arch_string( arch_t id )
|
||||
return config_name[ id ];
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
static bool_t arch_dolog = 0;
|
||||
|
||||
// Enable or disable architecture-selection logging by storing the requested
// state in the file-scope flag arch_dolog.
//
//   dolog: the new logging state to record.
void bli_arch_set_logging( bool_t dolog )
{
	arch_dolog = dolog;
}
|
||||
|
||||
bool_t bli_arch_get_logging( void )
|
||||
{
|
||||
return arch_dolog;
|
||||
}
|
||||
|
||||
// Write a printf-style diagnostic message to stderr, prefixed with
// "libblis: ", but only when architecture logging is enabled (as reported
// by bli_arch_get_logging()).
//
//   fmt: printf-style format string; a NULL fmt is silently ignored.
//   ...: arguments consumed by the conversions in fmt.
//
// The prefix and fmt are joined into a single format string so that the
// whole message is emitted by one vfprintf() call.
void bli_arch_log( char* fmt, ... )
{
	// Check both preconditions before touching fmt: calling strlen() on a
	// NULL pointer is undefined behavior, and there is no reason to do any
	// string work when logging is switched off.
	if ( !bli_arch_get_logging() || fmt == NULL )
		return;

	const char prefix[] = "libblis: ";
	const size_t n_chars = strlen( prefix ) + strlen( fmt ) + 1;

	char* prefix_fmt = malloc( n_chars );

	va_list ap;
	va_start( ap, fmt );

	if ( prefix_fmt != NULL )
	{
		snprintf( prefix_fmt, n_chars, "%s%s", prefix, fmt );
		vfprintf( stderr, prefix_fmt, ap );
	}
	else
	{
		// Allocation failed: still emit the diagnostic, just as two
		// separate writes rather than one.
		fputs( prefix, stderr );
		vfprintf( stderr, fmt, ap );
	}

	va_end( ap );

	// free( NULL ) is a no-op, so no guard is needed here.
	free( prefix_fmt );
}
|
||||
|
||||
|
||||
@@ -42,6 +42,9 @@ void bli_arch_set_id( void );
|
||||
|
||||
BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id );
|
||||
|
||||
void bli_arch_set_logging( bool_t dolog );
|
||||
bool_t bli_arch_get_logging( void );
|
||||
void bli_arch_log( char*, ... );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019, Dave Love, University of Manchester
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -52,6 +53,7 @@
|
||||
#include "bli_cpuid.h"
|
||||
#else
|
||||
#include "blis.h"
|
||||
#include "bli_arch.h"
|
||||
#endif
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
@@ -167,7 +169,22 @@ bool_t bli_cpuid_is_skx
|
||||
|
||||
int nvpu = vpu_count();
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) || nvpu != 2 )
|
||||
if ( bli_cpuid_has_features( features, expected ) )
|
||||
{
|
||||
switch ( nvpu )
|
||||
{
|
||||
case 1:
|
||||
bli_arch_log( "Hardware has 1 FMA unit; using 'haswell' (not 'skx') sub-config.\n" );
|
||||
return FALSE;
|
||||
case 2:
|
||||
bli_arch_log( "Hardware has 2 FMA units; using 'skx' sub-config.\n" );
|
||||
return TRUE;
|
||||
default:
|
||||
bli_arch_log( "Number of FMA units unknown; using 'haswell' (not 'skx') config.\n" );
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
else
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
@@ -893,6 +910,10 @@ void get_cpu_name( char *cpu_name )
|
||||
*( uint32_t* )&cpu_name[32+12] = edx;
|
||||
}
|
||||
|
||||
// Return the number of FMA units _assuming avx512 is supported_.
|
||||
// This needs updating for new processor types, sigh.
|
||||
// See https://ark.intel.com/content/www/us/en/ark.html#@Processors
|
||||
// and also https://github.com/jeffhammond/vpu-count
|
||||
int vpu_count( void )
|
||||
{
|
||||
char cpu_name[48] = {};
|
||||
@@ -904,49 +925,59 @@ int vpu_count( void )
|
||||
|
||||
if ( strstr( cpu_name, "Intel(R) Xeon(R)" ) != NULL )
|
||||
{
|
||||
loc = strstr( cpu_name, "Platinum" );
|
||||
if (( loc = strstr( cpu_name, "Platinum" ) ))
|
||||
return 2;
|
||||
if ( loc == NULL )
|
||||
loc = strstr( cpu_name, "Gold" );
|
||||
loc = strstr( cpu_name, "Gold" ); // 1 or 2, tested below
|
||||
if ( loc == NULL )
|
||||
loc = strstr( cpu_name, "Silver" );
|
||||
if (( loc = strstr( cpu_name, "Silver" ) ))
|
||||
return 1;
|
||||
if ( loc == NULL )
|
||||
loc = strstr( cpu_name, "Bronze" );
|
||||
if (( loc = strstr( cpu_name, "Bronze" ) ))
|
||||
return 1;
|
||||
if ( loc == NULL )
|
||||
loc = strstr( cpu_name, "W" );
|
||||
if ( loc == NULL )
|
||||
if (( loc = strstr( cpu_name, "D" ) ))
|
||||
// Fixme: May be wrong
|
||||
// <https://github.com/jeffhammond/vpu-count/issues/3#issuecomment-542044651>
|
||||
return 1;
|
||||
if ( loc == NULL )
|
||||
return -1;
|
||||
|
||||
loc = strstr( loc+1, " " );
|
||||
// We may have W-nnnn rather than, say, Gold nnnn
|
||||
if ( 'W' == *loc && '-' == *(loc+1) )
|
||||
loc++;
|
||||
else
|
||||
loc = strstr( loc+1, " " );
|
||||
if ( loc == NULL )
|
||||
return -1;
|
||||
|
||||
strncpy( model_num, loc+1, 4 );
|
||||
model_num[4] = '\0';
|
||||
model_num[4] = '\0'; // Things like i9-10900X matched above
|
||||
|
||||
sku = atoi( model_num );
|
||||
|
||||
// These were derived from ARK listings as of 2019-10-09, but
|
||||
// may not be complete, especially as the ARK Skylake listing
|
||||
// seems to be limited.
|
||||
if ( 8199 >= sku && sku >= 8100 ) return 2;
|
||||
else if ( 6199 >= sku && sku >= 6100 ) return 2;
|
||||
else if ( sku == 5122 ) return 2;
|
||||
else if ( 6299 >= sku && sku >= 6200 ) return 2; // Cascade Lake Gold
|
||||
else if ( 5299 >= sku && sku >= 5200 ) return 1; // Cascade Lake Gold
|
||||
else if ( 5199 >= sku && sku >= 5100 ) return 1;
|
||||
else if ( 4199 >= sku && sku >= 4100 ) return 1;
|
||||
else if ( 3199 >= sku && sku >= 3100 ) return 1;
|
||||
else if ( 3299 >= sku && sku >= 3200 ) return 2; // Cascade Lake W
|
||||
else if ( 2299 >= sku && sku >= 2200 ) return 2; // Cascade Lake W
|
||||
else if ( 2199 >= sku && sku >= 2120 ) return 2;
|
||||
else if ( 2102 == sku || sku == 2104 ) return 2; // Gold exceptions
|
||||
else if ( 2119 >= sku && sku >= 2100 ) return 1;
|
||||
else return -1;
|
||||
}
|
||||
else if ( strstr( cpu_name, "Intel(R) Core(TM) i9" ) != NULL )
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
else if ( strstr( cpu_name, "Intel(R) Core(TM) i7" ) != NULL )
|
||||
{
|
||||
if ( strstr( cpu_name, "7800X" ) != NULL ||
|
||||
strstr( cpu_name, "7820X" ) != NULL )
|
||||
return 1;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
else if ( strstr( cpu_name, "Intel(R) Core(TM)" ) != NULL )
|
||||
return 2; // All i7/i9 with avx512?
|
||||
else
|
||||
{
|
||||
return -1;
|
||||
@@ -1082,3 +1113,4 @@ char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
Reference in New Issue
Block a user