From 291ee5f7486d1be3f4af9ff4090215dc287d733f Mon Sep 17 00:00:00 2001 From: Dave Love Date: Mon, 6 Jan 2020 20:15:48 +0000 Subject: [PATCH] Fix parsing in vpu_count on workstation SKX (#351) * Fix parsing in vpu_count on workstation SKX * Document Skylake-X as Haswell for single FMA * Update vpu_count for Skylake and Cascade Lake models * Support printing the configuration selected, controlled by the environment Intended particularly for diagnosing mis-selection of SKX through unknown, or incorrect, number of VPUs. * Move bli_log outside the cpp condition, and use it where intended * Add Fixme comment (Skylake D) * Mostly superficial edits to commits towards #351. Details: - Moved architecture/sub-config logging-related code from bli_cpuid.c to bli_arch.c, tweaked names, and added more set/get layering. - Tweaked log messages output from bli_cpuid_is_skx() in bli_cpuid.c. - Content, whitespace changes to new bullet in HardwareSupport.md that relates to single-VPU Skylake-Xs. * Fix comment typos Co-authored-by: Field G. Van Zee --- docs/HardwareSupport.md | 4 ++- frame/base/bli_arch.c | 44 ++++++++++++++++++++++++++ frame/base/bli_arch.h | 3 ++ frame/base/bli_cpuid.c | 70 ++++++++++++++++++++++++++++++----------- 4 files changed, 101 insertions(+), 20 deletions(-) diff --git a/docs/HardwareSupport.md b/docs/HardwareSupport.md index adba02f19..32e5c4a63 100644 --- a/docs/HardwareSupport.md +++ b/docs/HardwareSupport.md @@ -15,6 +15,7 @@ A few remarks / reminders: * Induced complex (1m) implementations are employed in all situations where the real domain [gemm microkernel](KernelsHowTo.md#gemm-microkernel) of the corresponding precision is available, but the "native" complex domain gemm microkernel is unavailable. Note that the table below lists native kernels, so if a microarchitecture lists only `sd`, support for both `c` and `z` datatypes will be provided via the 1m method. (Note: most people cannot tell the difference between native and 1m-based performance.) 
Please see our [ACM TOMS article on the 1m method](https://github.com/flame/blis#citations) for more info on this topic. * Some microarchitectures use the same sub-configuration. *This is not a typo.* For example, Haswell and Broadwell systems as well as "desktop" (non-server) versions of Skylake, Kaby Lake, and Coffee Lake all use the `haswell` sub-configuration and the kernels registered therein. Microkernels can be recycled in this manner because the key detail that determines level-3 performance outcomes is actually the vector ISA, not the microarchitecture. In the previous example, all of the microarchitectures listed support AVX2 (but not AVX-512), and therefore they can reuse the same microkernels. * Remember that you (usually) don't have to choose your sub-configuration manually! Instead, you can always request configure-time hardware detection via `./configure auto`. This will defer to internal logic (based on CPUID for x86_64 systems) that will attempt to choose the appropriate sub-configuration automatically. + * There is a difficulty in automatically choosing the ideal sub-configuration for use on Skylake-X systems, which may have one or two FMA units. The `skx` sub-configuration is only beneficial when used on hardware with two FMA units. Otherwise the hardware is treated as a "desktop" Skylake system, which uses the `haswell` sub-configuration. Furthermore, the number of units can't be queried directly; instead, we rely on a manually-maintained list of CPU models (via logic in `frame/base/bli_cpuid.c`), which may be incorrect for new processors, particularly Gold models. In that case, you can either fix the code (and please raise an issue!) or manually target the `skx` sub-configuration at configure-time (i.e., `./configure [options] skx`). If your performance seems low, you can set `export BLIS_ARCH_DEBUG=1`, which will cause BLIS to output some basic debugging info to `stderr` that will reveal whether your system was detected as having one or two VPUs (FMA units). 
| Vendor/Microarchitecture | BLIS sub-configuration | `gemm` | `gemmtrsm` | |:-------------------------------------|:-----------------------|:-------|:-----------| @@ -28,7 +29,8 @@ A few remarks / reminders: | Intel Haswell, Broadwell (AVX/FMA3) | `haswell` | `sdcz` | `sd` | | Intel Sky/Kaby/CoffeeLake (AVX/FMA3) | `haswell` | `sdcz` | `sd` | | Intel Knights Landing (AVX-512/FMA3) | `knl` | `sd` | | -| Intel SkylakeX (AVX-512/FMA3) | `skx` | `sd` | | +| Intel SkylakeX (AVX-512/2×FMA3) | `skx` | `sd` | | +| Intel SkylakeX (AVX-512/1×FMA3) | `haswell` | `sdcz` | `sd` | | ARMv7 Cortex-A9 (NEON) | `cortex-a9` | `sd` | | | ARMv7 Cortex-A15 (NEON) | `cortex-a15` | `sd` | | | ARMv8 Cortex-A53 (NEON) | `cortex-a53` | `sd` | | diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 4f1f9fb93..06b23ed1a 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -74,6 +74,12 @@ void bli_arch_set_id_once( void ) void bli_arch_set_id( void ) { + // NOTE: Change this usage of getenv() to bli_env_get_var() after + // merging #351. + //bool_t do_logging = bli_env_get_var( "BLIS_ARCH_DEBUG", 0 ); + bool_t do_logging = getenv( "BLIS_ARCH_DEBUG" ) != NULL; + bli_arch_set_logging( do_logging ); + // Architecture families. #if defined BLIS_FAMILY_INTEL64 || \ defined BLIS_FAMILY_AMD64 || \ @@ -156,6 +162,10 @@ void bli_arch_set_id( void ) id = BLIS_ARCH_GENERIC; #endif + if ( bli_arch_get_logging() ) + fprintf( stderr, "libblis: selecting sub-configuration '%s'.\n", + bli_arch_string( id ) ); + //printf( "blis_arch_query_id(): id = %u\n", id ); //exit(1); } @@ -200,3 +210,37 @@ char* bli_arch_string( arch_t id ) return config_name[ id ]; } +// ----------------------------------------------------------------------------- + +static bool_t arch_dolog = 0; + +void bli_arch_set_logging( bool_t dolog ) +{ + arch_dolog = dolog; +} + +bool_t bli_arch_get_logging( void ) +{ + return arch_dolog; +} + +void bli_arch_log( char* fmt, ... 
) +{ + char prefix[] = "libblis: "; + int n_chars = strlen( prefix ) + strlen( fmt ) + 1; + + if ( bli_arch_get_logging() && fmt ) + { + char* prefix_fmt = malloc( n_chars ); + + snprintf( prefix_fmt, n_chars, "%s%s", prefix, fmt ); + + va_list ap; + va_start( ap, fmt ); + vfprintf( stderr, prefix_fmt, ap ); + va_end( ap ); + + free( prefix_fmt ); + } +} + diff --git a/frame/base/bli_arch.h b/frame/base/bli_arch.h index 6b8a38ebd..4f3f94a7e 100644 --- a/frame/base/bli_arch.h +++ b/frame/base/bli_arch.h @@ -42,6 +42,9 @@ void bli_arch_set_id( void ); BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); +void bli_arch_set_logging( bool_t dolog ); +bool_t bli_arch_get_logging( void ); +void bli_arch_log( char*, ... ); #endif diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c index 690e22304..5858c88a7 100644 --- a/frame/base/bli_cpuid.c +++ b/frame/base/bli_cpuid.c @@ -6,6 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018-2019, Advanced Micro Devices, Inc. 
+ Copyright (C) 2019, Dave Love, University of Manchester Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,6 +53,7 @@ #include "bli_cpuid.h" #else #include "blis.h" + #include "bli_arch.h" #endif // ----------------------------------------------------------------------------- @@ -167,7 +169,22 @@ bool_t bli_cpuid_is_skx int nvpu = vpu_count(); - if ( !bli_cpuid_has_features( features, expected ) || nvpu != 2 ) + if ( bli_cpuid_has_features( features, expected ) ) + { + switch ( nvpu ) + { + case 1: + bli_arch_log( "Hardware has 1 FMA unit; using 'haswell' (not 'skx') sub-config.\n" ); + return FALSE; + case 2: + bli_arch_log( "Hardware has 2 FMA units; using 'skx' sub-config.\n" ); + return TRUE; + default: + bli_arch_log( "Number of FMA units unknown; using 'haswell' (not 'skx') config.\n" ); + return FALSE; + } + } + else return FALSE; return TRUE; @@ -893,6 +910,10 @@ void get_cpu_name( char *cpu_name ) *( uint32_t* )&cpu_name[32+12] = edx; } +// Return the number of FMA units _assuming avx512 is supported_. +// This needs updating for new processor types, sigh. 
+// See https://ark.intel.com/content/www/us/en/ark.html#@Processors +// and also https://github.com/jeffhammond/vpu-count int vpu_count( void ) { char cpu_name[48] = {}; @@ -904,49 +925,59 @@ int vpu_count( void ) if ( strstr( cpu_name, "Intel(R) Xeon(R)" ) != NULL ) { - loc = strstr( cpu_name, "Platinum" ); + if (( loc = strstr( cpu_name, "Platinum" ) )) + return 2; if ( loc == NULL ) - loc = strstr( cpu_name, "Gold" ); + loc = strstr( cpu_name, "Gold" ); // 1 or 2, tested below if ( loc == NULL ) - loc = strstr( cpu_name, "Silver" ); + if (( loc = strstr( cpu_name, "Silver" ) )) + return 1; if ( loc == NULL ) - loc = strstr( cpu_name, "Bronze" ); + if (( loc = strstr( cpu_name, "Bronze" ) )) + return 1; if ( loc == NULL ) loc = strstr( cpu_name, "W" ); + if ( loc == NULL ) + if (( loc = strstr( cpu_name, "D" ) )) + // Fixme: May be wrong + // + return 1; if ( loc == NULL ) return -1; - loc = strstr( loc+1, " " ); + // We may have W-nnnn rather than, say, Gold nnnn + if ( 'W' == *loc && '-' == *(loc+1) ) + loc++; + else + loc = strstr( loc+1, " " ); if ( loc == NULL ) return -1; strncpy( model_num, loc+1, 4 ); - model_num[4] = '\0'; + model_num[4] = '\0'; // Things like i9-10900X matched above sku = atoi( model_num ); + // These were derived from ARK listings as of 2019-10-09, but + // may not be complete, especially as the ARK Skylake listing + // seems to be limited. 
if ( 8199 >= sku && sku >= 8100 ) return 2; else if ( 6199 >= sku && sku >= 6100 ) return 2; else if ( sku == 5122 ) return 2; + else if ( 6299 >= sku && sku >= 6200 ) return 2; // Cascade Lake Gold + else if ( 5299 >= sku && sku >= 5200 ) return 1; // Cascade Lake Gold else if ( 5199 >= sku && sku >= 5100 ) return 1; else if ( 4199 >= sku && sku >= 4100 ) return 1; else if ( 3199 >= sku && sku >= 3100 ) return 1; + else if ( 3299 >= sku && sku >= 3200 ) return 2; // Cascade Lake W + else if ( 2299 >= sku && sku >= 2200 ) return 2; // Cascade Lake W else if ( 2199 >= sku && sku >= 2120 ) return 2; + else if ( 2102 == sku || sku == 2104 ) return 2; // Gold exceptions else if ( 2119 >= sku && sku >= 2100 ) return 1; else return -1; } - else if ( strstr( cpu_name, "Intel(R) Core(TM) i9" ) != NULL ) - { - return 1; - } - else if ( strstr( cpu_name, "Intel(R) Core(TM) i7" ) != NULL ) - { - if ( strstr( cpu_name, "7800X" ) != NULL || - strstr( cpu_name, "7820X" ) != NULL ) - return 1; - else - return -1; - } + else if ( strstr( cpu_name, "Intel(R) Core(TM)" ) != NULL ) + return 2; // All i7/i9 with avx512? else { return -1; @@ -1082,3 +1113,4 @@ char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath } #endif +