CMake:Added support for ADDON(aocl_gemm) on Windows

CMakelists.txt is updated to support aocl_gemm on windows.
On windows, BLIS library(blis+aocl_gemm) is built successfully
only with AOCC Compiler. (Clang has an issue with optimizing
VNNI instructions).

$cmake .. -DENABLE_ADDON="aocl_gemm" ....

AMD-Internal: [CPUPL-2748]
Change-Id: I9620878ab6934233fadc9ddc5d5e82ad85be9209
This commit is contained in:
jagar
2024-02-20 14:13:24 +05:30
committed by Jagadish1 R
parent bbfa4a88ec
commit e2de45b454
6 changed files with 88 additions and 85 deletions

View File

@@ -142,6 +142,8 @@ foreach(KERN ${KERNEL_LIST})
set(KERNEL_LIST_DEFINES "${KERNEL_LIST_DEFINES}#define BLIS_KERNELS_${UCONF}\n")
endforeach()
#------------------------------------
# Option Setting
#------------------------------------
@@ -273,8 +275,15 @@ else()
endif()
set(RENAME_BLIS_ARCH_TYPE "BLIS_ARCH_TYPE" CACHE STRING "BLIS_ARCH_TYPE env var renamed to supplied value")
set(RENAME_BLIS_MODEL_TYPE "BLIS_MODEL_TYPE" CACHE STRING "BLIS_MODEL_TYPE env var renamed to supplied value")
if(NOT WIN32)
set(ENABLE_ADDON "" CACHE STRING "Configure with specific addons using a ';'-separated list")
if(ENABLE_ADDON)
execute_process(COMMAND ${CMAKE_C_COMPILER} --version OUTPUT_VARIABLE clang_full_version_string)
string(REGEX MATCH "^[^\n]*" CLANG_VERSION_STRING "${clang_full_version_string}")
if((NOT WIN32) OR
(WIN32 AND ("${CLANG_VERSION_STRING}" MATCHES "(AMD|AOCC)")))
set(ENABLE_ADDON "" CACHE STRING "Configure with specific addons using a ';'-separated list")
else()
message(FATAL_ERROR "On Windows, aocl_gemm addon requires AOCC clang compiler.")
endif()
endif()
set(ENABLE_SANDBOX "" CACHE STRING "Enable a separate sandbox implementation of gemm.")
# Do not let ENABLE_SANDBOX appear on cmake-gui since the functionality is not yet implemented.
@@ -582,24 +591,22 @@ if((INT_TYPE_SIZE STREQUAL "32") AND (BLAS_INT_TYPE_SIZE STREQUAL "64"))
To avoid the possibility of truncation, we do not allow use of 64-bit integers in the BLAS API with 32-bit integers in BLIS. \
Please use a different configuration of integers.")
endif()
if(NOT WIN32)
cmake_print_variables(ENABLE_ADDON)
if(ENABLE_ADDON STREQUAL "")
message(" Configuring with no addons.")
set(ENABLE_ADDONS_01 0)
else()
# Remove duplicates in the addon list, if they exist.
list(REMOVE_DUPLICATES ENABLE_ADDON)
message(" Configuring with addons:")
foreach(ADDON ${ENABLE_ADDON})
message(" ${ADDON}")
if(NOT (EXISTS ${CMAKE_SOURCE_DIR}/addon/${ADDON}))
message(FATAL_ERROR "Requested addon sub-directory does not exist! Cannot continue. \
*** Please verify addon existence and name.")
endif()
endforeach()
set(ENABLE_ADDONS_01 1)
endif()
cmake_print_variables(ENABLE_ADDON)
if(ENABLE_ADDON STREQUAL "")
message(" Configuring with no addons.")
set(ENABLE_ADDONS_01 0)
else()
# Remove duplicates in the addon list, if they exist.
list(REMOVE_DUPLICATES ENABLE_ADDON)
message(" Configuring with addons:")
foreach(ADDON ${ENABLE_ADDON})
message(" ${ADDON}")
if(NOT (EXISTS ${CMAKE_SOURCE_DIR}/addon/${ADDON}))
message(FATAL_ERROR "Requested addon sub-directory does not exist! Cannot continue. \
*** Please verify addon existence and name.")
endif()
endforeach()
set(ENABLE_ADDONS_01 1)
endif()
cmake_print_variables(ENABLE_SANDBOX)
if(ENABLE_SANDBOX STREQUAL "")

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -37,9 +37,9 @@
typedef enum
{
INT8 = 0,
INT16 = 1,
INT32 = 2
LPGEMM_INT8 = 0,
LPGEMM_INT16 = 1,
LPGEMM_INT32 = 2
} AOCL_ARRAY_TYPE;
// Enum to denote the storage data type (output matrix).

View File

@@ -38,7 +38,6 @@
#include <immintrin.h>
#include <time.h>
#include <float.h>
#include <unistd.h>
#include <math.h>
#include "blis.h"
@@ -365,10 +364,10 @@ void print_result
dim_t lda,
dim_t ldb,
dim_t ldc,
double runtime
double gflops
)
{
double gflops = get_gflops( m, n, k, runtime );
//double gflops = get_gflops( m, n, k, runtime );
printf("%s transa:%c, transb:%c, m: %ld, n: %ld, k: %ld, lda: %ld, ldb: %ld, ldc: %ld," \
" Gops: %f, n_repeats: %d\n",
msg, transa, transb, m, n, k, lda, ldb, ldc, gflops, n_repeats);
@@ -397,11 +396,12 @@ void mat_mul_bench_driver_ ## BLAS_SFX \
aocl_post_op* post_op\
) \
{ \
double min_time_diff = DBL_MAX; \
double dtime; \
double dtime_save = DBL_MAX; \
\
for ( int32_t nr = 0; nr < n_repeats; ++nr ) \
{ \
struct timespec tstart={0,0}, tend={0,0}; \
clock_gettime(CLOCK_MONOTONIC, &tstart); \
dtime = bli_clock(); \
\
GEN_FUNC_NAME(mat_mul_,BLAS_SFX) \
( \
@@ -414,15 +414,12 @@ void mat_mul_bench_driver_ ## BLAS_SFX \
post_op \
); \
\
clock_gettime(CLOCK_MONOTONIC, &tend); \
dtime_save = bli_clock_min_diff( dtime_save, dtime ); \
\
double diff = \
( ( double ) tend.tv_sec + ( 1.0e-9 * tend.tv_nsec ) ) - \
( ( double ) tstart.tv_sec + ( 1.0e-9 * tstart.tv_nsec ) ); \
min_time_diff = ( diff < min_time_diff ) ? diff : min_time_diff; \
} \
double gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 ); \
\
print_result( XSTR(BLAS_SFX), n_repeats, transa, transb, m, n, k, lda, ldb, ldc, min_time_diff); \
print_result( XSTR(BLAS_SFX), n_repeats, transa, transb, m, n, k, lda, ldb, ldc, gflops); \
} \
GEN_MAT_MUL_BENCH_DRV_FUNC(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
@@ -438,6 +435,7 @@ GEN_MAT_MUL_BENCH_DRV_FUNC(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
GEN_MAT_MUL_BENCH_DRV_FUNC(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
GEN_MAT_MUL_BENCH_DRV_FUNC(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
#ifndef WIN32
int max (int a, int b)
{
return ( a > b ? a : b );
@@ -447,6 +445,7 @@ int min (int a, int b)
{
return ( a < b ? a : b );
}
#endif
#define GEN_MAT_MUL_ACC_CHK_DOWNSCALE(C_type,ACCUM_type,SCALE_type,BLAS_DOWNSCALE_SFX) \
static inline ACCUM_type mat_mul_accuracy_check_downscale_ ## BLAS_DOWNSCALE_SFX \
@@ -1536,20 +1535,26 @@ int main( int argc, char** argv )
char ops_input_str[OPS_INPUT_STR_LEN];
// Parse CLI arguments.
opterr = 0;
int opt_val;
while ( ( opt_val = getopt( argc, argv, "i:m:n:" ) ) != -1 )
getopt_t state;
// Initialize the state for running bli_getopt(). Here, 0 is the
// initial value for opterr, which suppresses error messages.
bli_getopt_init_state( 0, &state );
int opt;
// Process all option arguments until we get a -1, which means we're done.
while( (opt = bli_getopt( argc, argv, "i:m:n:", &state )) != -1 )
{
switch ( opt_val )
char opt_ch = ( char )opt;
switch( opt_ch )
{
case 'i':
file_name = optarg;
file_name = state.optarg;
break;
case 'm':
bench_mode = ( ( ( *optarg ) == 'a' ) || ( ( *optarg ) == 'p' ) ) ? ( *optarg ) : 'p';
bench_mode = ( ( ( *state.optarg ) == 'a' ) || ( ( *state.optarg ) == 'p' ) ) ? ( *state.optarg ) : 'p';
break;
case 'n':
global_n_repeat = ( atoi( optarg ) > 0 ) ? atoi( optarg ) : 0;
global_n_repeat = ( atoi( state.optarg ) > 0 ) ? atoi( state.optarg ) : 0;
break;
default:
break;

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -37,7 +37,6 @@
#include <string.h>
#include <time.h>
#include <float.h>
#include <unistd.h>
#include <math.h>
#include "blis.h"
@@ -89,11 +88,11 @@ void gelu_bench_driver_ ## GELU_SFX \
inc_t incx \
) \
{ \
double min_time_diff = DBL_MAX; \
double dtime; \
double dtime_save = DBL_MAX; \
for ( int32_t nr = 0; nr < n_repeats; ++nr ) \
{ \
struct timespec tstart={0,0}, tend={0,0}; \
clock_gettime(CLOCK_MONOTONIC, &tstart); \
dtime = bli_clock(); \
\
if ( bench_mode == 'a' ) \
{ \
@@ -105,15 +104,11 @@ void gelu_bench_driver_ ## GELU_SFX \
n, x, incx \
); \
\
clock_gettime(CLOCK_MONOTONIC, &tend); \
dtime_save = bli_clock_min_diff( dtime_save, dtime ); \
\
double diff = \
( ( double ) tend.tv_sec + ( 1.0e-9 * tend.tv_nsec ) ) - \
( ( double ) tstart.tv_sec + ( 1.0e-9 * tstart.tv_nsec ) ); \
min_time_diff = ( diff < min_time_diff ) ? diff : min_time_diff; \
} \
\
print_result( XSTR(GELU_SFX), n_repeats, n, incx, min_time_diff); \
print_result( XSTR(GELU_SFX), n_repeats, n, incx, dtime_save); \
} \
GEN_GELU_BENCH_DRV_FN(float,gelu_tanh_f32)
@@ -128,11 +123,11 @@ void softmax_bench_driver_ ## SOFTMAX_SFX \
inc_t incx \
) \
{ \
double min_time_diff = DBL_MAX; \
double dtime; \
double dtime_save = DBL_MAX; \
for ( int32_t nr = 0; nr < n_repeats; ++nr ) \
{ \
struct timespec tstart={0,0}, tend={0,0}; \
clock_gettime(CLOCK_MONOTONIC, &tstart); \
dtime = bli_clock(); \
\
if ( bench_mode == 'a' ) \
{ \
@@ -144,15 +139,10 @@ void softmax_bench_driver_ ## SOFTMAX_SFX \
n, x, incx \
); \
\
clock_gettime(CLOCK_MONOTONIC, &tend); \
\
double diff = \
( ( double ) tend.tv_sec + ( 1.0e-9 * tend.tv_nsec ) ) - \
( ( double ) tstart.tv_sec + ( 1.0e-9 * tstart.tv_nsec ) ); \
min_time_diff = ( diff < min_time_diff ) ? diff : min_time_diff; \
dtime_save = bli_clock_min_diff( dtime_save, dtime ); \
} \
\
print_result( XSTR(SOFTMAX_SFX), n_repeats, n, incx, min_time_diff); \
print_result( XSTR(SOFTMAX_SFX), n_repeats, n, incx, dtime_save); \
} \
GEN_SOFTMAX_BENCH_DRV_FN(float,softmax_f32)
@@ -323,22 +313,26 @@ int main( int argc, char** argv )
}
char* file_name = NULL;
getopt_t state;
// Initialize the state for running bli_getopt(). Here, 0 is the
// initial value for opterr, which suppresses error messages.
bli_getopt_init_state( 0, &state );
// Parse CLI arguments.
opterr = 0;
int opt_val;
while ( ( opt_val = getopt( argc, argv, "i:m:n:" ) ) != -1 )
int opt;
// Process all option arguments until we get a -1, which means we're done.
while( (opt = bli_getopt( argc, argv, "i:m:n:", &state )) != -1 )
{
switch ( opt_val )
char opt_ch = ( char )opt;
switch( opt_ch )
{
case 'i':
file_name = optarg;
file_name = state.optarg;
break;
case 'm':
bench_mode = ( ( ( *optarg ) == 'a' ) || ( ( *optarg ) == 'p' ) ) ? ( *optarg ) : 'p';
bench_mode = ( ( ( *state.optarg ) == 'a' ) || ( ( *state.optarg ) == 'p' ) ) ? ( *state.optarg ) : 'p';
break;
case 'n':
global_n_repeat = ( atoi( optarg ) > 0 ) ? atoi( optarg ) : 0;
global_n_repeat = ( atoi( state.optarg ) > 0 ) ? atoi( state.optarg ) : 0;
break;
default:
break;

View File

@@ -81,13 +81,16 @@ if("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
string(REGEX MATCHALL "(AOCC_2|AOCC_3|AOCC_4|AOCC|LLVM|clang)" CLANG_STRING "${CLANG_VERSION_STRING}")
string(REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION "${CLANG_VERSION_STRING}")
if(NOT WIN32)
set(alignloops "-falign-loops=64")
endif()
if("${CLANG_STRING}" MATCHES "AOCC_4")
# AOCC version 4x we will enable znver4
list(APPEND CKVECFLAGS -march=znver4 -falign-loops=64)
list(APPEND CKVECFLAGS -march=znver4 ${alignloops})
list(APPEND CRVECFLAGS -march=znver4)
elseif("${CLANG_STRING}" MATCHES "AOCC_3")
# AOCC version 3x we will enable znver3
list(APPEND CKVECFLAGS -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64)
list(APPEND CKVECFLAGS -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 ${alignloops})
list(APPEND CRVECFLAGS -march=znver3)
elseif("${CLANG_STRING}" MATCHES "(AOCC_2|LLVM)")
# AOCC version 2x we will enable znver2
@@ -95,18 +98,18 @@ if("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
list(APPEND CRVECFLAGS -march=znver2)
elseif(CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0)
# LLVM clang 16.0 or later
list(APPEND CKVECFLAGS -march=znver4 -falign-loops=64)
list(APPEND CKVECFLAGS -march=znver4 ${alignloops})
list(APPEND CRVECFLAGS -march=znver4)
elseif(CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0.0)
# LLVM clang 13.0 or later
list(APPEND CKVECFLAGS -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64)
list(APPEND CKVECFLAGS -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 ${alignloops})
list(APPEND CRVECFLAGS -march=znver3)
elseif(CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0.0)
# LLVM clang 9.0 or later
list(APPEND CKVECFLAGS -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64)
list(APPEND CKVECFLAGS -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 ${alignloops})
list(APPEND CRVECFLAGS -march=znver2)
else()
list(APPEND CKVECFLAGS -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -falign-loops=64)
list(APPEND CKVECFLAGS -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni ${alignloops})
list(APPEND CRVECFLAGS -march=znver1)
endif()
endif()

View File

@@ -6,7 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -200,13 +200,7 @@ extern "C" {
// -- addon definitions --
// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// TODO: Disable addon header file inclusion for windows since configure
// script is not executed, and subsequently the header file ie not generated.
#if !defined(_WIN32) && !defined(__CYGWIN__)
#include "bli_addon.h"
#endif
// -- sandbox implementation --