diff --git a/CMakeLists.txt b/CMakeLists.txt index fbbafb877d..1cbca2bb77 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,13 +32,13 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) #GPU backend -if(DEVICE_BACKEND STREQUAL "HIP") - set(DEVICE_BACKEND_HIP 1) +if(DEVICE_BACKEND STREQUAL "AMD") + set(CK_DEVICE_BACKEND_AMD 1) set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH}) find_package(HIP REQUIRED) -elseif(DEVICE_BACKEND STREQUAL "CUDA") - set(DEVICE_BACKEND_CUDA 1) +elseif(DEVICE_BACKEND STREQUAL "NVIDIA") + set(CK_DEVICE_BACKEND_NVIDIA 1) enable_language(CUDA) include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR}) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 4a7a81a36a..f9528ae211 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -1,6 +1,6 @@ -if(DEVICE_BACKEND STREQUAL "HIP") +if(DEVICE_BACKEND STREQUAL "AMD") set(DRIVER_SOURCE driver.cpp) -elseif(DEVICE_BACKEND STREQUAL "CUDA") +elseif(DEVICE_BACKEND STREQUAL "NVIDIA") set(DRIVER_SOURCE driver.cu) endif() diff --git a/include/composable_kernel/utility/config_amd.hpp.in b/include/composable_kernel/utility/config_amd.hpp.in new file mode 100644 index 0000000000..d6ac44ab24 --- /dev/null +++ b/include/composable_kernel/utility/config_amd.hpp.in @@ -0,0 +1,41 @@ +#ifndef CK_CONFIG_HPP +#define CK_CONFIG_HPP + +#cmakedefine01 CK_DEVICE_BACKEND_AMD + +#include "hip/hip_runtime.h" +#include "hip/hip_fp16.h" +#define CK_USE_AMD_INLINE_ASM 1 + +namespace ck { + +// For some reason, HIP compiler need this definition to generate optimal load and store +// instruction +typedef float float2_t __attribute__((ext_vector_type(2))); +typedef float float4_t __attribute__((ext_vector_type(4))); + +using index_t = uint32_t; + +__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) +{ + d += s0 * s1; +} + +#if 0 +__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; } + +__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1) +{ + d += s0.x * s1.x; + d += s0.y * s1.y; +} + +__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1) +{ + d += s0.x * s1.x + s0.y * s1.y; +} +#endif + +} // namespace ck + +#endif diff --git a/include/composable_kernel/utility/config.hpp.in b/include/composable_kernel/utility/config_nvidia.hpp.in similarity index 70% rename from include/composable_kernel/utility/config.hpp.in rename to include/composable_kernel/utility/config_nvidia.hpp.in index 1bd7500c12..a4d4bcaae7 100644 --- a/include/composable_kernel/utility/config.hpp.in +++ b/include/composable_kernel/utility/config_nvidia.hpp.in @@ -1,37 +1,22 @@ -#ifndef CK_CONFIG_HPP -#define CK_CONFIG_HPP +#ifndef CK_CONFIG_CUDA_HPP +#define CK_CONFIG_CUDA_HPP -#cmakedefine01 DEVICE_BACKEND_HIP -#cmakedefine01 DEVICE_BACKEND_CUDA +#cmakedefine01 CK_DEVICE_BACKEND_NVIDIA -#if DEVICE_BACKEND_HIP -#include "hip/hip_runtime.h" -#include "hip/hip_fp16.h" -#define CK_USE_AMD_INLINE_ASM 1 - -#elif DEVICE_BACKEND_CUDA #include "cuda_runtime.h" #include "cuda_fp16.h" #include "nvToolsExt.h" #include "helper_cuda.h" #define CK_USE_AMD_INLINE_ASM 0 -#endif namespace ck { -#if DEVICE_BACKEND_HIP -// For some reason, HIP compiler need this definition to generate optimal load and store -// instruction -typedef float float2_t __attribute__((ext_vector_type(2))); -typedef float float4_t __attribute__((ext_vector_type(4))); -#else // For some reason, CUDA need this definition, otherwise // compiler won't generate optimal load and store instruction, and // kernel would produce wrong result, indicating the compiler fail to generate correct // instruction, using float2_t = float2; using float4_t = float4; -#endif using index_t = uint32_t; @@ -60,7 +45,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s // need to make a better interface __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1) { -#if DEVICE_BACKEND_CUDA +#if CK_DEVICE_BACKEND_NVIDIA d = __dp4a(s0, s1, d); #endif } diff --git a/include/device.hpp b/include/device.hpp index 2c982a4012..59ab0d0bcd 100644 --- a/include/device.hpp +++ b/include/device.hpp @@ -37,7 +37,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt { KernelTimer timer; -#if DEVICE_BACKEND_HIP +#if CK_DEVICE_BACKEND_AMD timer.Start(); hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...); @@ -45,7 +45,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt timer.End(); hipGetErrorString(hipGetLastError()); -#elif DEVICE_BACKEND_CUDA +#elif CK_DEVICE_BACKEND_NVIDIA const void* f = reinterpret_cast(kernel); void* p_args[] = {&args...}; diff --git a/script/cmake-cuda.sh b/script/cmake-cuda.sh index 0e7d00c469..7feb67e213 100755 --- a/script/cmake-cuda.sh +++ b/script/cmake-cuda.sh @@ -12,7 +12,7 @@ cmake -D CMAKE_CXX_COMPILER=clang++ \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ --D DEVICE_BACKEND=CUDA \ +-D DEVICE_BACKEND=NVIDIA \ -D BOOST_ROOT="/package/install/boost_1.67.0" \ -D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc" \ -D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \ diff --git a/script/cmake-hip.sh b/script/cmake-hip.sh index 3880c67a17..959582ffcf 100755 --- a/script/cmake-hip.sh +++ b/script/cmake-hip.sh @@ -4,16 +4,16 @@ rm -f CMakeCache.txt rm -f *.cmake rm -rf CMakeFiles -MY_PROJECT_SOURCE=/home/chao/code/modular_convolution +MY_PROJECT_SOURCE=../../../ MY_PROJECT_INSTALL=../install.dir cmake \ -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ -D CMAKE_BUILD_TYPE=Release \ --D DEVICE_BACKEND="HIP" \ --D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only" \ --D CMAKE_CXX_FLAGS="-gline-tables-only" \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH="/opt/rocm;/home/package/build/mlopen_dep" \ +-D DEVICE_BACKEND="AMD" \ +-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only -v" \ +-D CMAKE_CXX_FLAGS="-gline-tables-only --amdgpu-target=gfx906" \ +-D CMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc \ +-D CMAKE_PREFIX_PATH="/opt/rocm" \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ ${MY_PROJECT_SOURCE} diff --git a/script/trace.sh b/script/trace.sh new file mode 100755 index 0000000000..231a69de08 --- /dev/null +++ b/script/trace.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +/root/workspace/rocprofiler_pkg/bin/rpl_run.sh --timestamp on -i /root/workspace/rocprofiler_pkg/input.xml -d ./trace ./driver/driver 0 10 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a0b63a179d..ef8a781dd5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,8 @@ -configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp") +if(DEVICE_BACKEND STREQUAL "AMD") + configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp") +elseif(DEVICE_BACKEND STREQUAL "NVIDIA") + configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp") +endif() set(TENSOR_SOURCE tensor.cpp; @@ -9,7 +13,7 @@ add_library(tensor SHARED ${TENSOR_SOURCE}) target_compile_features(tensor PUBLIC) set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON) -if(DEVICE_BACKEND STREQUAL "CUDA") +if(DEVICE_BACKEND STREQUAL "NVIDIA") target_link_libraries(tensor nvToolsExt cudart) endif() diff --git a/src/device.cpp b/src/device.cpp index ab880b33da..ca5c22e0ab 100644 --- a/src/device.cpp +++ b/src/device.cpp @@ -3,9 +3,9 @@ DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) { -#if DEVICE_BACKEND_HIP +#if CK_DEVICE_BACKEND_AMD hipGetErrorString(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); -#elif DEVICE_BACKEND_CUDA +#elif CK_DEVICE_BACKEND_NVIDIA checkCudaErrors(cudaMalloc(static_cast(&mpDeviceBuf), mMemSize)); #endif } @@ -14,10 +14,10 @@ void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; } void DeviceMem::ToDevice(const void* p) { -#if DEVICE_BACKEND_HIP +#if CK_DEVICE_BACKEND_AMD hipGetErrorString( hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice)); -#elif DEVICE_BACKEND_CUDA +#elif CK_DEVICE_BACKEND_NVIDIA checkCudaErrors( cudaMemcpy(mpDeviceBuf, const_cast(p), mMemSize, cudaMemcpyHostToDevice)); #endif @@ -25,18 +25,18 @@ void DeviceMem::ToDevice(const void* p) void DeviceMem::FromDevice(void* p) { -#if DEVICE_BACKEND_HIP +#if CK_DEVICE_BACKEND_AMD hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); -#elif DEVICE_BACKEND_CUDA +#elif CK_DEVICE_BACKEND_NVIDIA checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost)); #endif } DeviceMem::~DeviceMem() { -#if DEVICE_BACKEND_HIP +#if CK_DEVICE_BACKEND_AMD hipGetErrorString(hipFree(mpDeviceBuf)); -#elif DEVICE_BACKEND_CUDA +#elif CK_DEVICE_BACKEND_NVIDIA checkCudaErrors(cudaFree(mpDeviceBuf)); #endif } @@ -45,10 +45,10 @@ struct KernelTimerImpl { KernelTimerImpl() { -#if DEVICE_BACKEND_HIP +#if CK_DEVICE_BACKEND_AMD hipEventCreate(&mStart); hipEventCreate(&mEnd); -#elif DEVICE_BACKEND_CUDA +#elif CK_DEVICE_BACKEND_NVIDIA cudaEventCreate(&mStart); cudaEventCreate(&mEnd); #endif @@ -56,10 +56,10 @@ struct KernelTimerImpl ~KernelTimerImpl() { -#if DEVICE_BACKEND_HIP +#if CK_DEVICE_BACKEND_AMD hipEventDestroy(mStart); hipEventDestroy(mEnd); -#elif DEVICE_BACKEND_CUDA +#elif CK_DEVICE_BACKEND_NVIDIA cudaEventDestroy(mStart); cudaEventDestroy(mEnd); #endif @@ -67,19 +67,19 @@ struct KernelTimerImpl void Start() { -#if DEVICE_BACKEND_HIP +#if CK_DEVICE_BACKEND_AMD hipEventRecord(mStart, 0); -#elif DEVICE_BACKEND_CUDA +#elif CK_DEVICE_BACKEND_NVIDIA cudaEventRecord(mStart, 0); #endif } void End() { -#if DEVICE_BACKEND_HIP +#if CK_DEVICE_BACKEND_AMD hipEventRecord(mEnd, 0); hipEventSynchronize(mEnd); -#elif DEVICE_BACKEND_CUDA +#elif CK_DEVICE_BACKEND_NVIDIA cudaEventRecord(mEnd, 0); cudaEventSynchronize(mEnd); #endif @@ -88,17 +88,17 @@ struct KernelTimerImpl float GetElapsedTime() const { float time; -#if DEVICE_BACKEND_HIP +#if CK_DEVICE_BACKEND_AMD hipEventElapsedTime(&time, mStart, mEnd); -#elif DEVICE_BACKEND_CUDA +#elif CK_DEVICE_BACKEND_NVIDIA cudaEventElapsedTime(&time, mStart, mEnd); #endif return time; } -#if DEVICE_BACKEND_HIP +#if CK_DEVICE_BACKEND_AMD hipEvent_t mStart, mEnd; -#elif DEVICE_BACKEND_CUDA +#elif CK_DEVICE_BACKEND_NVIDIA cudaEvent_t mStart, mEnd; #endif };