mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
@@ -32,13 +32,13 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
|
||||
endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
|
||||
|
||||
#GPU backend
|
||||
if(DEVICE_BACKEND STREQUAL "HIP")
|
||||
set(DEVICE_BACKEND_HIP 1)
|
||||
if(DEVICE_BACKEND STREQUAL "AMD")
|
||||
set(CK_DEVICE_BACKEND_AMD 1)
|
||||
|
||||
set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH})
|
||||
find_package(HIP REQUIRED)
|
||||
elseif(DEVICE_BACKEND STREQUAL "CUDA")
|
||||
set(DEVICE_BACKEND_CUDA 1)
|
||||
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
|
||||
set(CK_DEVICE_BACKEND_NVIDIA 1)
|
||||
|
||||
enable_language(CUDA)
|
||||
include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
if(DEVICE_BACKEND STREQUAL "HIP")
|
||||
if(DEVICE_BACKEND STREQUAL "AMD")
|
||||
set(DRIVER_SOURCE driver.cpp)
|
||||
elseif(DEVICE_BACKEND STREQUAL "CUDA")
|
||||
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
|
||||
set(DRIVER_SOURCE driver.cu)
|
||||
endif()
|
||||
|
||||
|
||||
41
include/composable_kernel/utility/config_amd.hpp.in
Normal file
41
include/composable_kernel/utility/config_amd.hpp.in
Normal file
@@ -0,0 +1,41 @@
|
||||
#ifndef CK_CONFIG_HPP
|
||||
#define CK_CONFIG_HPP
|
||||
|
||||
#cmakedefine01 CK_DEVICE_BACKEND_AMD
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_fp16.h"
|
||||
#define CK_USE_AMD_INLINE_ASM 1
|
||||
|
||||
namespace ck {
|
||||
|
||||
// For some reason, HIP compiler need this definition to generate optimal load and store
|
||||
// instruction
|
||||
typedef float float2_t __attribute__((ext_vector_type(2)));
|
||||
typedef float float4_t __attribute__((ext_vector_type(4)));
|
||||
|
||||
using index_t = uint32_t;
|
||||
|
||||
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
|
||||
{
|
||||
d += s0 * s1;
|
||||
}
|
||||
|
||||
#if 0
|
||||
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
|
||||
|
||||
__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
|
||||
{
|
||||
d += s0.x * s1.x;
|
||||
d += s0.y * s1.y;
|
||||
}
|
||||
|
||||
__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
|
||||
{
|
||||
d += s0.x * s1.x + s0.y * s1.y;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -1,37 +1,22 @@
|
||||
#ifndef CK_CONFIG_HPP
|
||||
#define CK_CONFIG_HPP
|
||||
#ifndef CK_CONFIG_CUDA_HPP
|
||||
#define CK_CONFIG_CUDA_HPP
|
||||
|
||||
#cmakedefine01 DEVICE_BACKEND_HIP
|
||||
#cmakedefine01 DEVICE_BACKEND_CUDA
|
||||
#cmakedefine01 CK_DEVICE_BACKEND_NVIDIA
|
||||
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_fp16.h"
|
||||
#define CK_USE_AMD_INLINE_ASM 1
|
||||
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#include "cuda_runtime.h"
|
||||
#include "cuda_fp16.h"
|
||||
#include "nvToolsExt.h"
|
||||
#include "helper_cuda.h"
|
||||
#define CK_USE_AMD_INLINE_ASM 0
|
||||
#endif
|
||||
|
||||
namespace ck {
|
||||
|
||||
#if DEVICE_BACKEND_HIP
|
||||
// For some reason, HIP compiler need this definition to generate optimal load and store
|
||||
// instruction
|
||||
typedef float float2_t __attribute__((ext_vector_type(2)));
|
||||
typedef float float4_t __attribute__((ext_vector_type(4)));
|
||||
#else
|
||||
// For some reason, CUDA need this definition, otherwise
|
||||
// compiler won't generate optimal load and store instruction, and
|
||||
// kernel would produce wrong result, indicating the compiler fail to generate correct
|
||||
// instruction,
|
||||
using float2_t = float2;
|
||||
using float4_t = float4;
|
||||
#endif
|
||||
|
||||
using index_t = uint32_t;
|
||||
|
||||
@@ -60,7 +45,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
|
||||
// need to make a better interface
|
||||
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
|
||||
{
|
||||
#if DEVICE_BACKEND_CUDA
|
||||
#if CK_DEVICE_BACKEND_NVIDIA
|
||||
d = __dp4a(s0, s1, d);
|
||||
#endif
|
||||
}
|
||||
@@ -37,7 +37,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
|
||||
{
|
||||
KernelTimer timer;
|
||||
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#if CK_DEVICE_BACKEND_AMD
|
||||
timer.Start();
|
||||
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...);
|
||||
@@ -45,7 +45,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
|
||||
timer.End();
|
||||
|
||||
hipGetErrorString(hipGetLastError());
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#elif CK_DEVICE_BACKEND_NVIDIA
|
||||
const void* f = reinterpret_cast<const void*>(kernel);
|
||||
void* p_args[] = {&args...};
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ cmake
|
||||
-D CMAKE_CXX_COMPILER=clang++ \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
|
||||
-D DEVICE_BACKEND=CUDA \
|
||||
-D DEVICE_BACKEND=NVIDIA \
|
||||
-D BOOST_ROOT="/package/install/boost_1.67.0" \
|
||||
-D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc" \
|
||||
-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \
|
||||
|
||||
@@ -4,16 +4,16 @@ rm -f CMakeCache.txt
|
||||
rm -f *.cmake
|
||||
rm -rf CMakeFiles
|
||||
|
||||
MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
|
||||
MY_PROJECT_SOURCE=../../../
|
||||
MY_PROJECT_INSTALL=../install.dir
|
||||
|
||||
cmake \
|
||||
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D DEVICE_BACKEND="HIP" \
|
||||
-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only" \
|
||||
-D CMAKE_CXX_FLAGS="-gline-tables-only" \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH="/opt/rocm;/home/package/build/mlopen_dep" \
|
||||
-D DEVICE_BACKEND="AMD" \
|
||||
-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only -v" \
|
||||
-D CMAKE_CXX_FLAGS="-gline-tables-only --amdgpu-target=gfx906" \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH="/opt/rocm" \
|
||||
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
|
||||
${MY_PROJECT_SOURCE}
|
||||
|
||||
3
script/trace.sh
Executable file
3
script/trace.sh
Executable file
@@ -0,0 +1,3 @@
|
||||
#!/bin/bash
|
||||
|
||||
/root/workspace/rocprofiler_pkg/bin/rpl_run.sh --timestamp on -i /root/workspace/rocprofiler_pkg/input.xml -d ./trace ./driver/driver 0 10
|
||||
@@ -1,4 +1,8 @@
|
||||
configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
|
||||
if(DEVICE_BACKEND STREQUAL "AMD")
|
||||
configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
|
||||
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
|
||||
configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
|
||||
endif()
|
||||
|
||||
set(TENSOR_SOURCE
|
||||
tensor.cpp;
|
||||
@@ -9,7 +13,7 @@ add_library(tensor SHARED ${TENSOR_SOURCE})
|
||||
target_compile_features(tensor PUBLIC)
|
||||
set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
if(DEVICE_BACKEND STREQUAL "CUDA")
|
||||
if(DEVICE_BACKEND STREQUAL "NVIDIA")
|
||||
target_link_libraries(tensor nvToolsExt cudart)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -3,9 +3,9 @@
|
||||
|
||||
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#if CK_DEVICE_BACKEND_AMD
|
||||
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#elif CK_DEVICE_BACKEND_NVIDIA
|
||||
checkCudaErrors(cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
|
||||
#endif
|
||||
}
|
||||
@@ -14,10 +14,10 @@ void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
|
||||
|
||||
void DeviceMem::ToDevice(const void* p)
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#if CK_DEVICE_BACKEND_AMD
|
||||
hipGetErrorString(
|
||||
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#elif CK_DEVICE_BACKEND_NVIDIA
|
||||
checkCudaErrors(
|
||||
cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
|
||||
#endif
|
||||
@@ -25,18 +25,18 @@ void DeviceMem::ToDevice(const void* p)
|
||||
|
||||
void DeviceMem::FromDevice(void* p)
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#if CK_DEVICE_BACKEND_AMD
|
||||
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#elif CK_DEVICE_BACKEND_NVIDIA
|
||||
checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
|
||||
#endif
|
||||
}
|
||||
|
||||
DeviceMem::~DeviceMem()
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#if CK_DEVICE_BACKEND_AMD
|
||||
hipGetErrorString(hipFree(mpDeviceBuf));
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#elif CK_DEVICE_BACKEND_NVIDIA
|
||||
checkCudaErrors(cudaFree(mpDeviceBuf));
|
||||
#endif
|
||||
}
|
||||
@@ -45,10 +45,10 @@ struct KernelTimerImpl
|
||||
{
|
||||
KernelTimerImpl()
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#if CK_DEVICE_BACKEND_AMD
|
||||
hipEventCreate(&mStart);
|
||||
hipEventCreate(&mEnd);
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#elif CK_DEVICE_BACKEND_NVIDIA
|
||||
cudaEventCreate(&mStart);
|
||||
cudaEventCreate(&mEnd);
|
||||
#endif
|
||||
@@ -56,10 +56,10 @@ struct KernelTimerImpl
|
||||
|
||||
~KernelTimerImpl()
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#if CK_DEVICE_BACKEND_AMD
|
||||
hipEventDestroy(mStart);
|
||||
hipEventDestroy(mEnd);
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#elif CK_DEVICE_BACKEND_NVIDIA
|
||||
cudaEventDestroy(mStart);
|
||||
cudaEventDestroy(mEnd);
|
||||
#endif
|
||||
@@ -67,19 +67,19 @@ struct KernelTimerImpl
|
||||
|
||||
void Start()
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#if CK_DEVICE_BACKEND_AMD
|
||||
hipEventRecord(mStart, 0);
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#elif CK_DEVICE_BACKEND_NVIDIA
|
||||
cudaEventRecord(mStart, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
void End()
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#if CK_DEVICE_BACKEND_AMD
|
||||
hipEventRecord(mEnd, 0);
|
||||
hipEventSynchronize(mEnd);
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#elif CK_DEVICE_BACKEND_NVIDIA
|
||||
cudaEventRecord(mEnd, 0);
|
||||
cudaEventSynchronize(mEnd);
|
||||
#endif
|
||||
@@ -88,17 +88,17 @@ struct KernelTimerImpl
|
||||
float GetElapsedTime() const
|
||||
{
|
||||
float time;
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#if CK_DEVICE_BACKEND_AMD
|
||||
hipEventElapsedTime(&time, mStart, mEnd);
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#elif CK_DEVICE_BACKEND_NVIDIA
|
||||
cudaEventElapsedTime(&time, mStart, mEnd);
|
||||
#endif
|
||||
return time;
|
||||
}
|
||||
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#if CK_DEVICE_BACKEND_AMD
|
||||
hipEvent_t mStart, mEnd;
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#elif CK_DEVICE_BACKEND_NVIDIA
|
||||
cudaEvent_t mStart, mEnd;
|
||||
#endif
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user