change build

[ROCm/composable_kernel commit: c82b833d8e]
2026-07-17 00:58:44 +00:00 · 2019-06-12 10:47:25 -05:00
parent 42e9613bb8
commit 11c6b2ab9a
10 changed files with 89 additions and 56 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,13 +32,13 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
 endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )

 #GPU backend
-if(DEVICE_BACKEND STREQUAL "HIP")
-    set(DEVICE_BACKEND_HIP 1)
+if(DEVICE_BACKEND STREQUAL "AMD")
+    set(CK_DEVICE_BACKEND_AMD 1)

    set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH})
    find_package(HIP REQUIRED)
-elseif(DEVICE_BACKEND STREQUAL "CUDA")
-    set(DEVICE_BACKEND_CUDA 1)
+elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
+    set(CK_DEVICE_BACKEND_NVIDIA 1)

 	enable_language(CUDA)
 	include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -1,6 +1,6 @@
-if(DEVICE_BACKEND STREQUAL "HIP")
+if(DEVICE_BACKEND STREQUAL "AMD")
    set(DRIVER_SOURCE driver.cpp)
-elseif(DEVICE_BACKEND STREQUAL "CUDA")
+elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
    set(DRIVER_SOURCE driver.cu)
 endif()

--- a/include/composable_kernel/utility/config_amd.hpp.in
+++ b/include/composable_kernel/utility/config_amd.hpp.in
@@ -0,0 +1,41 @@
+#ifndef CK_CONFIG_HPP
+#define CK_CONFIG_HPP
+
+#cmakedefine01 CK_DEVICE_BACKEND_AMD
+
+#include "hip/hip_runtime.h"
+#include "hip/hip_fp16.h"
+#define CK_USE_AMD_INLINE_ASM 1
+
+namespace ck {
+
+// For some reason, HIP compiler need this definition to generate optimal load and store
+// instruction
+typedef float float2_t __attribute__((ext_vector_type(2)));
+typedef float float4_t __attribute__((ext_vector_type(4)));
+
+using index_t = uint32_t;
+
+__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
+{
+    d += s0 * s1;
+}
+
+#if 0
+__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
+
+__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x + s0.y * s1.y;
+}
+#endif
+
+} // namespace ck
+
+#endif
--- a/include/composable_kernel/utility/config_nvidia.hpp.in
+++ b/include/composable_kernel/utility/config_nvidia.hpp.in
@@ -1,37 +1,22 @@
-#ifndef CK_CONFIG_HPP
-#define CK_CONFIG_HPP
+#ifndef CK_CONFIG_CUDA_HPP
+#define CK_CONFIG_CUDA_HPP

-#cmakedefine01 DEVICE_BACKEND_HIP
-#cmakedefine01 DEVICE_BACKEND_CUDA
+#cmakedefine01 CK_DEVICE_BACKEND_NVIDIA

-#if DEVICE_BACKEND_HIP
-#include "hip/hip_runtime.h"
-#include "hip/hip_fp16.h"
-#define CK_USE_AMD_INLINE_ASM 1
-
-#elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
 #include "cuda_fp16.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
 #define CK_USE_AMD_INLINE_ASM 0
-#endif

 namespace ck {

-#if DEVICE_BACKEND_HIP
-// For some reason, HIP compiler need this definition to generate optimal load and store
-// instruction
-typedef float float2_t __attribute__((ext_vector_type(2)));
-typedef float float4_t __attribute__((ext_vector_type(4)));
-#else
 // For some reason, CUDA need this definition, otherwise
 //   compiler won't generate optimal load and store instruction, and
 //   kernel would produce wrong result, indicating the compiler fail to generate correct
 //   instruction,
 using float2_t = float2;
 using float4_t = float4;
-#endif

 using index_t = uint32_t;

@@ -60,7 +45,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
 //  need to make a better interface
 __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
 {
-#if DEVICE_BACKEND_CUDA
+#if CK_DEVICE_BACKEND_NVIDIA
    d = __dp4a(s0, s1, d);
 #endif
 }
--- a/include/device.hpp
+++ b/include/device.hpp
@@ -37,7 +37,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
 {
    KernelTimer timer;

-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
    timer.Start();

    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...);
@@ -45,7 +45,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
    timer.End();

    hipGetErrorString(hipGetLastError());
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
    const void* f  = reinterpret_cast<const void*>(kernel);
    void* p_args[] = {&args...};

--- a/script/cmake-cuda.sh
+++ b/script/cmake-cuda.sh
@@ -12,7 +12,7 @@ cmake
 -D CMAKE_CXX_COMPILER=clang++                                                               \
 -D CMAKE_BUILD_TYPE=Release                                                                 \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                           \
-D DEVICE_BACKEND=CUDA                                                                      \
+-D DEVICE_BACKEND=NVIDIA                                                                    \
 -D BOOST_ROOT="/package/install/boost_1.67.0"                                               \
 -D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc"  \
 -D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \
--- a/script/cmake-hip.sh
+++ b/script/cmake-hip.sh
@@ -4,16 +4,16 @@ rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles

-MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
+MY_PROJECT_SOURCE=../../../
 MY_PROJECT_INSTALL=../install.dir

 cmake                                                                                       \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                               \
 -D CMAKE_BUILD_TYPE=Release                                                                 \
-D DEVICE_BACKEND="HIP"                                                                     \
-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only"                                  \
-D CMAKE_CXX_FLAGS="-gline-tables-only"                                                     \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                   \
-D CMAKE_PREFIX_PATH="/opt/rocm;/home/package/build/mlopen_dep"                             \
+-D DEVICE_BACKEND="AMD"                                                                     \
+-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only -v"                               \
+-D CMAKE_CXX_FLAGS="-gline-tables-only --amdgpu-target=gfx906"                              \
+-D CMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc                                               \
+-D CMAKE_PREFIX_PATH="/opt/rocm"                                                            \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                           \
 ${MY_PROJECT_SOURCE}
--- a/script/trace.sh
+++ b/script/trace.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+/root/workspace/rocprofiler_pkg/bin/rpl_run.sh --timestamp on -i /root/workspace/rocprofiler_pkg/input.xml -d ./trace ./driver/driver 0 10
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,4 +1,8 @@
-configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
+if(DEVICE_BACKEND STREQUAL "AMD")
+    configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
+elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
+    configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
+endif()

 set(TENSOR_SOURCE 
    tensor.cpp;
@@ -9,7 +13,7 @@ add_library(tensor SHARED ${TENSOR_SOURCE})
 target_compile_features(tensor PUBLIC)
 set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)

-if(DEVICE_BACKEND STREQUAL "CUDA")
+if(DEVICE_BACKEND STREQUAL "NVIDIA")
    target_link_libraries(tensor nvToolsExt cudart)
 endif()

--- a/src/device.cpp
+++ b/src/device.cpp
@@ -3,9 +3,9 @@

 DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
 {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
 #endif
 }
@@ -14,10 +14,10 @@ void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }

 void DeviceMem::ToDevice(const void* p)
 {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(
        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(
        cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
 #endif
@@ -25,18 +25,18 @@ void DeviceMem::ToDevice(const void* p)

 void DeviceMem::FromDevice(void* p)
 {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
 #endif
 }

 DeviceMem::~DeviceMem()
 {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipFree(mpDeviceBuf));
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaFree(mpDeviceBuf));
 #endif
 }
@@ -45,10 +45,10 @@ struct KernelTimerImpl
 {
    KernelTimerImpl()
    {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
        hipEventCreate(&mStart);
        hipEventCreate(&mEnd);
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventCreate(&mStart);
        cudaEventCreate(&mEnd);
 #endif
@@ -56,10 +56,10 @@ struct KernelTimerImpl

    ~KernelTimerImpl()
    {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
        hipEventDestroy(mStart);
        hipEventDestroy(mEnd);
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventDestroy(mStart);
        cudaEventDestroy(mEnd);
 #endif
@@ -67,19 +67,19 @@ struct KernelTimerImpl

    void Start()
    {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
        hipEventRecord(mStart, 0);
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventRecord(mStart, 0);
 #endif
    }

    void End()
    {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
        hipEventRecord(mEnd, 0);
        hipEventSynchronize(mEnd);
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventRecord(mEnd, 0);
        cudaEventSynchronize(mEnd);
 #endif
@@ -88,17 +88,17 @@ struct KernelTimerImpl
    float GetElapsedTime() const
    {
        float time;
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
        hipEventElapsedTime(&time, mStart, mEnd);
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventElapsedTime(&time, mStart, mEnd);
 #endif
        return time;
    }

-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
    hipEvent_t mStart, mEnd;
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
    cudaEvent_t mStart, mEnd;
 #endif
 };