diff --git a/CMakeLists.txt b/CMakeLists.txt index eeae3d0dca..cb0508fec5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,3 +200,4 @@ enable_cppcheck( add_subdirectory(host) add_subdirectory(example) add_subdirectory(profiler) +add_subdirectory(test) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 0000000000..c74349d76c --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,18 @@ +include_directories(BEFORE + include + ${PROJECT_SOURCE_DIR}/host/host_tensor/include + ${PROJECT_SOURCE_DIR}/host/device/include + ${PROJECT_SOURCE_DIR}/device_operation/include + ${PROJECT_SOURCE_DIR}/composable_kernel/include + ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility + ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description + ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation + ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform + ${PROJECT_SOURCE_DIR}/external/rocm/include +) + +set(MAGIC_NUMBER_DIVISISON_SOURCE magic_number_division/main.cpp) + +add_executable(test_magic_number_division ${MAGIC_NUMBER_DIVISISON_SOURCE}) + +target_link_libraries(test_magic_number_division PRIVATE host_tensor) diff --git a/test/magic_number_division/main.cpp b/test/magic_number_division/main.cpp new file mode 100644 index 0000000000..7533feaa71 --- /dev/null +++ b/test/magic_number_division/main.cpp @@ -0,0 +1,143 @@ +#include +#include +#include +#include +#include +#include +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "device_tensor.hpp" + +__global__ void gpu_magic_number_division(uint32_t magic_multiplier, + uint32_t magic_shift, + const int32_t* p_dividend, + int32_t* p_result, + uint64_t num) +{ + uint64_t global_thread_num = blockDim.x * gridDim.x; + + uint64_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + for(uint64_t data_id = global_thread_id; data_id < num; data_id += global_thread_num) + { + p_result[data_id] = + ck::MagicDivision::DoMagicDivision(p_dividend[data_id], magic_multiplier, magic_shift); + } +} + +__global__ void +gpu_naive_division(int32_t divisor, const int32_t* p_dividend, int32_t* p_result, uint64_t num) +{ + uint64_t global_thread_num = blockDim.x * gridDim.x; + + uint64_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + for(uint64_t data_id = global_thread_id; data_id < num; data_id += global_thread_num) + { + p_result[data_id] = p_dividend[data_id] / divisor; + } +} + +template +T check_error(const std::vector& ref, const std::vector& result) +{ + T error = 0; + T max_diff = 0; + T ref_value = 0, result_value = 0; + + for(std::size_t i = 0; i < ref.size(); ++i) + { + T diff = std::abs(ref[i] - result[i]); + error += diff; + + if(max_diff < diff) + { + max_diff = diff; + ref_value = ref[i]; + result_value = result[i]; + } + } + + return max_diff; +} + +int main(int, char*[]) +{ + uint64_t num_divisor = 4096; + uint64_t num_dividend = 1L << 16; + + std::vector divisors_host(num_divisor); + std::vector dividends_host(num_dividend); + + // generate divisor + for(uint64_t i = 0; i < num_divisor; ++i) + { + divisors_host[i] = i + 1; + } + + // generate dividend + for(uint64_t i = 0; i < num_divisor; ++i) + { + dividends_host[i] = i; + } + + DeviceMem dividends_dev_buf(sizeof(int32_t) * num_dividend); + DeviceMem naive_result_dev_buf(sizeof(int32_t) * num_dividend); + DeviceMem magic_result_dev_buf(sizeof(int32_t) * num_dividend); + + std::vector naive_result_host(num_dividend); + std::vector magic_result_host(num_dividend); + + dividends_dev_buf.ToDevice(dividends_host.data()); + + bool pass = true; + + for(std::size_t i = 0; i < num_divisor; ++i) + { + // run naive division on GPU + gpu_naive_division<<<1024, 256>>>( + divisors_host[i], + static_cast(dividends_dev_buf.GetDeviceBuffer()), + static_cast(naive_result_dev_buf.GetDeviceBuffer()), + num_dividend); + + // calculate magic number + uint32_t magic_multiplier, magic_shift; + + ck::tie(magic_multiplier, magic_shift) = + ck::MagicDivision::CalculateMagicNumbers(divisors_host[i]); + + // run magic division on GPU + gpu_magic_number_division<<<1024, 256>>>( + magic_multiplier, + magic_shift, + static_cast(dividends_dev_buf.GetDeviceBuffer()), + static_cast(magic_result_dev_buf.GetDeviceBuffer()), + num_dividend); + + naive_result_dev_buf.FromDevice(naive_result_host.data()); + magic_result_dev_buf.FromDevice(magic_result_host.data()); + + int32_t max_diff = check_error(naive_result_host, magic_result_host); + + if(max_diff != 0) + { + pass = false; + continue; + } + } + + if(pass) + { + std::cout << "test magic number division: Pass" << std::endl; + } + else + { + std::cout << "test magic number division: Fail" << std::endl; + } + + return 1; +}