mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-05 20:55:59 +00:00
Add host API (#220)
* Add host API * manually rebase on develop * clean * manually rebase on develop * exclude tests from all target * address review comments * update client app name * fix missing lib name * clang-format update * refactor * refactor * refactor * refactor * refactor * fix test issue * refactor * refactor * refactor * upate cmake and readme Co-authored-by: Chao Liu <chao.liu2@amd.com>
This commit is contained in:
@@ -27,6 +27,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
|
||||
|
||||
option(CK_TIME_KERNEL "Turning off will disable kernel timing globally" ON)
|
||||
|
||||
## OpenMP
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
# workaround issue hipcc in rocm3.5 cannot find openmp
|
||||
@@ -72,8 +74,9 @@ message(STATUS "Build with HIP ${HIP_VERSION}")
|
||||
|
||||
|
||||
rocm_create_package(
|
||||
NAME CK-${CK_BACKEND}
|
||||
NAME composablekernel
|
||||
DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
|
||||
MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
|
||||
LDCONFIG
|
||||
)
|
||||
|
||||
@@ -226,7 +229,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
|
||||
|
||||
configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp")
|
||||
configure_file("${PROJECT_SOURCE_DIR}/include/ck/options.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/options.hpp")
|
||||
|
||||
include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/include
|
||||
@@ -234,7 +237,6 @@ include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/library/include
|
||||
)
|
||||
|
||||
include(googletest)
|
||||
|
||||
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
|
||||
if(BUILD_DEV)
|
||||
@@ -247,3 +249,25 @@ add_subdirectory(library)
|
||||
add_subdirectory(example)
|
||||
add_subdirectory(test)
|
||||
add_subdirectory(profiler)
|
||||
|
||||
#Create an interface target for the include only files and call it "composablekernels"
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
set(version 1.0.0)
|
||||
write_basic_package_version_file(
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
|
||||
VERSION "${version}"
|
||||
COMPATIBILITY AnyNewerVersion
|
||||
)
|
||||
|
||||
configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
|
||||
NO_CHECK_REQUIRED_COMPONENTS_MACRO
|
||||
)
|
||||
|
||||
install(FILES
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
|
||||
)
|
||||
|
||||
11
Config.cmake.in
Normal file
11
Config.cmake.in
Normal file
@@ -0,0 +1,11 @@
|
||||
@PACKAGE_INIT@
|
||||
|
||||
set(_composable_kernel_supported_components device_operations host_tensor)
|
||||
|
||||
foreach(_comp ${composable_kernel_FIND_COMPONENTS})
|
||||
if(NOT _comp IN_LIST _composable_kernel_supported_components)
|
||||
set(composable_kernel_FOUND False)
|
||||
set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}")
|
||||
endif()
|
||||
include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake")
|
||||
endforeach()
|
||||
15
Dockerfile
15
Dockerfile
@@ -11,13 +11,7 @@ ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
|
||||
RUN apt-get update
|
||||
RUN apt-get install -y wget gnupg
|
||||
RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
|
||||
RUN if ! [ -z $OSDB_BKC_VERSION ]; then \
|
||||
echo "Using BKC VERISION: $OSDB_BKC_VERSION";\
|
||||
sh -c "echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-osdb-deb/ compute-rocm-dkms-no-npi-hipclang ${OSDB_BKC_VERSION} > /etc/apt/sources.list.d/rocm.list" ;\
|
||||
cat /etc/apt/sources.list.d/rocm.list;\
|
||||
else \
|
||||
sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" ;\
|
||||
fi
|
||||
RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
|
||||
RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
|
||||
RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
|
||||
|
||||
@@ -25,18 +19,15 @@ RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/ap
|
||||
# Install dependencies
|
||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
|
||||
apt-utils \
|
||||
sshpass \
|
||||
build-essential \
|
||||
cmake-data=3.15.1-0kitware1 \
|
||||
cmake=3.15.1-0kitware1 \
|
||||
curl \
|
||||
doxygen \
|
||||
g++ \
|
||||
gdb \
|
||||
git \
|
||||
hip-rocclr \
|
||||
jq \
|
||||
lcov \
|
||||
libelf-dev \
|
||||
libncurses5-dev \
|
||||
libnuma-dev \
|
||||
@@ -62,8 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# RUN pip3 install --default-timeout=100000 -r requirements.txt
|
||||
|
||||
# Setup ubsan environment to printstacktrace
|
||||
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
|
||||
ENV UBSAN_OPTIONS=print_stacktrace=1
|
||||
@@ -92,5 +81,3 @@ ADD rbuild.ini /rbuild.ini
|
||||
ADD dev-requirements.txt dev-requirements.txt
|
||||
RUN rbuild prepare -s develop -d $PREFIX
|
||||
RUN groupadd -f render
|
||||
# RUN cget install -f min-requirements.txt
|
||||
# RUN CXXFLAGS='-isystem $PREFIX/include' cget install -f ./mlir-requirements.txt
|
||||
|
||||
19
Jenkinsfile
vendored
19
Jenkinsfile
vendored
@@ -320,7 +320,7 @@ pipeline {
|
||||
{
|
||||
agent{ label rocmnode("gfx908")}
|
||||
environment{
|
||||
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
|
||||
setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
|
||||
}
|
||||
steps{
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
|
||||
@@ -341,6 +341,23 @@ pipeline {
|
||||
|
||||
}
|
||||
}
|
||||
stage("Client App")
|
||||
{
|
||||
parallel
|
||||
{
|
||||
stage("Run Client App")
|
||||
{
|
||||
agent{ label rocmnode("gfx908")}
|
||||
environment{
|
||||
setup_args = """ -D -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " """
|
||||
execute_args = """ cd ../test/client_app && rm -rf build && mkdir build && cd build && cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" .. && make """
|
||||
}
|
||||
steps{
|
||||
buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
stage("Performance Tests")
|
||||
{
|
||||
parallel
|
||||
|
||||
10
README.md
10
README.md
@@ -43,3 +43,13 @@ Instructions for running each individual examples are under ```example/```
|
||||
make -j ckProfiler
|
||||
```
|
||||
Instructions for running ckProfiler are under ```profiler/```
|
||||
|
||||
|
||||
## Caveat
|
||||
### Kernel Timing and Verification
|
||||
CK's own kernel timer will warn up kernel once, and then run it multiple times
|
||||
to get average kernel time. For some kernels that use atomic add, this will cause
|
||||
output buffer to be accumulated multiple times, causing verfication failure.
|
||||
To work around it, do not use CK's own timer and do verification at the same time.
|
||||
CK's own timer and verification in each example and ckProfiler can be enabled or
|
||||
disabled from command line.
|
||||
|
||||
@@ -19,6 +19,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
|
||||
-Wno-zero-as-null-pointer-constant
|
||||
-Wno-unused-member-function
|
||||
-Wno-comma
|
||||
-Wno-old-style-cast
|
||||
)
|
||||
message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
|
||||
|
||||
@@ -35,4 +36,4 @@ FetchContent_MakeAvailable(googletest)
|
||||
target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
|
||||
target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
|
||||
target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
|
||||
|
||||
target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
|
||||
|
||||
@@ -88,9 +88,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
@@ -105,13 +105,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 10)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
@@ -125,7 +125,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
|
||||
exit(0);
|
||||
}
|
||||
@@ -198,7 +198,7 @@ int main(int argc, char* argv[])
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
|
||||
@@ -56,9 +56,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
@@ -73,13 +73,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 10)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
@@ -93,7 +93,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
|
||||
exit(0);
|
||||
}
|
||||
@@ -171,7 +171,7 @@ int main(int argc, char* argv[])
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
|
||||
@@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 10)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
@@ -120,7 +120,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
|
||||
exit(0);
|
||||
}
|
||||
@@ -194,7 +194,7 @@ int main(int argc, char* argv[])
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
|
||||
@@ -86,9 +86,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBias2D<AD
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
@@ -106,13 +106,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 6)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
alpha = std::stof(argv[4]);
|
||||
beta = std::stof(argv[5]);
|
||||
@@ -121,7 +121,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
@@ -138,7 +138,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, alpha, beta\n");
|
||||
exit(0);
|
||||
}
|
||||
@@ -216,7 +216,7 @@ int main(int argc, char* argv[])
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
|
||||
@@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActiv
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 10)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
@@ -120,7 +120,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
|
||||
exit(0);
|
||||
}
|
||||
@@ -206,7 +206,7 @@ int main(int argc, char* argv[])
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
|
||||
|
||||
@@ -83,9 +83,9 @@ using ReferenceGemmInstance =
|
||||
CElementOp>;
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
@@ -101,13 +101,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 11)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
@@ -122,7 +122,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, StrideC1\n");
|
||||
exit(0);
|
||||
}
|
||||
@@ -218,7 +218,7 @@ int main(int argc, char* argv[])
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
|
||||
|
||||
@@ -93,7 +93,7 @@ void PrintUseMsg()
|
||||
{
|
||||
std::cout << "arg1: verification (0=no, 1=yes)\n"
|
||||
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
|
||||
<< "arg3: run kernel # of times (>1)\n"
|
||||
<< "arg3: time kernel (0=n0, 1=yes)\n"
|
||||
<< "Following arguments:\n"
|
||||
<< " N, K, C, \n"
|
||||
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
|
||||
@@ -165,9 +165,9 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck::utils::conv;
|
||||
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
const int num_dim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParams params;
|
||||
@@ -176,7 +176,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
|
||||
if(argc >= 5)
|
||||
@@ -269,7 +269,7 @@ int main(int argc, char* argv[])
|
||||
"not support this problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = get_flops(
|
||||
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
|
||||
|
||||
@@ -90,7 +90,7 @@ void PrintUseMsg()
|
||||
{
|
||||
std::cout << "arg1: verification (0=no, 1=yes)\n"
|
||||
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
|
||||
<< "arg3: run kernel # of times (>1)\n"
|
||||
<< "arg3: time kernel (0=n0, 1=yes)\n"
|
||||
<< "Following arguments:\n"
|
||||
<< " N, K, C, \n"
|
||||
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
|
||||
@@ -162,9 +162,9 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck::utils::conv;
|
||||
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
const int num_dim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParams params;
|
||||
@@ -173,7 +173,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
|
||||
if(argc >= 5)
|
||||
@@ -280,7 +280,7 @@ int main(int argc, char* argv[])
|
||||
"not support this problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = get_flops(
|
||||
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
|
||||
|
||||
@@ -107,7 +107,7 @@ void print_use_msg()
|
||||
{
|
||||
std::cout << "arg1: verification (0=no, 1=yes)\n"
|
||||
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
|
||||
<< "arg3: run kernel # of times (>1)\n"
|
||||
<< "arg3: time kernel (0=n0, 1=yes)\n"
|
||||
<< "arg4: N spatial dimensions (default 2)\n"
|
||||
<< "Following arguments (depending on number of spatial dims):\n"
|
||||
<< " N, K, C, \n"
|
||||
@@ -179,9 +179,9 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck::utils::conv;
|
||||
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
int num_dim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParams params;
|
||||
@@ -190,7 +190,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
num_dim_spatial = std::stoi(argv[4]);
|
||||
}
|
||||
|
||||
@@ -276,7 +276,7 @@ int main(int argc, char* argv[])
|
||||
"not support this Conv problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker->Run(argument.get(), nrepeat);
|
||||
float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = get_flops(
|
||||
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
|
||||
|
||||
@@ -110,7 +110,7 @@ void print_use_msg()
|
||||
{
|
||||
std::cout << "arg1: verification (0=no, 1=yes)\n"
|
||||
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
|
||||
<< "arg3: run kernel # of times (>1)\n"
|
||||
<< "arg3: time kernel (0=n0, 1=yes)\n"
|
||||
<< "arg4: N spatial dimensions (default 2)\n"
|
||||
<< "Following arguments (depending on number of spatial dims):\n"
|
||||
<< " N, K, C, \n"
|
||||
@@ -182,9 +182,9 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck::utils::conv;
|
||||
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
int num_dim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParams params;
|
||||
@@ -193,7 +193,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
num_dim_spatial = std::stoi(argv[4]);
|
||||
}
|
||||
|
||||
@@ -277,7 +277,7 @@ int main(int argc, char* argv[])
|
||||
"not support this Conv problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker->Run(argument.get(), nrepeat);
|
||||
float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = get_flops(
|
||||
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
|
||||
|
||||
@@ -112,7 +112,7 @@ void print_use_msg()
|
||||
{
|
||||
std::cout << "arg1: verification (0=no, 1=yes)\n"
|
||||
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
|
||||
<< "arg3: run kernel # of times (>1)\n"
|
||||
<< "arg3: time kernel (0=n0, 1=yes)\n"
|
||||
<< "arg4: N spatial dimensions (default 2)\n"
|
||||
<< "Following arguments (depending on number of spatial dims):\n"
|
||||
<< " N, K, C, \n"
|
||||
@@ -184,9 +184,9 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck::utils::conv;
|
||||
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
int num_dim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParams params;
|
||||
@@ -195,7 +195,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
num_dim_spatial = std::stoi(argv[4]);
|
||||
}
|
||||
|
||||
@@ -279,7 +279,7 @@ int main(int argc, char* argv[])
|
||||
"not support this Conv problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker->Run(argument.get(), nrepeat);
|
||||
float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = get_flops(
|
||||
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
|
||||
|
||||
@@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// Conv shape
|
||||
ck::index_t N = 128;
|
||||
@@ -102,13 +102,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 19)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
N = std::stoi(argv[4]);
|
||||
K = std::stoi(argv[5]);
|
||||
@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
|
||||
"RightPx\n");
|
||||
exit(0);
|
||||
@@ -214,7 +214,7 @@ int main(int argc, char* argv[])
|
||||
"not support this Conv problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
|
||||
|
||||
@@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance =
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
int do_log = 0;
|
||||
int split_k = 4;
|
||||
|
||||
@@ -109,7 +109,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
do_log = std::stoi(argv[4]);
|
||||
split_k = std::stoi(argv[5]);
|
||||
}
|
||||
@@ -117,7 +117,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
do_log = std::stoi(argv[4]);
|
||||
split_k = std::stoi(argv[5]);
|
||||
|
||||
@@ -141,7 +141,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4: is show log (0=no, 1=yes)\n");
|
||||
printf("arg5: split-k \n");
|
||||
printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
|
||||
@@ -246,7 +246,7 @@ int main(int argc, char* argv[])
|
||||
return 1;
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
|
||||
|
||||
@@ -116,10 +116,9 @@ class SimpleAppArgs
|
||||
std::vector<size_t> inLengths;
|
||||
std::vector<float> scales;
|
||||
|
||||
bool do_verification = false;
|
||||
|
||||
int init_method = 1;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
public:
|
||||
void show_usage(const char* cmd)
|
||||
@@ -135,7 +134,7 @@ class SimpleAppArgs
|
||||
std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
|
||||
"value, 3=decimal value)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl;
|
||||
std::cout << "Arg2 -- time kernel (0=n0, 1=yes)" << std::endl;
|
||||
};
|
||||
|
||||
int processArgs(int argc, char* argv[])
|
||||
@@ -182,7 +181,7 @@ class SimpleAppArgs
|
||||
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
|
||||
|
||||
init_method = std::atoi(argv[optind++]);
|
||||
nrepeat = std::atoi(argv[optind]);
|
||||
time_kernel = std::atoi(argv[optind]);
|
||||
|
||||
if(scales.empty())
|
||||
{
|
||||
@@ -352,7 +351,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
auto invoker_ptr = reduce.MakeInvokerPointer();
|
||||
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat);
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel});
|
||||
|
||||
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
|
||||
invariant_total_length * sizeof(OutDataType);
|
||||
|
||||
@@ -149,9 +149,9 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck::host_reduce;
|
||||
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// Pool shape
|
||||
ck::index_t N = 128;
|
||||
@@ -171,13 +171,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 16)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
N = std::stoi(argv[4]);
|
||||
C = std::stoi(argv[5]);
|
||||
@@ -196,7 +196,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
|
||||
"RightPx\n");
|
||||
exit(0);
|
||||
@@ -271,7 +271,7 @@ int main(int argc, char* argv[])
|
||||
"not support this problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;
|
||||
|
||||
|
||||
@@ -105,9 +105,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
@@ -125,13 +125,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 10)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
@@ -145,7 +145,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
|
||||
exit(0);
|
||||
}
|
||||
@@ -219,7 +219,7 @@ int main(int argc, char* argv[])
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
|
||||
@@ -60,21 +60,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
if(argc == 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
@@ -202,7 +202,7 @@ int main(int argc, char* argv[])
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, nrepeat);
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
|
||||
@@ -58,9 +58,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 1;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
int nrepeat = 5;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
@@ -79,13 +79,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 10)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
@@ -99,7 +99,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
|
||||
exit(0);
|
||||
}
|
||||
@@ -192,30 +192,13 @@ int main(int argc, char* argv[])
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
// warm up
|
||||
invoker.Run(argument);
|
||||
// init DO, D1 to 0
|
||||
d0_device_buf.SetZero();
|
||||
d1_device_buf.SetZero();
|
||||
|
||||
// timing
|
||||
float total_time = 0;
|
||||
|
||||
for(int i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
// init DO, D1 to 0
|
||||
d0_device_buf.SetZero();
|
||||
d1_device_buf.SetZero();
|
||||
|
||||
KernelTimer timer;
|
||||
|
||||
timer.Start();
|
||||
|
||||
invoker.Run(argument);
|
||||
|
||||
timer.End();
|
||||
|
||||
total_time += timer.GetElapsedTime();
|
||||
}
|
||||
|
||||
float ave_time = total_time / nrepeat;
|
||||
// if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result
|
||||
// will not be correct. need to set time_kernel = false for correctness test
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
|
||||
@@ -87,7 +87,7 @@ void print_use_msg()
|
||||
{
|
||||
std::cout << "arg1: verification (0=no, 1=yes)\n"
|
||||
<< "arg2: initialization (0=no init, 1=random value, 2= init to 1 )\n"
|
||||
<< "arg3: run kernel # of times (>1)\n"
|
||||
<< "arg3: time kernel (0=n0, 1=yes)\n"
|
||||
<< "arg4: N spatial dimensions (default 2)\n"
|
||||
<< "Following arguments (depending on number of spatial dims):\n"
|
||||
<< " N, K, C, \n"
|
||||
@@ -165,9 +165,9 @@ DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial)
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 0;
|
||||
int init_method = 0;
|
||||
int nrepeat = 5;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
int num_dim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParams params;
|
||||
@@ -177,13 +177,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc > 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
num_dim_spatial = std::stoi(argv[4]);
|
||||
// check args number
|
||||
int conv_args = 3 + num_dim_spatial * 6;
|
||||
@@ -284,7 +284,7 @@ int main(int argc, char* argv[])
|
||||
"not support this Conv problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker->Run(argument.get(), nrepeat);
|
||||
float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = ck::utils::conv::get_flops(
|
||||
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
|
||||
|
||||
@@ -57,9 +57,9 @@ using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = 1;
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
int nrepeat = 5;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
@@ -80,13 +80,13 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 11)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
nrepeat = std::stoi(argv[3]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
@@ -102,7 +102,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: run kernel # of times (>1)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, BatchCount\n");
|
||||
exit(0);
|
||||
}
|
||||
@@ -204,30 +204,13 @@ int main(int argc, char* argv[])
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
// warm up
|
||||
invoker.Run(argument);
|
||||
// init DO, D1 to 0
|
||||
d0_device_buf.SetZero();
|
||||
d1_device_buf.SetZero();
|
||||
|
||||
// timing
|
||||
float total_time = 0;
|
||||
|
||||
for(int i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
// init DO, D1 to 0
|
||||
d0_device_buf.SetZero();
|
||||
d1_device_buf.SetZero();
|
||||
|
||||
KernelTimer timer;
|
||||
|
||||
timer.Start();
|
||||
|
||||
invoker.Run(argument);
|
||||
|
||||
timer.End();
|
||||
|
||||
total_time += timer.GetElapsedTime();
|
||||
}
|
||||
|
||||
float ave_time = total_time / nrepeat;
|
||||
// if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result
|
||||
// will not be correct. need to set time_kernel = false for correctness test
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
|
||||
std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K +
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
// "_PACKAGE_" to avoid name contentions: the macros like
|
||||
// HIP_VERSION_MAJOR are defined in HIP_VERSION.h.
|
||||
// clang-format off
|
||||
#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@
|
||||
#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@
|
||||
#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@
|
||||
// clang-format on
|
||||
|
||||
#ifndef CK_HIP_PACKAGE_VERSION_MAJOR
|
||||
#define CK_HIP_PACKAGE_VERSION_MAJOR 0
|
||||
#endif
|
||||
#ifndef CK_HIP_PACKAGE_VERSION_MINOR
|
||||
#define CK_HIP_PACKAGE_VERSION_MINOR 0
|
||||
#endif
|
||||
#ifndef CK_HIP_PACKAGE_VERSION_PATCH
|
||||
#define CK_HIP_PACKAGE_VERSION_PATCH 0
|
||||
#endif
|
||||
// 3 decimal digits for major and minor, 6 digits for patch number.
|
||||
// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math.
|
||||
#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MAJOR > 999 || \
|
||||
CK_HIP_PACKAGE_VERSION_PATCH > 999999
|
||||
#error "Too big HIP version number(s)"
|
||||
#endif
|
||||
#define CK_HIP_PACKAGE_VERSION_FLAT \
|
||||
((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \
|
||||
CK_HIP_PACKAGE_VERSION_PATCH)
|
||||
3
include/ck/options.hpp.in
Normal file
3
include/ck/options.hpp.in
Normal file
@@ -0,0 +1,3 @@
|
||||
#pragma once
|
||||
|
||||
#cmakedefine01 CK_TIME_KERNEL
|
||||
10
include/ck/stream_config.hpp
Normal file
10
include/ck/stream_config.hpp
Normal file
@@ -0,0 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
struct StreamConfig
|
||||
{
|
||||
hipStream_t stream_id_ = nullptr;
|
||||
bool time_kernel_ = false;
|
||||
};
|
||||
@@ -1,8 +1,9 @@
|
||||
#ifndef DEVICE_BASE_HPP
|
||||
#define DEVICE_BASE_HPP
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "stream_config.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
@@ -22,7 +23,10 @@ struct BaseInvoker
|
||||
BaseInvoker(const BaseInvoker&) = default;
|
||||
BaseInvoker& operator=(const BaseInvoker&) = default;
|
||||
|
||||
virtual float Run(const BaseArgument*, int = 1) = 0;
|
||||
virtual float Run(const BaseArgument*, const StreamConfig& = StreamConfig{})
|
||||
{
|
||||
return float{0};
|
||||
}
|
||||
|
||||
virtual ~BaseInvoker() {}
|
||||
};
|
||||
@@ -33,8 +37,8 @@ struct BaseOperator
|
||||
BaseOperator(const BaseOperator&) = default;
|
||||
BaseOperator& operator=(const BaseOperator&) = default;
|
||||
|
||||
virtual bool IsSupportedArgument(const BaseArgument*) = 0;
|
||||
virtual std::string GetTypeString() const = 0;
|
||||
virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
|
||||
virtual std::string GetTypeString() const { return ""; }
|
||||
|
||||
virtual ~BaseOperator() {}
|
||||
};
|
||||
@@ -42,4 +46,3 @@ struct BaseOperator
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -693,7 +693,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int /* nrepeat */ = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
@@ -729,6 +729,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
|
||||
const auto K =
|
||||
arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
|
||||
|
||||
float elapsed_time = 0.0f;
|
||||
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
|
||||
{
|
||||
const auto kernel = kernel_batched_gemm_reduce_xdl_cshuffle_v1<
|
||||
@@ -748,26 +749,28 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
|
||||
remove_reference_t<Block2CTileMap>,
|
||||
true>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.p_d0_grid_,
|
||||
arg.p_d1_grid_,
|
||||
arg.BatchCount_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.d1_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.d_grid_desc_mblock_mperblock_,
|
||||
arg.compute_base_ptr_of_batch_,
|
||||
arg.block_2_ctile_map_);
|
||||
elapsed_time =
|
||||
launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.p_d0_grid_,
|
||||
arg.p_d1_grid_,
|
||||
arg.BatchCount_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.d1_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.d_grid_desc_mblock_mperblock_,
|
||||
arg.compute_base_ptr_of_batch_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -788,35 +791,38 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
|
||||
remove_reference_t<Block2CTileMap>,
|
||||
false>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.p_d0_grid_,
|
||||
arg.p_d1_grid_,
|
||||
arg.BatchCount_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.d1_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.d_grid_desc_mblock_mperblock_,
|
||||
arg.compute_base_ptr_of_batch_,
|
||||
arg.block_2_ctile_map_);
|
||||
elapsed_time =
|
||||
launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.p_d0_grid_,
|
||||
arg.p_d1_grid_,
|
||||
arg.BatchCount_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.d1_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.d_grid_desc_mblock_mperblock_,
|
||||
arg.compute_base_ptr_of_batch_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
|
||||
return 0;
|
||||
return elapsed_time;
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -428,7 +428,7 @@ struct DeviceBatchedGemmXdl
|
||||
{
|
||||
using Argument = DeviceBatchedGemmXdl::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
{
|
||||
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
|
||||
@@ -477,8 +477,8 @@ struct DeviceBatchedGemmXdl
|
||||
remove_reference_t<Block2CTileMap>,
|
||||
true>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -511,8 +511,8 @@ struct DeviceBatchedGemmXdl
|
||||
remove_reference_t<Block2CTileMap>,
|
||||
false>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -534,9 +534,10 @@ struct DeviceBatchedGemmXdl
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -415,9 +415,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
|
||||
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
|
||||
}
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
ShowInfo(arg);
|
||||
|
||||
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
arg.b_grid_desc_kbatch_k0_n_k1_,
|
||||
arg.c_grid_desc_m_n_,
|
||||
@@ -437,49 +438,27 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
|
||||
float ave_time = 0;
|
||||
|
||||
const auto Run = [&](const auto& kernel) {
|
||||
if(nrepeat > 0)
|
||||
{
|
||||
ave_time =
|
||||
launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
arg.b_grid_desc_kbatch_k0_n_k1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
hipGetErrorString(hipMemset(
|
||||
arg.p_c_grid_,
|
||||
0,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
|
||||
sizeof(CDataType)));
|
||||
|
||||
if(kbatch > 1 || nrepeat <= 0)
|
||||
{
|
||||
hipGetErrorString(hipMemset(
|
||||
arg.p_c_grid_,
|
||||
0,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
|
||||
sizeof(CDataType)));
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
arg.b_grid_desc_kbatch_k0_n_k1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
arg.b_grid_desc_kbatch_k0_n_k1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.block_2_ctile_map_);
|
||||
};
|
||||
|
||||
if(has_main_k0_block_loop)
|
||||
@@ -560,9 +539,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -531,7 +531,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
float ave_time = 0;
|
||||
for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
|
||||
@@ -602,8 +602,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
true>;
|
||||
|
||||
ave_time += launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -635,8 +635,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
false>;
|
||||
|
||||
ave_time += launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -655,9 +655,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -642,7 +642,7 @@ struct
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
@@ -727,8 +727,8 @@ struct
|
||||
true>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -771,8 +771,8 @@ struct
|
||||
false>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -795,9 +795,10 @@ struct
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -605,7 +605,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
@@ -684,8 +684,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
|
||||
true>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -723,8 +723,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
|
||||
false>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -745,9 +745,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -568,7 +568,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
@@ -663,8 +663,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
|
||||
true>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -697,8 +697,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
|
||||
false>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -717,9 +717,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -450,7 +450,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
@@ -498,8 +498,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
|
||||
true>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -529,8 +529,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
|
||||
false>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -549,9 +549,10 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -92,7 +92,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
const auto naive_conv3d_fwd =
|
||||
ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
|
||||
@@ -103,8 +103,8 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>;
|
||||
|
||||
float ave_time = launch_and_time_kernel(naive_conv3d_fwd,
|
||||
nrepeat,
|
||||
float ave_time = launch_and_time_kernel(stream_config,
|
||||
naive_conv3d_fwd,
|
||||
dim3(256),
|
||||
dim3(256),
|
||||
0,
|
||||
@@ -137,9 +137,10 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -438,7 +438,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
{
|
||||
std::cout << "num_batches_of_GEMM = " << arg.num_subbatches_ << std::endl;
|
||||
@@ -487,8 +487,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
|
||||
OutElementwiseOperation,
|
||||
remove_reference_t<Block2CTileMap>,
|
||||
true>;
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -522,8 +522,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
|
||||
remove_reference_t<Block2CTileMap>,
|
||||
false>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -547,9 +547,10 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1241,7 +1241,7 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
float ave_time = 0;
|
||||
for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
|
||||
@@ -1316,8 +1316,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
|
||||
true>;
|
||||
|
||||
ave_time += launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -1349,8 +1349,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
|
||||
false>;
|
||||
|
||||
ave_time += launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -1369,9 +1369,10 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -747,7 +747,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
@@ -795,8 +795,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
|
||||
true>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -826,8 +826,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
|
||||
false>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -846,9 +846,10 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -503,7 +503,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int /* nrepeat */ = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
@@ -536,6 +536,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
|
||||
const auto K =
|
||||
arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
|
||||
|
||||
float elapsed_time = 0.0f;
|
||||
if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
|
||||
{
|
||||
const auto kernel = kernel_gemm_reduce_xdl_cshuffle_v1<
|
||||
@@ -554,24 +555,26 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
|
||||
typename GridwiseGemm::DefaultBlock2CTileMap,
|
||||
true>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.p_d0_grid_,
|
||||
arg.p_d1_grid_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.d1_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.d_grid_desc_mblock_mperblock_,
|
||||
arg.block_2_ctile_map_);
|
||||
elapsed_time =
|
||||
launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.p_d0_grid_,
|
||||
arg.p_d1_grid_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.d1_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.d_grid_desc_mblock_mperblock_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -591,33 +594,36 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
|
||||
typename GridwiseGemm::DefaultBlock2CTileMap,
|
||||
false>;
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.p_d0_grid_,
|
||||
arg.p_d1_grid_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.d1_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.d_grid_desc_mblock_mperblock_,
|
||||
arg.block_2_ctile_map_);
|
||||
elapsed_time =
|
||||
launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.p_d0_grid_,
|
||||
arg.p_d1_grid_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.d1_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.d_grid_desc_mblock_mperblock_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
|
||||
return 0;
|
||||
return elapsed_time;
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -290,7 +290,7 @@ struct DeviceGemmXdl
|
||||
{
|
||||
using Argument = DeviceGemmXdl::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
@@ -339,8 +339,8 @@ struct DeviceGemmXdl
|
||||
remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
|
||||
true>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -370,8 +370,8 @@ struct DeviceGemmXdl
|
||||
remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
|
||||
false>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -391,9 +391,10 @@ struct DeviceGemmXdl
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -264,7 +264,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
|
||||
{
|
||||
using Argument = DeviceGemmXdl_C_Shuffle_Bias_2d::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
{
|
||||
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
|
||||
@@ -320,8 +320,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
|
||||
true>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -359,8 +359,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
|
||||
false>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -382,9 +382,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -273,7 +273,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
{
|
||||
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
|
||||
@@ -329,8 +329,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
|
||||
true>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -368,8 +368,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
|
||||
false>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -391,9 +391,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -312,7 +312,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
{
|
||||
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
|
||||
@@ -374,8 +374,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
|
||||
true>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -418,8 +418,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
|
||||
false>;
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config,
|
||||
kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -443,9 +443,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -440,7 +440,7 @@ struct DeviceGemm_Xdl_CShuffle
|
||||
{
|
||||
using Argument = DeviceOp::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
@@ -487,42 +487,22 @@ struct DeviceGemm_Xdl_CShuffle
|
||||
typename GridwiseGemm::DefaultBlock2CTileMap,
|
||||
true>;
|
||||
|
||||
if(nrepeat == 0)
|
||||
{
|
||||
launch_kernel(kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
else
|
||||
{
|
||||
ave_time =
|
||||
launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
ave_time =
|
||||
launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -538,52 +518,32 @@ struct DeviceGemm_Xdl_CShuffle
|
||||
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
|
||||
typename GridwiseGemm::DefaultBlock2CTileMap,
|
||||
false>;
|
||||
|
||||
if(nrepeat == 0)
|
||||
{
|
||||
launch_kernel(kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
else
|
||||
{
|
||||
ave_time =
|
||||
launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
ave_time =
|
||||
launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.a_grid_desc_ak0_m_ak1_,
|
||||
arg.b_grid_desc_bk0_n_bk1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -385,8 +385,11 @@ struct DeviceGemmXdlSplitK
|
||||
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
|
||||
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
|
||||
}
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
ShowInfo(arg);
|
||||
|
||||
const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
|
||||
|
||||
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
@@ -408,50 +411,30 @@ struct DeviceGemmXdlSplitK
|
||||
float ave_time = 0;
|
||||
|
||||
const auto Run = [&](const auto& kernel) {
|
||||
if(nrepeat > 0)
|
||||
{
|
||||
ShowInfo(arg);
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
arg.b_grid_desc_kbatch_k0_n_k1_,
|
||||
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
// FIXME: this should be moved outside of DeviceOp
|
||||
hipGetErrorString(
|
||||
hipMemset(arg.p_c_grid_,
|
||||
0,
|
||||
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() *
|
||||
sizeof(CDataType)));
|
||||
|
||||
if(kbatch > 1 || nrepeat <= 0)
|
||||
{
|
||||
hipGetErrorString(
|
||||
hipMemset(arg.p_c_grid_,
|
||||
0,
|
||||
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() *
|
||||
sizeof(CDataType)));
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
arg.b_grid_desc_kbatch_k0_n_k1_,
|
||||
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
arg.b_grid_desc_kbatch_k0_n_k1_,
|
||||
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.block_2_ctile_map_);
|
||||
};
|
||||
|
||||
if(has_main_k0_block_loop)
|
||||
{
|
||||
if(kbatch == 1)
|
||||
@@ -531,9 +514,10 @@ struct DeviceGemmXdlSplitK
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -391,8 +391,11 @@ struct DeviceGemmXdlSplitKCShuffle
|
||||
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
|
||||
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
|
||||
}
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
ShowInfo(arg);
|
||||
|
||||
const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
|
||||
|
||||
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
@@ -414,51 +417,29 @@ struct DeviceGemmXdlSplitKCShuffle
|
||||
float ave_time = 0;
|
||||
|
||||
const auto Run = [&](const auto& kernel) {
|
||||
if(nrepeat > 0)
|
||||
{
|
||||
ShowInfo(arg);
|
||||
ave_time =
|
||||
launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
arg.b_grid_desc_kbatch_k0_n_k1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
hipGetErrorString(hipMemset(
|
||||
arg.p_c_grid_,
|
||||
0,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
|
||||
sizeof(CDataType)));
|
||||
|
||||
if(kbatch > 1 || nrepeat <= 0)
|
||||
{
|
||||
hipGetErrorString(hipMemset(
|
||||
arg.p_c_grid_,
|
||||
0,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
|
||||
sizeof(CDataType)));
|
||||
|
||||
launch_kernel(kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
arg.b_grid_desc_kbatch_k0_n_k1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.block_2_ctile_map_);
|
||||
}
|
||||
launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.p_a_grid_,
|
||||
arg.p_b_grid_,
|
||||
arg.p_c_grid_,
|
||||
arg.a_grid_desc_kbatch_k0_m_k1_,
|
||||
arg.b_grid_desc_kbatch_k0_n_k1_,
|
||||
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
|
||||
arg.a_element_op_,
|
||||
arg.b_element_op_,
|
||||
arg.c_element_op_,
|
||||
arg.block_2_ctile_map_);
|
||||
};
|
||||
|
||||
if(has_main_k0_block_loop)
|
||||
{
|
||||
if(kbatch == 1)
|
||||
@@ -542,9 +523,10 @@ struct DeviceGemmXdlSplitKCShuffle
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -449,7 +449,7 @@ struct DeviceGroupedGemmXdl
|
||||
{
|
||||
using Argument = DeviceGroupedGemmXdl::Argument;
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
StaticallyIndexedArray<GemmDescKernelArg, MaxGroupCount> gemm_desc_kernel_args;
|
||||
|
||||
@@ -510,8 +510,8 @@ struct DeviceGroupedGemmXdl
|
||||
true,
|
||||
MaxGroupCount>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.grid_size_),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -534,8 +534,8 @@ struct DeviceGroupedGemmXdl
|
||||
false,
|
||||
MaxGroupCount>;
|
||||
|
||||
ave_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
ave_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.grid_size_),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -550,9 +550,10 @@ struct DeviceGroupedGemmXdl
|
||||
}
|
||||
|
||||
// polymorphic
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -204,7 +204,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise<InDataType,
|
||||
OutDataType,
|
||||
@@ -241,8 +241,8 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
|
||||
|
||||
const index_t grid_size = (ReduceM / ReduceM_BlockTileSize);
|
||||
|
||||
return launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
return launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -257,9 +257,10 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
|
||||
arg.p_out_indices_dev_);
|
||||
}
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -211,7 +211,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
const auto in_grid_desc_m_k =
|
||||
DeviceReduceBlockWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
|
||||
@@ -253,8 +253,8 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation>;
|
||||
|
||||
avg_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
avg_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -272,9 +272,10 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
@@ -182,7 +182,7 @@ struct DeviceReduceBlockWiseSecondCall
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor(
|
||||
arg.inLengths_, arg.inStrides_);
|
||||
@@ -224,8 +224,8 @@ struct DeviceReduceBlockWiseSecondCall
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation>;
|
||||
|
||||
avg_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
avg_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -243,10 +243,11 @@ struct DeviceReduceBlockWiseSecondCall
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
};
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
|
||||
@@ -245,7 +245,7 @@ struct DeviceReduceMultiBlockAtomicAdd
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor(
|
||||
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
|
||||
@@ -275,8 +275,6 @@ struct DeviceReduceMultiBlockAtomicAdd
|
||||
|
||||
float avg_time = 0;
|
||||
|
||||
KernelTimer timer;
|
||||
|
||||
const auto kernel_pre = kernel_buffer_set_value<BlockSize, OutDataType, OutGridDesc_M>;
|
||||
const auto kernel_main = kernel_reduce_multiblock_atocmi_add<GridwiseReduce,
|
||||
InDataType,
|
||||
@@ -287,50 +285,38 @@ struct DeviceReduceMultiBlockAtomicAdd
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation>;
|
||||
|
||||
printf("launch_and_time_kernel: grid_dim {%ld, 1, 1}, block_dim {%d, 1, 1} \n",
|
||||
arg.gridSize,
|
||||
BlockSize);
|
||||
printf("Warm up\n");
|
||||
avg_time += launch_and_time_kernel(stream_config,
|
||||
kernel_pre,
|
||||
dim3(arg.gridSize_pre),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
out_grid_desc_m,
|
||||
arg.out_dev_,
|
||||
static_cast<OutDataType>(0.0f));
|
||||
|
||||
for(int i = 0; i < nrepeat + 1; i++)
|
||||
{
|
||||
if(i == 1)
|
||||
timer.Start();
|
||||
avg_time += launch_and_time_kernel(stream_config,
|
||||
kernel_main,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
in_grid_desc_m_k,
|
||||
out_grid_desc_m,
|
||||
arg.in_elementwise_op_,
|
||||
arg.acc_elementwise_op_,
|
||||
arg.blkGroupSize,
|
||||
arg.kBlockTileIterations,
|
||||
arg.alpha_,
|
||||
arg.in_dev_,
|
||||
arg.out_dev_);
|
||||
|
||||
launch_kernel(kernel_pre,
|
||||
dim3(arg.gridSize_pre),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
out_grid_desc_m,
|
||||
arg.out_dev_,
|
||||
static_cast<OutDataType>(0.0f));
|
||||
return avg_time;
|
||||
}
|
||||
|
||||
launch_kernel(kernel_main,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
in_grid_desc_m_k,
|
||||
out_grid_desc_m,
|
||||
arg.in_elementwise_op_,
|
||||
arg.acc_elementwise_op_,
|
||||
arg.blkGroupSize,
|
||||
arg.kBlockTileIterations,
|
||||
arg.alpha_,
|
||||
arg.in_dev_,
|
||||
arg.out_dev_);
|
||||
};
|
||||
|
||||
timer.End();
|
||||
|
||||
avg_time = timer.GetElapsedTime() / nrepeat;
|
||||
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
};
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
|
||||
@@ -273,7 +273,7 @@ struct DeviceReduceMultiBlockPartialReduce
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor(
|
||||
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
|
||||
@@ -313,8 +313,8 @@ struct DeviceReduceMultiBlockPartialReduce
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation>;
|
||||
|
||||
avg_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
avg_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -331,10 +331,11 @@ struct DeviceReduceMultiBlockPartialReduce
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
};
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
|
||||
@@ -212,7 +212,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
const auto in_grid_desc_m_k =
|
||||
DeviceReduceThreadWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
|
||||
@@ -254,8 +254,8 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
|
||||
InElementwiseOperation,
|
||||
OutElementwiseOperation>;
|
||||
|
||||
avg_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
avg_time = launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
@@ -272,10 +272,11 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
float Run(const BaseArgument* p_arg,
|
||||
const StreamConfig& stream_config = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
};
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
|
||||
}
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
|
||||
54
library/include/ck/library/host/host_interface.hpp
Normal file
54
library/include/ck/library/host/host_interface.hpp
Normal file
@@ -0,0 +1,54 @@
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "stream_config.hpp"
|
||||
#include "config.hpp"
|
||||
#include "device_base.hpp"
|
||||
|
||||
struct DeviceConvFwdPtr_t
|
||||
{
|
||||
using BaseArgument = ck::tensor_operation::device::BaseArgument;
|
||||
using BaseInvoker = ck::tensor_operation::device::BaseInvoker;
|
||||
|
||||
struct DeviceConvFwdPtrImpl;
|
||||
std::unique_ptr<DeviceConvFwdPtrImpl> pImpl;
|
||||
DeviceConvFwdPtr_t();
|
||||
~DeviceConvFwdPtr_t();
|
||||
DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&);
|
||||
DeviceConvFwdPtr_t(DeviceConvFwdPtrImpl&);
|
||||
DeviceConvFwdPtr_t& operator=(DeviceConvFwdPtr_t&) = delete;
|
||||
DeviceConvFwdPtr_t& operator=(const DeviceConvFwdPtr_t&) = delete;
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(void* in_ptr,
|
||||
void* wei_ptr,
|
||||
void* out_ptr,
|
||||
size_t N,
|
||||
size_t K,
|
||||
size_t C,
|
||||
std::vector<ck::index_t> input_spatial_lengths,
|
||||
std::vector<ck::index_t> filter_spatial_lengths,
|
||||
std::vector<ck::index_t> output_spatial_lengths,
|
||||
std::vector<ck::index_t> conv_filter_strides,
|
||||
std::vector<ck::index_t> conv_filter_dilations,
|
||||
std::vector<ck::index_t> input_left_pads,
|
||||
std::vector<ck::index_t> input_right_pads)
|
||||
const; // in,wei and out element ops are ignored for now since even if we change them, they
|
||||
// cant be linked
|
||||
std::unique_ptr<BaseInvoker>
|
||||
MakeInvokerPointer() const; // requires including BaseInvoker headers
|
||||
std::string GetTypeString();
|
||||
bool IsSupportedArgument(const BaseArgument* arg_ptr);
|
||||
};
|
||||
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances);
|
||||
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances);
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances);
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances);
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances);
|
||||
@@ -1,12 +1,25 @@
|
||||
#ifndef DEVICE_HPP
|
||||
#define DEVICE_HPP
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <functional>
|
||||
#include <thread>
|
||||
#include <chrono>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_fp16.h"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
#include "stream_config.hpp"
|
||||
#include "ck/options.hpp"
|
||||
|
||||
inline void hip_check_error(hipError_t x)
|
||||
{
|
||||
if(x != hipSuccess)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
|
||||
<< "in function: " << __func__;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
struct DeviceMem
|
||||
{
|
||||
@@ -36,49 +49,59 @@ struct KernelTimer
|
||||
std::unique_ptr<KernelTimerImpl> impl;
|
||||
};
|
||||
|
||||
using device_stream_t = hipStream_t;
|
||||
|
||||
template <typename... Args, typename F>
|
||||
void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
|
||||
float launch_and_time_kernel(const StreamConfig& stream_config,
|
||||
F kernel,
|
||||
dim3 grid_dim,
|
||||
dim3 block_dim,
|
||||
std::size_t lds_byte,
|
||||
Args... args)
|
||||
{
|
||||
hipStream_t stream_id = nullptr;
|
||||
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
|
||||
}
|
||||
|
||||
template <typename... Args, typename F>
|
||||
float launch_and_time_kernel(
|
||||
F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
|
||||
{
|
||||
KernelTimer timer;
|
||||
|
||||
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
|
||||
__func__,
|
||||
grid_dim.x,
|
||||
grid_dim.y,
|
||||
grid_dim.z,
|
||||
block_dim.x,
|
||||
block_dim.y,
|
||||
block_dim.z);
|
||||
|
||||
printf("Warm up\n");
|
||||
|
||||
hipStream_t stream_id = nullptr;
|
||||
|
||||
// warm up
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
|
||||
|
||||
printf("Start running %d times...\n", nrepeat);
|
||||
|
||||
timer.Start();
|
||||
|
||||
for(int i = 0; i < nrepeat; ++i)
|
||||
#if CK_TIME_KERNEL
|
||||
if(stream_config.time_kernel_)
|
||||
{
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
|
||||
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
|
||||
__func__,
|
||||
grid_dim.x,
|
||||
grid_dim.y,
|
||||
grid_dim.z,
|
||||
block_dim.x,
|
||||
block_dim.y,
|
||||
block_dim.z);
|
||||
|
||||
const int nrepeat = 10;
|
||||
|
||||
printf("Warm up 1 time\n");
|
||||
|
||||
// warm up
|
||||
hipLaunchKernelGGL(
|
||||
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
|
||||
printf("Start running %d times...\n", nrepeat);
|
||||
|
||||
KernelTimer timer;
|
||||
timer.Start();
|
||||
|
||||
for(int i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
hipLaunchKernelGGL(
|
||||
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
}
|
||||
|
||||
timer.End();
|
||||
|
||||
return timer.GetElapsedTime() / nrepeat;
|
||||
}
|
||||
else
|
||||
{
|
||||
hipLaunchKernelGGL(
|
||||
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
|
||||
timer.End();
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
|
||||
return timer.GetElapsedTime() / nrepeat;
|
||||
}
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -84,7 +84,8 @@ struct ReferenceBatchedGemm : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
float Run(const device::BaseArgument* p_arg, int) override
|
||||
float Run(const device::BaseArgument* p_arg,
|
||||
const StreamConfig& /* stream_config */ = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg));
|
||||
}
|
||||
|
||||
@@ -121,7 +121,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
float Run(const device::BaseArgument* p_arg, int) override
|
||||
float Run(const device::BaseArgument* p_arg,
|
||||
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg));
|
||||
}
|
||||
|
||||
@@ -291,7 +291,8 @@ struct ReferenceConvBwdData : public device::BaseOperator
|
||||
}
|
||||
}
|
||||
|
||||
float Run(const device::BaseArgument* p_arg, int) override
|
||||
float Run(const device::BaseArgument* p_arg,
|
||||
const StreamConfig& /* stream_config */ = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg));
|
||||
}
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#ifndef REFERENCE_CONV_FWD_HPP
|
||||
#define REFERENCE_CONV_FWD_HPP
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <type_traits>
|
||||
#include <sstream>
|
||||
|
||||
#include "stream_config.hpp"
|
||||
#include "device_base.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
|
||||
@@ -251,7 +252,8 @@ struct ReferenceConvFwd : public device::BaseOperator
|
||||
}
|
||||
}
|
||||
|
||||
float Run(const device::BaseArgument* p_arg, int) override
|
||||
float Run(const device::BaseArgument* p_arg,
|
||||
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg));
|
||||
}
|
||||
@@ -311,4 +313,3 @@ struct ReferenceConvFwd : public device::BaseOperator
|
||||
} // namespace host
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -124,7 +124,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
float Run(const device::BaseArgument* p_arg, int) override
|
||||
float Run(const device::BaseArgument* p_arg,
|
||||
const StreamConfig& /* stream_config */ = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg));
|
||||
}
|
||||
|
||||
@@ -130,7 +130,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
float Run(const device::BaseArgument* p_arg, int) override
|
||||
float Run(const device::BaseArgument* p_arg,
|
||||
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg));
|
||||
}
|
||||
|
||||
@@ -80,7 +80,8 @@ struct ReferenceGemm : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
float Run(const device::BaseArgument* p_arg, int) override
|
||||
float Run(const device::BaseArgument* p_arg,
|
||||
const StreamConfig& /* stream_config */ = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg));
|
||||
}
|
||||
|
||||
@@ -82,7 +82,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
float Run(const device::BaseArgument* p_arg, int) override
|
||||
float Run(const device::BaseArgument* p_arg,
|
||||
const StreamConfig& /* stream_config */ = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg));
|
||||
}
|
||||
|
||||
@@ -85,7 +85,8 @@ struct ReferenceGemmBiasActivation : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
float Run(const device::BaseArgument* p_arg, int) override
|
||||
float Run(const device::BaseArgument* p_arg,
|
||||
const StreamConfig& /* stream_config */ = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg));
|
||||
}
|
||||
|
||||
@@ -91,7 +91,8 @@ struct ReferenceGemmBiasActivationAdd : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
float Run(const device::BaseArgument* p_arg, int) override
|
||||
float Run(const device::BaseArgument* p_arg,
|
||||
const StreamConfig& /* stream_config */ = StreamConfig{}) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg));
|
||||
}
|
||||
|
||||
@@ -128,7 +128,7 @@ class OpInstanceRunEngine
|
||||
|
||||
template <typename OpInstancePtr>
|
||||
ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
|
||||
int nrepeat = 100,
|
||||
bool time_kernel = false,
|
||||
bool do_verification = false,
|
||||
bool do_log = false)
|
||||
{
|
||||
@@ -143,7 +143,7 @@ class OpInstanceRunEngine
|
||||
if(op_ptr->IsSupportedArgument(argument.get()))
|
||||
{
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
float avg_time = invoker->Run(argument.get(), nrepeat);
|
||||
float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flops = op_instance_.GetFlops();
|
||||
std::size_t num_btype = op_instance_.GetBtype();
|
||||
|
||||
@@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE
|
||||
host_tensor.cpp
|
||||
)
|
||||
|
||||
add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})
|
||||
add_library(host_tensor STATIC ${HOST_TENSOR_SOURCE})
|
||||
add_library(composable_kernel::host_tensor ALIAS host_tensor)
|
||||
|
||||
target_compile_features(host_tensor PUBLIC)
|
||||
set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
|
||||
install(TARGETS host_tensor LIBRARY DESTINATION lib)
|
||||
|
||||
target_include_directories(host_tensor PUBLIC
|
||||
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
|
||||
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>"
|
||||
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>"
|
||||
)
|
||||
|
||||
install(TARGETS host_tensor
|
||||
EXPORT host_tensorTargets
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
|
||||
)
|
||||
|
||||
install(EXPORT host_tensorTargets
|
||||
FILE composable_kernelhost_tensorTargets.cmake
|
||||
NAMESPACE composable_kernel::
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
|
||||
)
|
||||
|
||||
clang_tidy_check(host_tensor)
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
|
||||
{
|
||||
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
|
||||
hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
|
||||
}
|
||||
|
||||
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
|
||||
@@ -11,49 +11,48 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
|
||||
|
||||
void DeviceMem::ToDevice(const void* p)
|
||||
{
|
||||
hipGetErrorString(
|
||||
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
|
||||
hip_check_error(hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
void DeviceMem::FromDevice(void* p)
|
||||
{
|
||||
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
|
||||
hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
void DeviceMem::SetZero() { hipGetErrorString(hipMemset(mpDeviceBuf, 0, mMemSize)); }
|
||||
void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
|
||||
|
||||
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
|
||||
DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
|
||||
|
||||
struct KernelTimerImpl
|
||||
{
|
||||
KernelTimerImpl()
|
||||
{
|
||||
hipGetErrorString(hipEventCreate(&mStart));
|
||||
hipGetErrorString(hipEventCreate(&mEnd));
|
||||
hip_check_error(hipEventCreate(&mStart));
|
||||
hip_check_error(hipEventCreate(&mEnd));
|
||||
}
|
||||
|
||||
~KernelTimerImpl()
|
||||
{
|
||||
hipGetErrorString(hipEventDestroy(mStart));
|
||||
hipGetErrorString(hipEventDestroy(mEnd));
|
||||
hip_check_error(hipEventDestroy(mStart));
|
||||
hip_check_error(hipEventDestroy(mEnd));
|
||||
}
|
||||
|
||||
void Start()
|
||||
{
|
||||
hipGetErrorString(hipDeviceSynchronize());
|
||||
hipGetErrorString(hipEventRecord(mStart, nullptr));
|
||||
hip_check_error(hipDeviceSynchronize());
|
||||
hip_check_error(hipEventRecord(mStart, nullptr));
|
||||
}
|
||||
|
||||
void End()
|
||||
{
|
||||
hipGetErrorString(hipEventRecord(mEnd, nullptr));
|
||||
hipGetErrorString(hipEventSynchronize(mEnd));
|
||||
hip_check_error(hipEventRecord(mEnd, nullptr));
|
||||
hip_check_error(hipEventSynchronize(mEnd));
|
||||
}
|
||||
|
||||
float GetElapsedTime() const
|
||||
{
|
||||
float time;
|
||||
hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
|
||||
hip_check_error(hipEventElapsedTime(&time, mStart, mEnd));
|
||||
return time;
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/host
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
|
||||
${PROJECT_SOURCE_DIR}/external/include/half
|
||||
@@ -18,7 +19,7 @@ include_directories(BEFORE
|
||||
|
||||
function(add_instance_library INSTANCE_NAME)
|
||||
message("adding instance ${INSTANCE_NAME}")
|
||||
add_library(${INSTANCE_NAME} SHARED ${ARGN})
|
||||
add_library(${INSTANCE_NAME} OBJECT ${ARGN})
|
||||
target_compile_features(${INSTANCE_NAME} PUBLIC)
|
||||
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endfunction(add_instance_library INSTANCE_NAME)
|
||||
@@ -41,3 +42,73 @@ add_subdirectory(convnd_bwd_data)
|
||||
add_subdirectory(grouped_gemm)
|
||||
add_subdirectory(conv2d_bwd_weight)
|
||||
add_subdirectory(batched_gemm_reduce)
|
||||
|
||||
add_library(device_operations STATIC
|
||||
$<TARGET_OBJECTS:device_conv1d_fwd_instance>
|
||||
$<TARGET_OBJECTS:device_batched_gemm_instance>
|
||||
$<TARGET_OBJECTS:device_conv2d_bwd_data_instance>
|
||||
$<TARGET_OBJECTS:device_conv2d_fwd_instance>
|
||||
$<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_instance>
|
||||
$<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_add_instance>
|
||||
$<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_atomic_add_instance>
|
||||
$<TARGET_OBJECTS:device_gemm_instance>
|
||||
$<TARGET_OBJECTS:device_gemm_bias_relu_instance>
|
||||
$<TARGET_OBJECTS:device_gemm_bias_relu_add_instance>
|
||||
$<TARGET_OBJECTS:device_gemm_bias2d_instance>
|
||||
$<TARGET_OBJECTS:device_reduce_instance>
|
||||
$<TARGET_OBJECTS:device_convnd_bwd_data_instance>
|
||||
$<TARGET_OBJECTS:device_grouped_gemm_instance>
|
||||
$<TARGET_OBJECTS:device_conv2d_bwd_weight_instance>
|
||||
$<TARGET_OBJECTS:device_batched_gemm_reduce_instance>
|
||||
$<TARGET_OBJECTS:device_conv3d_fwd_instance>
|
||||
device_conv2d.cpp
|
||||
)
|
||||
add_library(composablekernels::device_operations ALIAS device_operations)
|
||||
|
||||
|
||||
set(DEV_OPS_INC_DIRS
|
||||
${PROJECT_SOURCE_DIR}/include/ck/
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/
|
||||
${PROJECT_SOURCE_DIR}/external/include/
|
||||
)
|
||||
target_compile_features(device_operations PUBLIC)
|
||||
set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_include_directories(device_operations PUBLIC
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_description>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
|
||||
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/half>
|
||||
)
|
||||
|
||||
#once new arches are enabled make this an option on the main cmake file
|
||||
# and pass down here to be exported
|
||||
|
||||
target_compile_options(device_operations
|
||||
PRIVATE --offload-arch=gfx908
|
||||
)
|
||||
# install(TARGETS device_operations LIBRARY DESTINATION lib)
|
||||
install(TARGETS device_operations
|
||||
EXPORT device_operationsTargets
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
|
||||
)
|
||||
install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
|
||||
install(EXPORT device_operationsTargets
|
||||
FILE composable_kerneldevice_operationsTargets.cmake
|
||||
NAMESPACE composable_kernel::
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
|
||||
)
|
||||
|
||||
@@ -18,9 +18,9 @@ set(DEVICE_BATCHED_GEMM_INSTANCE_SOURCE
|
||||
device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
|
||||
target_compile_features(device_batched_gemm_instance PUBLIC)
|
||||
add_library(device_batched_gemm_instance OBJECT ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
|
||||
# target_compile_features(device_batched_gemm_instance PUBLIC)
|
||||
set_target_properties(device_batched_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib)
|
||||
# install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_batched_gemm_instance)
|
||||
|
||||
@@ -5,7 +5,8 @@ set(DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE
|
||||
device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
|
||||
)
|
||||
|
||||
add_instance_library(device_batched_gemm_reduce_instance ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE})
|
||||
install(TARGETS device_batched_gemm_reduce_instance LIBRARY DESTINATION lib)
|
||||
add_instance_library(device_batched_gemm_reduce_instance OBJECT ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE})
|
||||
target_compile_features(device_batched_gemm_reduce_instance PUBLIC)
|
||||
set_target_properties(device_batched_gemm_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
clang_tidy_check(device_batched_gemm_reduce_instance)
|
||||
|
||||
|
||||
@@ -6,9 +6,9 @@ set(DEVICE_CONV1D_FWD_INSTANCE_SOURCE
|
||||
device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_conv1d_fwd_instance SHARED ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv1d_fwd_instance PUBLIC)
|
||||
add_library(device_conv1d_fwd_instance OBJECT ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE})
|
||||
# target_compile_features(device_conv1d_fwd_instance PUBLIC)
|
||||
set_target_properties(device_conv1d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib)
|
||||
# install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv1d_fwd_instance)
|
||||
|
||||
@@ -6,9 +6,7 @@ set(DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE
|
||||
device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_conv2d_bwd_data_instance SHARED ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_bwd_data_instance PUBLIC)
|
||||
add_library(device_conv2d_bwd_data_instance OBJECT ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
|
||||
set_target_properties(device_conv2d_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_bwd_data_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv2d_bwd_data_instance)
|
||||
|
||||
@@ -3,7 +3,7 @@ set(DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE
|
||||
device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp;
|
||||
device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp;
|
||||
)
|
||||
add_library(device_conv2d_bwd_weight_instance SHARED ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE})
|
||||
add_library(device_conv2d_bwd_weight_instance OBJECT ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_bwd_weight_instance PUBLIC)
|
||||
set_target_properties(device_conv2d_bwd_weight_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_bwd_weight_instance LIBRARY DESTINATION lib)
|
||||
|
||||
@@ -6,9 +6,7 @@ set(DEVICE_CONV2D_FWD_INSTANCE_SOURCE
|
||||
device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
|
||||
device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp;
|
||||
)
|
||||
add_library(device_conv2d_fwd_instance SHARED ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_fwd_instance PUBLIC)
|
||||
add_library(device_conv2d_fwd_instance OBJECT ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE})
|
||||
set_target_properties(device_conv2d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_fwd_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv2d_fwd_instance)
|
||||
|
||||
@@ -2,9 +2,7 @@
|
||||
set(DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE
|
||||
device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp;
|
||||
)
|
||||
add_library(device_conv2d_fwd_bias_relu_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_fwd_bias_relu_instance PUBLIC)
|
||||
add_library(device_conv2d_fwd_bias_relu_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE})
|
||||
set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv2d_fwd_bias_relu_instance)
|
||||
|
||||
@@ -2,9 +2,7 @@
|
||||
set(DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE
|
||||
device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp;
|
||||
)
|
||||
add_library(device_conv2d_fwd_bias_relu_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC)
|
||||
add_library(device_conv2d_fwd_bias_relu_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE})
|
||||
set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv2d_fwd_bias_relu_add_instance)
|
||||
|
||||
@@ -3,9 +3,7 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE
|
||||
device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC)
|
||||
add_library(device_conv2d_fwd_bias_relu_atomic_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE})
|
||||
set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv2d_fwd_bias_relu_atomic_add_instance)
|
||||
|
||||
@@ -5,9 +5,8 @@ set(DEVICE_CONV3D_FWD_INSTANCE_SOURCE
|
||||
device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp;
|
||||
device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
|
||||
)
|
||||
add_library(device_conv3d_fwd_instance SHARED ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE})
|
||||
add_library(device_conv3d_fwd_instance OBJECT ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv3d_fwd_instance PUBLIC)
|
||||
set_target_properties(device_conv3d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv3d_fwd_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv3d_fwd_instance)
|
||||
|
||||
@@ -14,7 +14,7 @@ set(DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE
|
||||
device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_convnd_bwd_data_instance SHARED ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
|
||||
add_library(device_convnd_bwd_data_instance OBJECT ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
|
||||
target_compile_features(device_convnd_bwd_data_instance PUBLIC)
|
||||
set_target_properties(device_convnd_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_convnd_bwd_data_instance LIBRARY DESTINATION lib)
|
||||
|
||||
201
library/src/tensor_operation_instance/gpu/device_conv2d.cpp
Normal file
201
library/src/tensor_operation_instance/gpu/device_conv2d.cpp
Normal file
@@ -0,0 +1,201 @@
|
||||
#include <stdlib.h>
|
||||
#include "config.hpp"
|
||||
#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "device_operation_instance.hpp"
|
||||
#include "host_interface.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_conv2d_fwd_instance {
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
|
||||
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
|
||||
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
|
||||
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
|
||||
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
|
||||
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
|
||||
|
||||
} // namespace device_conv2d_fwd_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
struct DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl
|
||||
{
|
||||
std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
|
||||
MakeArgumentPointer(void* in_ptr,
|
||||
void* wei_ptr,
|
||||
void* out_ptr,
|
||||
size_t N,
|
||||
size_t K,
|
||||
size_t C,
|
||||
std::vector<ck::index_t> input_spatial_lengths,
|
||||
std::vector<ck::index_t> filter_spatial_lengths,
|
||||
std::vector<ck::index_t> output_spatial_lengths,
|
||||
std::vector<ck::index_t> conv_filter_strides,
|
||||
std::vector<ck::index_t> conv_filter_dilations,
|
||||
std::vector<ck::index_t> input_left_pads,
|
||||
std::vector<ck::index_t> input_right_pads) const
|
||||
{
|
||||
return el->MakeArgumentPointer(in_ptr,
|
||||
wei_ptr,
|
||||
out_ptr,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
}
|
||||
std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> MakeInvokerPointer() const
|
||||
{
|
||||
return el->MakeInvokerPointer();
|
||||
}
|
||||
|
||||
std::string GetTypeString() { return el->GetTypeString(); }
|
||||
bool IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg)
|
||||
{
|
||||
return el->IsSupportedArgument(arg);
|
||||
}
|
||||
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough> el;
|
||||
};
|
||||
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtr_t() : pImpl(nullptr) {}
|
||||
DeviceConvFwdPtr_t::~DeviceConvFwdPtr_t() = default;
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&) = default;
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl& other)
|
||||
: pImpl(std::make_unique<DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl>(std::move(other)))
|
||||
{
|
||||
}
|
||||
|
||||
std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
|
||||
DeviceConvFwdPtr_t::MakeArgumentPointer(void* in_ptr,
|
||||
void* wei_ptr,
|
||||
void* out_ptr,
|
||||
size_t N,
|
||||
size_t K,
|
||||
size_t C,
|
||||
std::vector<ck::index_t> input_spatial_lengths,
|
||||
std::vector<ck::index_t> filter_spatial_lengths,
|
||||
std::vector<ck::index_t> output_spatial_lengths,
|
||||
std::vector<ck::index_t> conv_filter_strides,
|
||||
std::vector<ck::index_t> conv_filter_dilations,
|
||||
std::vector<ck::index_t> input_left_pads,
|
||||
std::vector<ck::index_t> input_right_pads) const
|
||||
{
|
||||
return pImpl->MakeArgumentPointer(in_ptr,
|
||||
wei_ptr,
|
||||
out_ptr,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
}
|
||||
|
||||
std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> DeviceConvFwdPtr_t::MakeInvokerPointer() const
|
||||
{
|
||||
return pImpl->MakeInvokerPointer();
|
||||
}
|
||||
|
||||
std::string DeviceConvFwdPtr_t::GetTypeString() { return pImpl->GetTypeString(); }
|
||||
bool DeviceConvFwdPtr_t::IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg_ptr)
|
||||
{
|
||||
return pImpl->IsSupportedArgument(arg_ptr);
|
||||
}
|
||||
|
||||
using namespace ck::tensor_operation::device::device_conv2d_fwd_instance;
|
||||
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances)
|
||||
{
|
||||
std::vector<
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
|
||||
local_instances;
|
||||
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(local_instances);
|
||||
for(auto& kinder : local_instances)
|
||||
{
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
|
||||
instances.emplace_back(tmp);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances)
|
||||
{
|
||||
std::vector<
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
|
||||
local_instances;
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(local_instances);
|
||||
for(auto& kinder : local_instances)
|
||||
{
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
|
||||
instances.emplace_back(tmp); // Perhaps we can do better
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances)
|
||||
{
|
||||
std::vector<
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
|
||||
local_instances;
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(local_instances);
|
||||
for(auto& kinder : local_instances)
|
||||
{
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
|
||||
instances.emplace_back(tmp); // Perhaps we can do better
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances)
|
||||
{
|
||||
std::vector<
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
|
||||
local_instances;
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(local_instances);
|
||||
for(auto& kinder : local_instances)
|
||||
{
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
|
||||
instances.emplace_back(tmp); // Perhaps we can do better
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances)
|
||||
{
|
||||
std::vector<
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
|
||||
local_instances;
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(local_instances);
|
||||
for(auto& kinder : local_instances)
|
||||
{
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
|
||||
instances.emplace_back(tmp);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -35,10 +35,9 @@ set(DEVICE_GEMM_INSTANCE_SOURCE
|
||||
device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE})
|
||||
add_library(device_gemm_instance OBJECT ${DEVICE_GEMM_INSTANCE_SOURCE})
|
||||
|
||||
target_compile_features(device_gemm_instance PUBLIC)
|
||||
set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_gemm_instance)
|
||||
|
||||
@@ -10,9 +10,7 @@ set(DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE
|
||||
device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_gemm_bias2d_instance SHARED ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
|
||||
target_compile_features(device_gemm_bias2d_instance PUBLIC)
|
||||
add_library(device_gemm_bias2d_instance OBJECT ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
|
||||
set_target_properties(device_gemm_bias2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_gemm_bias2d_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_gemm_bias2d_instance)
|
||||
|
||||
@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE
|
||||
device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
|
||||
target_compile_features(device_gemm_bias_relu_instance PUBLIC)
|
||||
add_library(device_gemm_bias_relu_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
|
||||
set_target_properties(device_gemm_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_gemm_bias_relu_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_gemm_bias_relu_instance)
|
||||
|
||||
@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE
|
||||
device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_gemm_bias_relu_add_instance PUBLIC)
|
||||
add_library(device_gemm_bias_relu_add_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
|
||||
set_target_properties(device_gemm_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_gemm_bias_relu_add_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_gemm_bias_relu_add_instance)
|
||||
|
||||
@@ -6,7 +6,7 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE
|
||||
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_grouped_gemm_instance SHARED ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE})
|
||||
add_library(device_grouped_gemm_instance OBJECT ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE})
|
||||
|
||||
target_compile_features(device_grouped_gemm_instance PUBLIC)
|
||||
set_target_properties(device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
@@ -38,9 +38,7 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE
|
||||
device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp;
|
||||
)
|
||||
|
||||
add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE})
|
||||
target_compile_features(device_reduce_instance PUBLIC)
|
||||
add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE})
|
||||
set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_reduce_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_reduce_instance)
|
||||
|
||||
@@ -63,7 +63,7 @@ template <typename ADataType,
|
||||
bool profile_batched_gemm_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
int nrepeat,
|
||||
bool time_kernel,
|
||||
int M,
|
||||
int N,
|
||||
int K,
|
||||
@@ -356,11 +356,12 @@ bool profile_batched_gemm_impl(int do_verification,
|
||||
{
|
||||
std::string gemm_name = gemm_ptr->GetTypeString();
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
|
||||
|
||||
std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
|
||||
std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
|
||||
sizeof(CDataType) * M * N) *
|
||||
BatchCount;
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ template <typename ADataType,
|
||||
bool profile_batched_gemm_reduce_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
int nrepeat,
|
||||
bool time_kernel,
|
||||
int M,
|
||||
int N,
|
||||
int K,
|
||||
@@ -259,30 +259,12 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
|
||||
|
||||
if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
// warm up
|
||||
invoker_ptr->Run(argument_ptr.get());
|
||||
// init DO, D1 to 0
|
||||
d0_device_buf.SetZero();
|
||||
d1_device_buf.SetZero();
|
||||
|
||||
// timing
|
||||
float total_time = 0;
|
||||
|
||||
for(int i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
// init DO, D1 to 0
|
||||
d0_device_buf.SetZero();
|
||||
d1_device_buf.SetZero();
|
||||
|
||||
KernelTimer timer;
|
||||
|
||||
timer.Start();
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get());
|
||||
|
||||
timer.End();
|
||||
|
||||
total_time += timer.GetElapsedTime();
|
||||
}
|
||||
|
||||
float ave_time = total_time / nrepeat;
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::string gemm_name = gemm_ptr->GetTypeString();
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ template <int NDimSpatial,
|
||||
void profile_conv_bwd_data_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
int nrepeat,
|
||||
bool time_kernel,
|
||||
ck::index_t N,
|
||||
ck::index_t K,
|
||||
ck::index_t C,
|
||||
@@ -228,7 +228,8 @@ void profile_conv_bwd_data_impl(int do_verification,
|
||||
{
|
||||
std::string conv_name = conv_ptr->GetTypeString();
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamControl{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "stream_config.hpp"
|
||||
#include "config.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
@@ -43,7 +45,7 @@ template <int NDimSpatial,
|
||||
bool profile_conv_bwd_weight_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
int nrepeat,
|
||||
bool time_kernel,
|
||||
ck::index_t N,
|
||||
ck::index_t K,
|
||||
ck::index_t C,
|
||||
@@ -182,6 +184,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
|
||||
|
||||
// profile device Conv instances
|
||||
bool pass = true;
|
||||
|
||||
for(auto& conv_ptr : conv_ptrs)
|
||||
{
|
||||
// using atomic, so need to reset input
|
||||
@@ -189,6 +192,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
|
||||
{
|
||||
wei_device_buf.SetZero();
|
||||
}
|
||||
|
||||
auto argument_ptr = conv_ptr->MakeArgumentPointer(
|
||||
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
|
||||
@@ -214,7 +218,8 @@ bool profile_conv_bwd_weight_impl(int do_verification,
|
||||
{
|
||||
std::string conv_name = conv_ptr->GetTypeString();
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
|
||||
@@ -242,6 +247,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
|
||||
wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
|
||||
|
||||
float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);
|
||||
|
||||
if(max_error > 8)
|
||||
{
|
||||
pass = false;
|
||||
|
||||
@@ -42,7 +42,7 @@ template <int NDimSpatial,
|
||||
void profile_conv_fwd_bias_relu_add_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
int nrepeat,
|
||||
bool time_kernel,
|
||||
ck::index_t N,
|
||||
ck::index_t K,
|
||||
ck::index_t C,
|
||||
@@ -219,7 +219,8 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
|
||||
{
|
||||
std::string conv_name = op_ptr->GetTypeString();
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
|
||||
|
||||
@@ -119,7 +119,7 @@ template <int NDimSpatial,
|
||||
void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
int nrepeat,
|
||||
bool time_kernel,
|
||||
ck::index_t N,
|
||||
ck::index_t K,
|
||||
ck::index_t C,
|
||||
@@ -275,7 +275,8 @@ void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
|
||||
{
|
||||
std::string conv_name = op_ptr->GetTypeString();
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ template <int NDimSpatial,
|
||||
void profile_conv_fwd_bias_relu_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
int nrepeat,
|
||||
bool time_kernel,
|
||||
ck::index_t N,
|
||||
ck::index_t K,
|
||||
ck::index_t C,
|
||||
@@ -207,7 +207,8 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
|
||||
{
|
||||
std::string conv_name = op_ptr->GetTypeString();
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
|
||||
|
||||
@@ -269,7 +269,7 @@ template <int NDimSpatial,
|
||||
bool profile_convnd_bwd_data_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
int nrepeat,
|
||||
bool time_kernel,
|
||||
ck::index_t N,
|
||||
ck::index_t K,
|
||||
ck::index_t C,
|
||||
@@ -410,7 +410,8 @@ bool profile_convnd_bwd_data_impl(int do_verification,
|
||||
{
|
||||
std::string conv_name = conv_ptr->GetTypeString();
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop =
|
||||
ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
|
||||
|
||||
@@ -65,7 +65,7 @@ template <typename ADataType,
|
||||
void profile_gemm_bias_2d_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
int nrepeat,
|
||||
bool time_kernel,
|
||||
int M,
|
||||
int N,
|
||||
int K,
|
||||
@@ -259,7 +259,8 @@ void profile_gemm_bias_2d_impl(int do_verification,
|
||||
{
|
||||
std::string gemm_name = gemm_ptr->GetTypeString();
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user