mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Enable sccache in the default docker and CI. (#1009)
* replace ccache with sccache, pin package versions
* put ccache back temporarily to avoid breaking other CI jobs
* add sccache_wrapper.sh script
* fix the package version syntax
* fix the pymysql package issue
* run sccache_wrapper before build if ccache server found
* set the paths before calling the sccache_wrapper
* use /tmp instead of /usr/local for cache
* try using sccache --start-server instead of wrapper
* try using redis server with sccache
* define SCCACHE_REDIS
* add redis and ping packages, and redis port
* use the new sccache redis server
* do not use sccache with staging compiler
* fix the condition syntax
* add stunnel to redis
* add tunnel verification
* separate caches for different architectures
* fix syntax for the cache tag
* use double brackets for conditions
* add bash line to the script
* add a switch for sccache and only use it in build stage
* run check_host function when enabling sccache
* fix the invocation tags for sccache
* fix groovy syntax
* set the invocation tag in groovy
* disable sccache in clang-format stage
* try another syntax for invocation tags
* use local sccache server if can't connect to redis
* fix script syntax
* update README
* refresh readme
* readme updates
* remove the timing and verification caveat from readme
---------
Co-authored-by: Lisa Delaney <lisa.delaney@amd.com>
[ROCm/composable_kernel commit: 4e44a9e8da]
This commit is contained in:
@@ -373,9 +373,10 @@ include_directories(BEFORE
|
||||
|
||||
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
|
||||
if(BUILD_DEV)
|
||||
add_compile_options(-Werror)
|
||||
add_compile_options(-Weverything)
|
||||
add_compile_options(-Werror -Weverything)
|
||||
endif()
|
||||
#add flags to reduce the size of binaries
|
||||
add_compile_options(-Oz -flto=thin)
|
||||
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
|
||||
|
||||
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
|
||||
@@ -390,35 +391,27 @@ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu
|
||||
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
|
||||
set(add_inst 0)
|
||||
if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
|
||||
#message("fp8 instance found!")
|
||||
set(add_inst 1)
|
||||
endif()
|
||||
if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8")
|
||||
#message("bf8 instance found!")
|
||||
set(add_inst 1)
|
||||
endif()
|
||||
if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
|
||||
#message("fp16 instance found!")
|
||||
set(add_inst 1)
|
||||
endif()
|
||||
if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
|
||||
#message("fp32 instance found!")
|
||||
set(add_inst 1)
|
||||
endif()
|
||||
if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
|
||||
#message("fp64 instance found!")
|
||||
set(add_inst 1)
|
||||
endif()
|
||||
if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
|
||||
#message("bf16 instance found!")
|
||||
set(add_inst 1)
|
||||
endif()
|
||||
if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
|
||||
#message("int8 instance found!")
|
||||
set(add_inst 1)
|
||||
endif()
|
||||
if(NOT "${cmake_instance}" MATCHES "DTYPES")
|
||||
#message("instance should be built for all types!")
|
||||
set(add_inst 1)
|
||||
endif()
|
||||
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
|
||||
|
||||
22
Dockerfile
22
Dockerfile
@@ -26,25 +26,37 @@ RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
|
||||
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
|
||||
RUN amdgpu-install -y --usecase=rocm --no-dkms
|
||||
|
||||
## Sccache binary built from source for ROCm
|
||||
ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
|
||||
ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin
|
||||
# Download the prebuilt ROCm-enabled sccache binary into SCCACHE_INSTALL_LOCATION.
# --fail makes curl exit non-zero on an HTTP error, so a missing or moved artifact
# aborts the image build instead of saving the server's error page as "sccache".
# --location follows redirects issued by the artifact server.
RUN mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
    curl --fail --location ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \
    chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache
|
||||
ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION}
|
||||
|
||||
# Install dependencies
|
||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
|
||||
build-essential \
|
||||
ccache \
|
||||
cmake \
|
||||
ccache \
|
||||
git \
|
||||
hip-rocclr \
|
||||
iputils-ping \
|
||||
jq \
|
||||
libelf-dev \
|
||||
libncurses5-dev \
|
||||
libnuma-dev \
|
||||
libpthread-stubs0-dev \
|
||||
llvm-amdgpu \
|
||||
net-tools \
|
||||
pkg-config \
|
||||
python \
|
||||
python3 \
|
||||
python3-dev \
|
||||
python3-pip \
|
||||
redis \
|
||||
sshpass \
|
||||
stunnel \
|
||||
software-properties-common \
|
||||
vim \
|
||||
nano \
|
||||
@@ -62,7 +74,7 @@ RUN gunzip /usr/local/bin/ninja.gz
|
||||
RUN chmod a+x /usr/local/bin/ninja
|
||||
RUN git clone https://github.com/nico/ninjatracing.git
|
||||
# Update the cmake to the latest version
|
||||
RUN pip install --upgrade cmake
|
||||
RUN pip install --upgrade cmake==3.27.5
|
||||
|
||||
# Setup ubsan environment to printstacktrace
|
||||
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
|
||||
@@ -77,9 +89,9 @@ ARG PREFIX=/opt/rocm
|
||||
RUN pip3 install --upgrade pip
|
||||
RUN pip3 install sqlalchemy==1.4.46
|
||||
RUN pip3 install pymysql
|
||||
RUN pip3 install pandas
|
||||
RUN pip3 install pandas==2.0.3
|
||||
RUN pip3 install setuptools-rust
|
||||
RUN pip3 install sshtunnel
|
||||
RUN pip3 install sshtunnel==0.4.0
|
||||
# Setup ubsan environment to printstacktrace
|
||||
ENV UBSAN_OPTIONS=print_stacktrace=1
|
||||
|
||||
@@ -115,6 +127,8 @@ RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ];
|
||||
else echo "using the release compiler"; \
|
||||
fi
|
||||
|
||||
#clean-up the deb package
|
||||
RUN sh -c "rm -rf amdgpu-install*"
|
||||
|
||||
#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
|
||||
#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
|
||||
|
||||
95
Jenkinsfile
vendored
95
Jenkinsfile
vendored
@@ -65,10 +65,10 @@ def getDockerImageName(){
|
||||
}
|
||||
|
||||
def check_host() {
|
||||
if ("${env.CK_CCACHE}" != "null"){
|
||||
def CCACHE_SERVER="${env.CK_CCACHE.split(':')[0]}"
|
||||
echo "ccache server: ${CCACHE_SERVER}"
|
||||
sh '''ping -c 1 -p 6379 "${CCACHE_SERVER}" | echo $? > tmp.txt'''
|
||||
if ("${env.CK_SCCACHE}" != "null"){
|
||||
def SCCACHE_SERVER="${env.CK_SCCACHE.split(':')[0]}"
|
||||
echo "sccache server: ${SCCACHE_SERVER}"
|
||||
sh '''ping -c 1 -p 6379 "${SCCACHE_SERVER}" | echo $? > tmp.txt'''
|
||||
def output = readFile(file: "tmp.txt")
|
||||
echo "tmp.txt contents: \$output"
|
||||
return (output != "0")
|
||||
@@ -96,24 +96,9 @@ def build_compiler(){
|
||||
|
||||
def getDockerImage(Map conf=[:]){
|
||||
env.DOCKER_BUILDKIT=1
|
||||
def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
|
||||
def prefixpath = conf.get("prefixpath", "/opt/rocm")
|
||||
def no_cache = conf.get("no_cache", false)
|
||||
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
|
||||
echo "ccache server: ${env.CK_CCACHE}"
|
||||
if(env.CK_CCACHE)
|
||||
{
|
||||
if(check_host())
|
||||
{
|
||||
echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
|
||||
}
|
||||
else
|
||||
{
|
||||
echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
|
||||
}
|
||||
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
|
||||
env.CCACHE_DIR = """/tmp/ccache_store"""
|
||||
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
|
||||
}
|
||||
if(no_cache)
|
||||
{
|
||||
dockerArgs = dockerArgs + " --no-cache "
|
||||
@@ -142,21 +127,6 @@ def buildDocker(install_prefix){
|
||||
def image_name = getDockerImageName()
|
||||
echo "Building Docker for ${image_name}"
|
||||
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
|
||||
echo "ccache server: ${env.CK_CCACHE}"
|
||||
if(env.CK_CCACHE)
|
||||
{
|
||||
if(check_host())
|
||||
{
|
||||
echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
|
||||
}
|
||||
else
|
||||
{
|
||||
echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
|
||||
}
|
||||
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
|
||||
env.CCACHE_DIR = """/tmp/ccache_store"""
|
||||
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
|
||||
}
|
||||
|
||||
echo "Build Args: ${dockerArgs}"
|
||||
try{
|
||||
@@ -219,13 +189,9 @@ def cmake_build(Map conf=[:]){
|
||||
}else{
|
||||
setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args
|
||||
}
|
||||
if(env.CK_CCACHE)
|
||||
{
|
||||
setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER='ccache' -DCMAKE_C_COMPILER_LAUNCHER='ccache' " + setup_args
|
||||
}
|
||||
echo "ccache server: ${env.CK_CCACHE}"
|
||||
|
||||
def pre_setup_cmd = """
|
||||
#!/bin/bash
|
||||
echo \$HSA_ENABLE_SDMA
|
||||
ulimit -c unlimited
|
||||
rm -rf build
|
||||
@@ -234,6 +200,46 @@ def cmake_build(Map conf=[:]){
|
||||
mkdir install
|
||||
cd build
|
||||
"""
|
||||
def invocation_tag=""
|
||||
if (setup_args.contains("gfx11")){
|
||||
invocation_tag="gfx11"
|
||||
}
|
||||
if (setup_args.contains("gfx10")){
|
||||
invocation_tag="gfx10"
|
||||
}
|
||||
if (setup_args.contains("gfx90")){
|
||||
invocation_tag="gfx90"
|
||||
}
|
||||
if (setup_args.contains("gfx94")){
|
||||
invocation_tag="gfx94"
|
||||
}
|
||||
if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
|
||||
pre_setup_cmd = pre_setup_cmd + """
|
||||
#!/bin/bash
|
||||
export ROCM_PATH=/opt/rocm
|
||||
export SCCACHE_ENABLED=true
|
||||
export SCCACHE_LOG_LEVEL=debug
|
||||
export SCCACHE_IDLE_TIMEOUT=14400
|
||||
export COMPILERS_HASH_DIR=/tmp/.sccache
|
||||
export SCCACHE_BIN=/usr/local/.cargo/bin/sccache
|
||||
export SCCACHE_EXTRAFILES=/tmp/.sccache/rocm_compilers_hash_file
|
||||
export SCCACHE_REDIS="redis://${env.CK_SCCACHE}"
|
||||
echo "connect = ${env.CK_SCCACHE}" >> ../script/redis-cli.conf
|
||||
export SCCACHE_C_CUSTOM_CACHE_BUSTER="${invocation_tag}"
|
||||
echo \$SCCACHE_C_CUSTOM_CACHE_BUSTER
|
||||
stunnel ../script/redis-cli.conf
|
||||
(
|
||||
set -e
|
||||
../script/sccache_wrapper.sh --enforce_redis
|
||||
)
|
||||
error_code=\$?
|
||||
if [ \$error_code -ne 0 ]; then
|
||||
echo "could not connect to the redis server. using sccache locally."
|
||||
../script/sccache_wrapper.sh
|
||||
fi
|
||||
"""
|
||||
setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache " + setup_args
|
||||
}
|
||||
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
|
||||
// reduce parallelism when compiling, clang uses too much memory
|
||||
def nt = nthreads()
|
||||
@@ -251,7 +257,7 @@ def cmake_build(Map conf=[:]){
|
||||
sh cmd
|
||||
|
||||
// Only archive from master or develop
|
||||
if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "master")) {
|
||||
if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
|
||||
archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
|
||||
}
|
||||
}
|
||||
@@ -635,7 +641,7 @@ def process_results(Map conf=[:]){
|
||||
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
|
||||
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=
|
||||
0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT=
|
||||
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
|
||||
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=;USE_SCCACHE=false''' : ""
|
||||
|
||||
pipeline {
|
||||
agent none
|
||||
@@ -682,7 +688,10 @@ pipeline {
|
||||
name: 'hipTensor_branch',
|
||||
defaultValue: 'mainline',
|
||||
description: 'Specify which branch of hipTensor to use (default: mainline)')
|
||||
|
||||
booleanParam(
|
||||
name: "USE_SCCACHE",
|
||||
defaultValue: true,
|
||||
description: "Use the sccache for building CK (default: ON)")
|
||||
}
|
||||
environment{
|
||||
dbuser = "${dbuser}"
|
||||
|
||||
229
README.md
229
README.md
@@ -1,139 +1,189 @@
|
||||
# Composable Kernel
|
||||
|
||||
## Methodology
|
||||
The Composable Kernel (CK) library provides a programming model for writing performance-critical
|
||||
kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
|
||||
uses general purpose kernel languages, such as HIP C++.
|
||||
|
||||
Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
|
||||
CK uses two concepts to achieve performance portability and code maintainability:
|
||||
|
||||
CK utilizes two concepts to achieve performance portability and code maintainability:
|
||||
* A tile-based programming model
|
||||
* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation".
|
||||
* Algorithm complexity reduction for complex machine learning (ML) operators. This uses an innovative
|
||||
technique called *Tensor Coordinate Transformation*.
|
||||
|
||||

|
||||
|
||||
## Code Structure
|
||||
The current CK library is structured into four layers:
|
||||
|
||||
Current CK library are structured into 4 layers:
|
||||
* "Templated Tile Operators" layer
|
||||
* "Templated Kernel and Invoker" layer
|
||||
* "Instantiated Kernel and Invoker" layer
|
||||
* "Client API" layer
|
||||
* Templated Tile Operators
|
||||
* Templated Kernel and Invoker
|
||||
* Instantiated Kernel and Invoker
|
||||
* Client API
|
||||
|
||||

|
||||
|
||||
## Documentation
|
||||
## General information
|
||||
|
||||
Run the steps below to build documentation locally.
|
||||
To build our documentation locally, use the following code:
|
||||
|
||||
```
|
||||
``` bash
|
||||
cd docs
|
||||
pip3 install -r sphinx/requirements.txt
|
||||
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
|
||||
```
|
||||
|
||||
## Contributors
|
||||
You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page.
|
||||
page.
|
||||
|
||||
The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
|
||||
```note
|
||||
If you use CK, cite us as follows:
|
||||
|
||||
## Citation
|
||||
|
||||
If you use CK, please use following citations:
|
||||
* CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???)
|
||||
* [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???):
|
||||
This paper will be available on arXiv soon.
|
||||
* [CITATION.cff](/CITATION.cff)
|
||||
|
||||
## License
|
||||
|
||||
CK is released under the MIT license. [License File](/LICENSE)
|
||||
|
||||
|
||||
# Build CK
|
||||
|
||||
## Build docker image
|
||||
|
||||
```bash
|
||||
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
|
||||
```
|
||||
Pre-built dockers are available from this public repo:
|
||||
https://hub.docker.com/r/rocm/composable_kernel/tags
|
||||
|
||||
## Launch docker
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
ck:latest \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build CK
|
||||
CK is released under the **[MIT license](/LICENSE)**.
|
||||
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
## Building CK
|
||||
|
||||
# Need to specify target ID, example below is for gfx908 and gfx90a
|
||||
We recommend building CK inside Docker containers, which include all necessary packages. Pre-built
|
||||
Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composable_kernel/tags).
|
||||
|
||||
cmake \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D GPU_TARGETS="gfx908;gfx90a" \
|
||||
..
|
||||
```
|
||||
1. To build a new Docker image, use the Dockerfile provided with the source code:
|
||||
|
||||
If GPU_TARGETS is not set on the cmake command line, CK will be built for all targets supported by the
|
||||
current compiler.
|
||||
```bash
|
||||
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
|
||||
```
|
||||
|
||||
2. Launch the Docker container:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
ck:latest \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
3. Clone CK source code from the GitHub repository and start the build:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git && \
|
||||
cd composable_kernel && \
|
||||
mkdir build && \
|
||||
cd build
|
||||
```
|
||||
|
||||
You must set the `GPU_TARGETS` macro to specify the GPU target architecture(s) you want
|
||||
to run CK on. You can specify single or multiple architectures. If you specify multiple architectures,
|
||||
use a semicolon between each; for example, `gfx908;gfx90a;gfx940`.
|
||||
|
||||
```bash
|
||||
cmake \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D GPU_TARGETS="gfx908;gfx90a" \
|
||||
..
|
||||
```
|
||||
|
||||
If you don't set `GPU_TARGETS` on the cmake command line, CK is built for all GPU targets
|
||||
supported by the current compiler (this may take a long time).
|
||||
|
||||
4. Build the entire CK library:
|
||||
|
||||
```bash
|
||||
make -j
|
||||
```
|
||||
|
||||
5. Install CK:
|
||||
|
||||
```bash
|
||||
make -j install
|
||||
```
|
||||
|
||||
## Optional post-install steps
|
||||
|
||||
* Build examples and tests:
|
||||
|
||||
```bash
|
||||
make -j examples tests
|
||||
```
|
||||
|
||||
* Build and run all examples and tests:
|
||||
|
||||
```bash
|
||||
make -j check
|
||||
```
|
||||
|
||||
You can find instructions for running each individual example in [example](/example).
|
||||
|
||||
* Build ckProfiler:
|
||||
|
||||
```bash
|
||||
make -j ckProfiler
|
||||
```
|
||||
|
||||
You can find instructions for running ckProfiler in [profiler](/profiler).
|
||||
|
||||
Note the `-j` option for building with multiple threads in parallel. This speeds up the build significantly.
|
||||
Depending on the number of CPU cores and the amount of RAM on your system, you may want to
|
||||
limit the number of threads. For example, consider a system with a 128-core CPU and 64 GB of RAM.
|
||||
|
||||
By default, `-j` launches one thread per CPU core, which can cause the build to run out of memory and
|
||||
crash. In such cases, you can reduce the number of threads to 32 by using `-j32`.
|
||||
|
||||
Additional cmake flags can be used to significantly speed-up the build:
|
||||
|
||||
INSTANCES_ONLY (by default is OFF) must be set to ON in order to build only the instances and library
|
||||
while skipping all tests, examples, and profiler. This is useful for libraries that use CK as a dependency.
|
||||
* `INSTANCES_ONLY` (default is OFF) must be set to ON in order to build only the instances and library
|
||||
while skipping all tests, examples, and profiler. This is useful in cases when you plan to use CK as a
|
||||
dependency and don't plan to run any examples or tests.
|
||||
|
||||
DTYPES (by default not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances
|
||||
of select data types only. Currently, building of int8 instances is taking a lot of time (the compiler fix is in the works).
|
||||
* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build
|
||||
instances of select data types only. The main default data types are fp32 and fp16; you can safely skip
|
||||
other data types.
|
||||
|
||||
DL_KERNELS (by default is OFF) must be set to ON in order to build the gemm_dl and batched_gemm_multi_d_dl
|
||||
instances. Those instances are only needed for the NAVI2x platforms.
|
||||
* `DL_KERNELS` (default is OFF) must be set to ON in order to build instances, such as `gemm_dl` or
|
||||
`batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
|
||||
other platforms have faster instances, such as `xdl` or `wmma`, available.
|
||||
|
||||
### Build examples and tests
|
||||
## Using sccache for building
|
||||
|
||||
The default CK Docker images come with a pre-installed version of sccache, which supports clang
|
||||
being used as hip-compiler (" -x hip"). Using sccache can help reduce the time to re-build code from
|
||||
hours to 1-2 minutes. In order to invoke sccache, you need to run:
|
||||
|
||||
```bash
|
||||
make -j examples tests
|
||||
make test
|
||||
sccache --start-server
|
||||
```
|
||||
|
||||
Instructions for running each individual examples are under [example](/example)
|
||||
|
||||
|
||||
## Build ckProfiler
|
||||
then add the following flags to the cmake command line:
|
||||
|
||||
```bash
|
||||
make -j ckProfiler
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache
|
||||
```
|
||||
Instructions for running ckProfiler are under [profiler](/profiler)
|
||||
|
||||
## Install CK
|
||||
|
||||
```bash
|
||||
make install
|
||||
```
|
||||
You may need to clean up the build folder and repeat the cmake and make steps in order to take
|
||||
advantage of the sccache during subsequent builds.
|
||||
|
||||
## Using CK as pre-built kernel library
|
||||
|
||||
Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)
|
||||
You can find instructions for using CK as a pre-built kernel library in [client_example](/client_example).
|
||||
|
||||
## Contributing
|
||||
## Contributing to CK
|
||||
|
||||
When you contribute to Composable Kernel, make sure to run `clang-format` on all the changed files. We highly recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
|
||||
When you contribute to CK, make sure you run `clang-format` on all changed files. We highly
|
||||
recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
|
||||
|
||||
```bash
|
||||
sudo script/install_precommit.sh
|
||||
```
|
||||
|
||||
This way, `pre-commit` will add the appropriate hooks to your local repository and automatically run `clang-format` (and possibly additional checks) before any commit is created.
|
||||
With this approach, `pre-commit` adds the appropriate hooks to your local repository and
|
||||
automatically runs `clang-format` (and possibly additional checks) before any commit is created.
|
||||
|
||||
If you need to uninstall hooks from the repository, you can do so by running the following command:
|
||||
|
||||
@@ -141,14 +191,5 @@ If you need to uninstall hooks from the repository, you can do so by running the
|
||||
script/uninstall_precommit.sh
|
||||
```
|
||||
|
||||
If for any reason, you need to temporarily disable precommit hooks, you can add the `--no-verify` option to the `git commit` command.
|
||||
|
||||
## Caveat
|
||||
### Kernel Timing and Verification
|
||||
|
||||
CK's own kernel timer will warn up kernel once, and then run it multiple times
|
||||
to get average kernel time. For some kernels that use atomic add, this will cause
|
||||
output buffer to be accumulated multiple times, causing verification failure.
|
||||
To work around it, do not use CK's own timer and do verification at the same time.
|
||||
CK's own timer and verification in each example and ckProfiler can be enabled or
|
||||
disabled from command line.
|
||||
If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the
|
||||
`git commit` command.
|
||||
|
||||
10
script/redis-cli.conf
Normal file
10
script/redis-cli.conf
Normal file
@@ -0,0 +1,10 @@
|
||||
; stunnel configuration for the sccache Redis tunnel used by the CI build.
; The Jenkinsfile appends a "connect = <server>" line to the end of this file
; before launching stunnel, so the [redis-cli] service must remain the LAST
; section and must not define "connect" itself.
; The service runs in client mode and listens on 127.0.0.1:6379.
; NOTE(review): "debug = 7" is presumably the most verbose log level - confirm
; it is still wanted once the tunnel setup is stable.
fips = no
|
||||
setuid = root
|
||||
setgid = root
|
||||
pid = /var/run/stunnel.pid
|
||||
debug = 7
|
||||
options = NO_SSLv2
|
||||
options = NO_SSLv3
|
||||
[redis-cli]
|
||||
client = yes
|
||||
accept = 127.0.0.1:6379
|
||||
56
script/sccache_wrapper.sh
Executable file
56
script/sccache_wrapper.sh
Executable file
@@ -0,0 +1,56 @@
|
||||
#!/bin/bash
# Prologue of the sccache wrapper: abort on the first failing command, fill in
# defaults for any setting the caller did not provide, and consume the leading
# command-line options.
#
# Options (recognised only ahead of any other argument):
#   --enforce_redis   set ENFORCE_REDIS to "true"
#   --no-hipcc        accepted and discarded
# Parsing stops at the first argument that is not one of the options above.
set -e

# ":=" assigns the default only when the variable is unset or empty.
: "${COMPILERS_HASH_DIR:=/tmp/.sccache}"
: "${SCCACHE_EXTRAFILES:=${COMPILERS_HASH_DIR}/rocm_compilers_hash_file}"
: "${SCCACHE_BIN:=${SCCACHE_INSTALL_LOCATION}/sccache}"
ENFORCE_REDIS="false"

while [ -n "$1" ]; do
    case "$1" in
        --enforce_redis)
            ENFORCE_REDIS="true"
            shift
            ;;
        --no-hipcc)
            shift
            ;;
        *)
            break
            ;;
    esac
done
|
||||
# Regenerate the file that fingerprints the ROCm toolchain for sccache.
#
# Reads:   ROCM_PATH, COMPILERS_HASH_DIR, SCCACHE_EXTRAFILES
# Writes:  $SCCACHE_EXTRAFILES, containing exactly four "label: value" lines
#          (clang version, clang-offload-bundler version, hipcc md5sum,
#          device-library bitcode md5sum). The lines are also echoed to stdout.
# Returns: non-zero when ROCM_PATH is unset or empty.
#
# The file is rewritten from scratch on every call, so running the wrapper more
# than once cannot accumulate duplicate lines and thereby alter the fingerprint.
setup_rocm_compilers_hash_file() {
    local hipcc_md5 devicelibs_md5 clang_version bundler_version

    # Without ROCM_PATH every path below would silently resolve against "/".
    if [ -z "${ROCM_PATH:-}" ]; then
        echo "sccache-wrapper: ROCM_PATH is not set; cannot fingerprint the ROCm compilers." >&2
        return 1
    fi

    mkdir -p "$COMPILERS_HASH_DIR"

    hipcc_md5="$(md5sum "${ROCM_PATH}/bin/hipcc")"

    # Hash the sorted list of per-file checksums so the result does not depend on
    # the order in which find visits the bitcode files. The cd happens inside the
    # command substitution's subshell, so the caller's working directory is untouched.
    devicelibs_md5="$(cd "${ROCM_PATH}/amdgcn/bitcode" && find . -type f -exec md5sum {} \; | sort | md5sum)"

    # MD5 checksums of clang and clang-offload-bundler cannot be used since they will keep changing
    # if the ROCM_PATH changes, ie; for every mainline build.
    # This is because ROCM_PATH gets encoded into the clang/clang-offload-bundler binaries as part
    # of RPATH.
    # The versions themselves contain the commit hash of the compiler repo at the time of building.
    # Hence, this should be a viable alternative to using the binary checksum itself.
    clang_version="$("${ROCM_PATH}/llvm/bin/clang" --version | head -n 1)"
    bundler_version="$("${ROCM_PATH}/llvm/bin/clang-offload-bundler" --version | head -n 1)"

    # "${var%% *}" keeps only the checksum, dropping the file name md5sum prints after it.
    # A single tee without -a truncates any file left behind by an earlier run.
    {
        printf '%s: %s\n' 'clang version' "${clang_version}"
        printf '%s: %s\n' 'clang-offload-bundler version' "${bundler_version}"
        printf '%s: %s\n' 'hipcc md5sum' "${hipcc_md5%% *}"
        printf '%s: %s\n' 'devicelibs bitcode md5sum' "${devicelibs_md5%% *}"
    } | tee "$SCCACHE_EXTRAFILES"

    echo "sccache-wrapper: compilers hash file set up at ${SCCACHE_EXTRAFILES}"
    cat "$SCCACHE_EXTRAFILES"
}
|
||||
# Main section: optionally verify the Redis cache backend, then fingerprint the
# compilers and start the sccache server.
#
# With --enforce_redis the script exits before touching sccache when the backend
# is unusable:
#   exit 10  SCCACHE_REDIS is empty or unset
#   exit 20  the server did not answer "PONG" to a ping
if [ "${ENFORCE_REDIS}" == "true" ]; then
    if [ -z "${SCCACHE_REDIS}" ]; then
        echo "SCCACHE_REDIS not set. Not wrapping compilers with sccache."
        exit 10
    else
        # "|| true" stops "set -e" from aborting on a failed ping, so the
        # dedicated exit code below is reached instead. The URL is quoted so it
        # is passed to redis-cli as one argument, with no word splitting or globbing.
        response=$(redis-cli -u "${SCCACHE_REDIS}" ping) || true
        if [ "${response}" != "PONG" ]; then
            echo "Redis server unreachable. Not wrapping compilers with sccache."
            exit 20
        fi
    fi
fi

setup_rocm_compilers_hash_file

# Quoted so an install path containing spaces still resolves to a single command.
"$SCCACHE_BIN" --version
"$SCCACHE_BIN" --start-server
|
||||
|
||||
Reference in New Issue
Block a user