mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
* create files for xdlops * working on blockwise_gemm_xdlops * add KReduction * add m/n repeats * add 2x2 pipeline * added 128x128 wavegemm * use StaticBuffer of vector_type * break vector type to blk_size * add kpack into xldops_gemm and blockwise_gemm * abroadcast only * add fp32 mfma instructions * adding fp16 mfma * pack half4_t * rename kperwave to kpack * add 32x32x8fp16 * add fp16 mfma * clean code * clean code * V4r4 xdlops kpack (#35) * add kpack with incorrect results * bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2 * add 1x1 kernel * add gridwise_gemm_v2 - single_buffer * enabled dwordx4 for fp16 Co-authored-by: Chao Liu <chao.liu2@amd.com> * refactor fwd-v4r4-xdlops * add v4r4-nhwc-xdlop * improve some perf of nhwc and nchw by tuning parameters, and change scheuduling in gridwise-gemm loop * tweak scheduling in gridwise gemm * add v4r3 with a single output copy * init commit: output with slice win * adding sliceWin * add multiple repeats pattern * starting adding bwd-v4r1-xdlops * use tuple as SrcBuffer * adding bwd-data v4r1 nhwc xdlops * fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2() * fix bug in host bwd-data conv * initial implementation of bwd-data v4r1 nhwc xdlops * add launch bound flags * enable launch bound * add m/nrepeat=4 * tweak bwd-data v4r1 nhwc xdlops * added bwd-data v4r1 nhwc xlops with output A and weight B * add fwd-v4r4 nhwc xdlops, A input, B weight, C output Co-authored-by: Chao Liu <chao.liu2@amd.com>
49 lines
2.8 KiB
Bash
Executable File
49 lines
2.8 KiB
Bash
Executable File
#!/bin/bash
#
# Configure a Release CMake build in the current directory, using hipcc from
# /opt/rocm as the C++ compiler and gfx908 as the AMD GPU target.
#
# Usage: run from the build directory. Both paths below are relative, so they
# are resolved against the directory the script is invoked from.
set -euo pipefail

# Drop the results of any previous configure step in this directory so that
# cmake starts from a clean cache.
rm -f CMakeCache.txt
rm -f -- *.cmake
rm -rf CMakeFiles

MY_PROJECT_SOURCE=../../../
MY_PROJECT_INSTALL=../install.dir

# Compiler flags handed to cmake as a single CMAKE_CXX_FLAGS value.
# -save-temps takes "cwd" or "obj"; the previous "-save-temps=$CWD" relied on
# a variable bash never sets, so it always expanded to an empty value.
MY_CXX_FLAGS="-O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=cwd"

cmake                                                  \
-D CMAKE_INSTALL_PREFIX="${MY_PROJECT_INSTALL}"        \
-D CMAKE_BUILD_TYPE=Release                            \
-D DEVICE_BACKEND=AMD                                  \
-D CMAKE_CXX_FLAGS="${MY_CXX_FLAGS}"                   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc              \
-D CMAKE_PREFIX_PATH=/opt/rocm                         \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                      \
"${MY_PROJECT_SOURCE}"

# Alternative CMAKE_CXX_FLAGS settings (gfx906 target), kept for reference.
# None of them is used by the cmake invocation above.
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=cwd" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=cwd" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=cwd" \

# Disabled warning configuration: -Weverything with a list of individual
# warnings switched off. CXX_FLAG_TMP is not referenced by the command above.
#CXX_FLAG_TMP=-Weverything
#   -Wno-c++98-compat \
#   -Wno-c++98-compat-pedantic \
#   -Wno-conversion \
#   -Wno-double-promotion \
#   -Wno-exit-time-destructors \
#   -Wno-extra-semi \
#   -Wno-float-conversion \
#   -Wno-gnu-anonymous-struct \
#   -Wno-gnu-zero-variadic-macro-arguments \
#   -Wno-missing-noreturn \
#   -Wno-missing-prototypes \
#   -Wno-nested-anon-types \
#   -Wno-padded \
#   -Wno-return-std-move-in-c++11 \
#   -Wno-shorten-64-to-32 \
#   -Wno-sign-conversion \
#   -Wno-unknown-warning-option \
#   -Wno-unused-command-line-argument \
#   -Wno-weak-vtables \
#   -Wno-covered-switch-default \
#   -Wno-disabled-macro-expansion \
#   -Wno-undefined-reinterpret-cast