diff --git a/tile_engine/ops/gemm/benchmark_gemm.hpp b/tile_engine/ops/gemm/benchmark_gemm.hpp index 38e967e8d0..9bfb153d14 100644 --- a/tile_engine/ops/gemm/benchmark_gemm.hpp +++ b/tile_engine/ops/gemm/benchmark_gemm.hpp @@ -11,6 +11,7 @@ #include #include "ck/host_utility/device_prop.hpp" +#include "ck_tile/host/timer.hpp" enum class Metric { @@ -136,6 +137,62 @@ class GemmProfiler return "Unknown"; }
+    /// Runs `kernel_launch` repeatedly and returns the mean time of the measured runs.
+    ///
+    /// Performs `stream.cold_niters_` warm-up launches whose timings are discarded, then
+    /// `stream.nrepeat_` measured launches. Every launch is bracketed by `timer.start` /
+    /// `timer.stop` on `stream.stream_id_`.
+    ///
+    /// @tparam Timer         Timer type providing `start`, `stop` and `duration`.
+    /// @param stream         Supplies the stream id and both iteration counts.
+    /// @param kernel_launch  Callable that performs one kernel launch.
+    /// @return Mean of `timer.duration()` over the measured launches, or 0.0f when
+    ///         `stream.nrepeat_` is not positive (nothing was measured).
+    template <typename Timer>
+    static float execute_kernel(const ck_tile::stream_config& stream,
+                                const std::function<void()>& kernel_launch)
+    {
+        Timer timer;
+
+        // NOTE(review): __HIP_DEVICE_COMPILE__ is only set in the device compilation pass,
+        // so in this host-side lambda the guarded builtins appear to be compiled out and
+        // flush_cache() does nothing at run time. Confirm, and use a host-triggered flush
+        // if cold-cache timing is intended.
+        auto flush_cache = [&] {
+#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
+            __builtin_amdgcn_s_dcache_wb();
+            __builtin_amdgcn_s_dcache_inv();
+#endif
+        };
+
+        // Warm-up launches: the timer is started and stopped, but its duration is never read.
+        for(int i = 0; i < stream.cold_niters_; ++i)
+        {
+            timer.start(stream.stream_id_);
+            kernel_launch();
+            timer.stop(stream.stream_id_);
+        }
+
+        // With no measured launch the mean would be 0.0f / 0 (NaN); report 0.0f instead.
+        if(stream.nrepeat_ <= 0)
+        {
+            return 0.0f;
+        }
+
+        // Measured launches: sum the durations in float, in launch order.
+        float total_time = 0.0f;
+        for(int i = 0; i < stream.nrepeat_; ++i)
+        {
+            timer.start(stream.stream_id_);
+            kernel_launch();
+            timer.stop(stream.stream_id_);
+            total_time += static_cast<float>(timer.duration());
+            flush_cache();
+        }
+
+        return total_time / static_cast<float>(stream.nrepeat_);
+    }
+
 template void benchmark_kernel(ck_tile::DeviceMem& c_m_n_dev_buf, ck_tile::HostTensor& c_m_n_host_result, @@ -163,22 +220,30 @@ class GemmProfiler KernelInstance kernel_instance{description, problem, {-1.0f, -1.0f, -1.0f}}; - float avg_time = Kernel::launch(args, stream); - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + auto kernel_launch = [&] { Kernel::launch(args, stream); }; + + float avg_time = 0.f; + if(stream.is_gpu_timer_) + { + avg_time = execute_kernel(stream, kernel_launch); + } + else + { + avg_time = execute_kernel(stream, kernel_launch); + } std::size_t flop = std::size_t(2) * args.M * args.N * args.K; std::size_t num_byte = sizeof(ADataType) * args.M * args.K + sizeof(BDataType) * args.N * args.K +
sizeof(CDataType) * args.M * args.N; - float tflops = static_cast(flop) / 1.E9 / avg_time; - float gb_per_sec = num_byte / 1.E6 / avg_time; kernel_instance.perf_result.latency = avg_time; - kernel_instance.perf_result.tflops = tflops; - kernel_instance.perf_result.bandwidth = gb_per_sec; + kernel_instance.perf_result.tflops = static_cast(flop) / 1.E9 / avg_time; + kernel_instance.perf_result.bandwidth = num_byte / 1.E6 / avg_time; std::cout << kernel_instance << std::endl; + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); bool verified_correct = !verify || compare(args.K, args.k_batch, c_m_n_dev_result, c_m_n_host_result); diff --git a/tile_engine/ops/gemm/codegen_utils.py b/tile_engine/ops/gemm/codegen_utils.py index 0fb4f520fe..03617a7f59 100644 --- a/tile_engine/ops/gemm/codegen_utils.py +++ b/tile_engine/ops/gemm/codegen_utils.py @@ -213,14 +213,14 @@ def get_gpu_name_by_id(gpu_id: int = 0) -> str: check=True ) - arch_pattern = r'gfx\d{3}[a-z]?' + arch_pattern = r'gfx\d{3,4}[a-z]?' 
match = re.search(arch_pattern, result.stdout.lower()) return match.group() if match else "" except (FileNotFoundError, subprocess.CalledProcessError) as e: - print(f"[System Error] {str(e)}") + print(f"System Error: {str(e)}, when get the name of gpu:{gpu_id}") return "" except Exception as e: - print(f"[Runtime Exception] {str(e)}") + print(f"Runtime Exception: {str(e)}, when get the name of gpu:{gpu_id}") return "" diff --git a/tile_engine/ops/gemm/gemm_host_api.cpp b/tile_engine/ops/gemm/gemm_host_api.cpp index 96049def6f..673ba63f94 100755 --- a/tile_engine/ops/gemm/gemm_host_api.cpp +++ b/tile_engine/ops/gemm/gemm_host_api.cpp @@ -173,7 +173,7 @@ void run(const ck_tile::ArgParser& arg_parser) structured_sparsity, trait, gemm_args, - ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); + ck_tile::stream_config{nullptr, false, 0, n_warmup, n_repeat}); return; } diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index 598f01b603..bc3ba7e81a 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -127,7 +127,7 @@ using CLayout = {LAYOUT_MAP[self.config.problem.layout_map['matrix_c']]}; def _generate_all_trait_files(self): """Generate all kernel traits into files.""" - if not self.all_trait_names: # Check if the list is empty + if not self.all_trait_names: self._generate_all_traits() for trait in self.all_trait_names: self._generate_trait_file(trait) @@ -177,7 +177,7 @@ struct GemmKernel {{ static constexpr bool kPadN = {pad_n}; static constexpr bool kPadK = {pad_k}; - static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) {{ + static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) {{ static constexpr bool permuteA = false; static constexpr bool permuteB = false; static constexpr bool DoubleSmemBuffer ={"true" if pipeline == "compv4" else "false"}; @@ -249,7 +249,7 @@ struct 
GemmKernel {{ throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!"); }} - if(s.log_level_ > 0) + if(stream.log_level_ > 0) {{ std::cout << "Launching kernel with args:" << " grid: {{" << grids.x << ", " << grids.y << ", " << grids.z << "}}" @@ -257,7 +257,7 @@ struct GemmKernel {{ << std::endl; }} - ave_time = ck_tile::launch_kernel(s, + ave_time = ck_tile::launch_kernel(stream, ck_tile::make_kernel( Kernel{{}}, grids, blocks, 0, kargs)); return ave_time; @@ -423,7 +423,7 @@ struct GemmDispatcher { ck_tile::HostTensor&, int, ck_tile::GemmHostArgs&, - const ck_tile::stream_config&)>> + const ck_tile::stream_config& stream)>> kernel_map; return kernel_map; }