mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
* enable gfx940 * switch between intrinsic mfma routines on mi100/200 and mi300 * fix mfma_int8 on MI300 * disable 2 int8 examples on MI300 * Update cmake-ck-dev.sh * restore gitignore file * modify Jenkinsfile to the internal repo * Bump rocm-docs-core from 0.24.0 to 0.29.0 in /docs/sphinx Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.24.0 to 0.29.0. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.24.0...v0.29.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> * initial enablement of gfx950 * fix clang format * disable examples 31 and 41 int8 on gfx950 * add code * fix build wip * fix xx * now can build * naming * minor fix * wip fix * fix macro for exp2; fix warpgemm a/b in transposedC * unify as tuple_array * Update the required Python version to 3.9 * Update executable name in test scripts * re-structure tuple/array to avoid spill * Merge function templates * Fix format * Add constraint to array<> ctor * Re-use function * Some minor changes * remove wrong code in store_raw() * fix compile issue in transpose * Rename enum Rename 'cood_transform_enum' to 'coord_transform_enum' * let more integral_constant->constant, and formating * make sure thread_buffer can be tuple/array * temp fix buffer_store spill * not using custom data type by default, now we can have ISA-level same code as opt_padding * fix compile error, fp8 not ready now * fix fp8 duplicated move/shift/and/or problem * Default use CK_TILE_FLOAT_TO_FP8_STOCHASTIC rounding mode * fix scratch in fp8 kernel * update some readme * fix merge from upstream * sync with upstream * sync upstream again * sync 22 * remove unused * fix 
clang-format * update README of ck_tile example * fix several issue * let python version to be 3.8 as minimal * remove ck_tile example from default cmake target like all/install/check * remove mistake * 1).support receipe in generate.py 2).use simplified mask type 3).change left/right to pass into karg * fix some bug in group-mode masking and codegen. update README * F8 quantization for FMHA forward (#1224) * Add SAccElementFunction, PComputeElementFunction, OAccElementFunction in pipeline * Add element function to fmha api * Adjust P elementwise function * Fix bug of elementwise op, our elementwise op is not inout * Add some elementwise op, prepare to quantization * Let generate.py can generate different elementwise function * To prevent compiler issue, remove the elementwise function we have not used. * Remove f8 pipeline, we should share the same pipeline even in f8 * Remove remove_cvref_t * Avoid warning * Fix wrong fp8 QK/KV block gemm setting * Check fp8 rounding error in check_err() * Set fp8 rounding error for check_err() * Use CK_TILE_FLOAT_TO_FP8_STANDARD as default fp8 rounding mode * 1. codgen the f8 api and kernel 2. 
f8 host code * prevent warning in filter mode * Remove not-in-use elementwise function kargs * Remove more not-in-use elementwise function kargs * Small refinements in C++ source files * Use conditional_t<> to simplify code * Support heterogeneous argument for binary function types * Re-use already-existing scales<> functor template * Fix wrong value produced by saturating * Generalize the composes<> template * Unify saturates<> implementation * Fix type errors in composes<> * Extend less_equal<> * Reuse the existing template less_equal<> in check_err() * Add equal<float> & equal<double> * Rename check_err() parameter * Rename check_err() parameter * Add FIXME comment for adding new macro in future * Remove unnecessary cast to void * Eliminate duplicated code * Avoid dividing api pool into more than 2 groups * Use more clear variable names * Use affirmative condition in if stmt * Remove blank lines * Donot perfect forwarding in composes<> * To fix compile error, revert generate.py back to4439cc107d* Fix bug of p element function * Add compute element op to host softmax * Remove element function in api interface * Extract user parameter * Rename pscale and oscale variable * rename f8 to fp8 * rename more f8 to fp8 * Add pipeline::operator() without element_functor * 1. Remove deprecated pipeline enum 2. Refine host code parameter * Use quantization range as input * 1. Rename max_dtype to dtype_max. 2. Rename scale to scale_s 3.Add init description * Refine description * prevent early return * unify _squant kernel name in cpp, update README * Adjust the default range. 
* Refine error message and bias range * Add fp8 benchmark and smoke test * fix fp8 swizzle_factor=4 case --------- Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com> Co-authored-by: carlushuang <carlus.huang@amd.com> --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: illsilin <Illia.Silin@amd.com> Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> Co-authored-by: Jing Zhang <jizha@amd.com> Co-authored-by: zjing14 <zhangjing14@gmail.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Po-Yen, Chen <PoYen.Chen@amd.com> Co-authored-by: rocking <ChunYu.Lai@amd.com> [ROCm/composable_kernel commit:db376dd8a4]
89 lines
2.7 KiB
Python
89 lines
2.7 KiB
Python
import pathlib
|
|
from pathlib import Path
|
|
import subprocess
|
|
import os
|
|
import copy
|
|
|
|
# Prefix put in front of every header path inside a generated #include line.
NS = 'ck_tile'
# Top-level directory whose headers are grouped per sub-directory.
OPS = 'ops'
# Sub-directory of OPS whose headers are also appended to every other ops/* module.
OPS_COMMON = 'common'

# License banner written first into every generated header; it ends with one
# blank line so that '#pragma once' is separated from the copyright notice.
HEADER_COMMON = (
    '// SPDX-License-Identifier: MIT\n'
    '// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n'
    '\n'
)
|
|
|
|
def get_module(f, level = 0):
    """Return component number ``level`` of path ``f`` as a string.

    ``f`` must expose ``.parts``; for ``aa/bb/cc/file.hpp`` the parts are
    ``(aa, bb, cc, file.hpp)``, so level 0 gives ``'aa'`` and level 1 ``'bb'``.
    An out-of-range ``level`` raises ``IndexError``.
    """
    return str(f.parts[level])
|
|
|
|
# Every '.hpp' entry below the current working directory, as pure (I/O-free)
# paths, in sorted order.
all_files = [
    pathlib.PurePath(entry)
    for entry in sorted(Path("./").rglob("*"))
    if entry.suffix == '.hpp'
]
|
|
|
|
class submodule_t:
    """Group headers by module and emit one umbrella header per group.

    ``self.m`` maps a top-level directory name to the list of header paths
    registered for it.  The ``OPS`` directory is special: its entry is a
    nested dict mapping each ``ops/<name>`` sub-directory to that
    sub-directory's header list.
    """

    def __init__(self):
        self.m = dict()

    def push(self, f):
        """Register header path ``f`` under the module it belongs to.

        ``f`` is a relative pure path (it must provide ``.parents`` and
        ``.parts``).  Headers sitting directly in the root (``./xxx.hpp``) or
        directly in ``ops/`` (``ops/xxx.hpp``) are ignored.
        """
        if len(f.parents) == 1:
            return  # ignore ./xxx.hpp

        mod = get_module(f)
        if mod != OPS:
            self.m.setdefault(mod, []).append(f)
            return

        # The ops entry is created even when the file turns out to be ignored.
        ops_modules = self.m.setdefault(mod, {})
        mod2 = get_module(f, 1)
        if Path(mod2).suffix == '.hpp':
            return  # ignore ops/xxx.hpp
        ops_modules.setdefault(mod2, []).append(f)

    def gen(self):
        """Write ``<module>.hpp`` / ``ops/<module>.hpp`` for every registered group.

        Every ``ops/*`` header other than ``ops/common.hpp`` also lists the
        headers of ``ops/common`` after its own.  ``self.m`` is left
        unmodified, so calling ``gen()`` again produces the same files
        instead of appending the common headers a second time.
        """
        def gen_header(hpath, include_list):
            # Drop a stale file first; a missing file is not an error.
            try:
                hpath.unlink()
            except FileNotFoundError:
                pass

            lines = [HEADER_COMMON, '#pragma once\n', '\n']
            # as_posix(): '#include' paths need forward slashes on every platform.
            lines.extend(
                f'#include "{NS}/{individual_header.as_posix()}"\n'
                for individual_header in include_list
            )
            # No trailing blank line: clang-format would complain about it.
            # newline='\n' keeps LF endings regardless of the host platform.
            with hpath.open('w', encoding='utf-8', newline='\n') as f:
                f.writelines(lines)

        for k, v in self.m.items():
            if k != OPS:
                gen_header(Path(f'{k}.hpp'), v)
                continue

            # Common headers are duplicated into every other ops/* module.
            common_list = v.get(OPS_COMMON, [])
            for km, kv in v.items():
                include_list = kv if km == OPS_COMMON else kv + common_list
                gen_header(Path(k) / f'{km}.hpp', include_list)
|
|
|
|
|
|
def _run_tool(argv):
    """Run one external tool to completion and return True if it exited with 0.

    The command is passed as an argument list (no shell), so file names
    containing spaces or shell metacharacters are handled verbatim.  A tool
    that cannot be started is reported and skipped rather than aborting the
    whole run.
    """
    try:
        return subprocess.run(argv, check=False).returncode == 0
    except OSError as err:
        print(f'warning: cannot run {argv[0]}: {err}')
        return False


submodule = submodule_t()
# formatting
for x in all_files:
    # Both tools rewrite the file in place, so each must finish before the
    # next one starts; launching them concurrently races on the same file.
    _run_tool(['dos2unix', str(x)])
    _run_tool(['clang-format-12', '-style=file', '-i', str(x)])
    submodule.push(x)

submodule.gen()
|
|
|
|
#print(all_files)
|