[CK_TILE] Improve RMS/Layer Normalization 2 Pass Pipeline Performance (#1861)

* 50ms -> 28ms

* Fix bug in non fuse_add_store cases

* Fine tuned setting for 2 pass pipeline

* adjust workload

* remove unnecessary change

* add layernorm

* Adding output quant and unquant results at the same time.

* fix test

* fix format

* tune for cases 128x640 and 128x1024

* bug ifx
This commit is contained in:
ruanjm
2025-03-25 20:09:45 +08:00
committed by GitHub
parent d2eab23958
commit d49abdaa87
15 changed files with 492 additions and 135 deletions

View File

@@ -564,9 +564,9 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 512, 4, True, False, True, True, False, 0, 0, 0),
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 2, True, False, True, True, False, 0, 0, 0),
h_traits('x', 'y', 'xs', 'ys', 1, 8, 1,1024, 1, True, False, True, True, False, 0, 0, 0)],
'big' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, True, True, 0, 0, 0),
'big' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 1,1024, 8, True, False, True, True, True, 0, 0, 0),
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, True, True, 0, 0, 0),
h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, True, True, 0, 0, 0),
h_traits('x', 'y', 'xs', 'ys', 1, 12, 1, 256, 2, True, False, True, True, True, 0, 0, 0),
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, True, True, 0, 0, 0)]}
total_blob = list()
for hs_key in h_trait_dict: