diff --git a/CMakeLists.txt b/CMakeLists.txt index ba57ead09a..450a050647 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,7 +294,7 @@ endif() option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF) option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF) - +option(SAVE_TEMPS "Whether to save intermediate compilation results." OFF) if(USE_BITINT_EXTENSION_INT4) add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) add_compile_options(-Wno-bit-int-extension) @@ -307,6 +307,11 @@ if(USE_OPT_GFX11) message("CK compiled with USE_OPT_GFX11 set to ${USE_OPT_GFX11}") endif() +if(SAVE_TEMPS) + add_compile_options(-save-temps=obj -Wno-gnu-line-marker) +endif() + + ## Threads set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) diff --git a/include/ck_tile/ops/transpose/block_transpose.hpp b/include/ck_tile/ops/transpose/block_transpose.hpp index 29a49025b1..d24ebff6da 100644 --- a/include/ck_tile/ops/transpose/block_transpose.hpp +++ b/include/ck_tile/ops/transpose/block_transpose.hpp @@ -120,8 +120,8 @@ struct BlockTranspose { auto input_tile_window = make_tile_window(input_window, Policy::template MakeInputDistribution()); - //auto output_tile_window = - // make_tile_window(output_window, Policy::template MakeLdsLoadTileDistribution()); + auto output_tile_window = + make_tile_window(output_window, Policy::template MakeOutputDistribution()); DataType* p_lds_ptr = static_cast(p_smem); constexpr auto in_lds_block_desc = Policy::template MakeLdsStoreBlockDescriptor(); @@ -147,19 +147,11 @@ struct BlockTranspose store_tile(copy_to_lds_window, x); block_sync_lds(); - //auto y = load_tile(load_from_lds_window); - - //Debug> cccc; - // auto load_from_lds_window = - // make_tile_window(output_lds_block, - // make_tuple(number{}, - // number{}), {0, 0}, Policy::template - // MakeLdsLoadTileDistribution()); - auto y = load_tile_transpose(load_from_lds_window); - //Debug> 
cccc; - store_tile(output_window, y); + // auto out_tensor = make_static_distributed_tensor(Policy::template MakeOutputDistribution()); + + store_tile(output_tile_window, out_tensor); } }; diff --git a/include/ck_tile/ops/transpose/transpose_policy.hpp b/include/ck_tile/ops/transpose/transpose_policy.hpp index 4fdf1acb80..df5066511c 100644 --- a/include/ck_tile/ops/transpose/transpose_policy.hpp +++ b/include/ck_tile/ops/transpose/transpose_policy.hpp @@ -79,17 +79,18 @@ struct TransposePolicy template CK_TILE_HOST_DEVICE static constexpr auto MakeOutputDistribution() { - constexpr index_t BlockSize = Problem::kBlockSize; - constexpr index_t LeadDimPerBlock = Problem::kSecondSizePerBlock; - constexpr index_t SecondDimPerBlock = Problem::kLeadSizePerBlock; - constexpr index_t VecLoadSize = 16 / sizeof(typename Problem::DataType); - - using TileEncodingPattern = TileDistributionEncodingPattern2D; - return TileEncodingPattern::Make2DStaticTileDistribution(); + //constexpr index_t BlockSize = Problem::kBlockSize; + //constexpr index_t LeadDimPerBlock = Problem::kSecondSizePerBlock; + //constexpr index_t SecondDimPerBlock = Problem::kLeadSizePerBlock; + constexpr index_t VecLoadSize = 8 / sizeof(typename Problem::DataType); + //TODO, fix the tile distribution + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence<4, VecLoadSize>>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>{}); } template