Universal streamk with atomics (#1360)

* Universal stream-k with atomics, with ckProfiler support. Both grid_size and the stream-k strategy are tunable. A grid_size of -1 sets the number of workgroups (#WGs) to maximum occupancy × num_CUs. The implementation supports several stream-k policies: 1-tile, 2-tile, 3-tile and 4-tile. A stream-k strategy of -1 selects the default stream-k policy (4-tile).

* Update README.md

* fixing clang-format issues

* removed conflicts in struct members between streamk and universal streamk

* corrected arg parsing for streamk and universal streamk

* added stream-k policies for 3 tile and 4 tile

* fixed argument type issue with parsing cmd args

* made the changes suggested in PR review: removed comments and corrected the copyright notice

* file permissions updated

* added default value support for grid_size and streamk-policy selection set to -1

* print messages for arguments

* print messages for arguments

* print messages for arguments1
This commit is contained in:
Harisankar Sadasivan
2024-07-05 21:40:30 -07:00
committed by GitHub
parent eaa870a1ab
commit 75e622f02f
61 changed files with 5846 additions and 2 deletions

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -45,6 +45,19 @@ struct ProblemSizeStreamK final
ck::index_t NumSKBlocks = -1;
};
// Problem sizes, strides and stream-k tuning parameters for the universal
// stream-k variant. Every member has an in-class default, so a
// default-constructed object already describes a complete problem.
struct ProblemSizeStreamK_universal final
{
    // Problem extents.
    ck::index_t M{3840};
    ck::index_t N{4096};
    ck::index_t K{4096};

    // Strides of the A, B and C operands.
    ck::index_t StrideA{4096};
    ck::index_t StrideB{4096};
    ck::index_t StrideC{4096};

    // Grid size; -1 requests the maximum-occupancy default.
    ck::index_t Grid_size{-1};
    // Stream-k policy selector; 1 selects the 1-tile SK policy.
    ck::index_t Streamk_sel{1};
};
struct ProblemSizeSplitK final
{
@@ -123,6 +136,57 @@ bool parse_cmd_args<ProblemSize>(int argc,
return true;
}
/// Parses positional command-line arguments for the universal stream-k problem.
///
/// Accepted layouts (argc includes the program name):
///   - argc == 1  : keep the defaults already stored in problem_size and config.
///   - argc == 4  : arg1..arg3 set verification, initialization method and kernel timing.
///   - argc >= 10 : additionally arg4..arg9 set M, N, K, StrideA, StrideB, StrideC;
///                  arg10 (stream-k selection) and arg11 (grid size) are each optional.
/// Any other argument count prints the usage text to std::cerr.
///
/// @param argc          Number of entries in argv.
/// @param argv          Argument strings; numeric fields are converted with std::stoi.
/// @param problem_size  Receives the parsed sizes, strides and stream-k settings.
/// @param config        Receives the parsed verification/init/timing settings.
/// @return true if the argument count matched a supported layout, false after printing usage.
/// @note Exceptions thrown by std::stoi on malformed numbers are not caught here.
template <>
bool parse_cmd_args<ProblemSizeStreamK_universal>(int argc,
                                                  char* argv[],
                                                  ProblemSizeStreamK_universal& problem_size,
                                                  ExecutionConfig& config)
{
    if(argc == 1)
    {
        // use default case
    }
    else if(argc == 4)
    {
        config.do_verification = std::stoi(argv[1]);
        config.init_method     = std::stoi(argv[2]);
        config.time_kernel     = std::stoi(argv[3]);
    }
    else if(argc >= 10)
    {
        config.do_verification = std::stoi(argv[1]);
        config.init_method     = std::stoi(argv[2]);
        config.time_kernel     = std::stoi(argv[3]);

        problem_size.M = std::stoi(argv[4]);
        problem_size.N = std::stoi(argv[5]);
        problem_size.K = std::stoi(argv[6]);

        problem_size.StrideA = std::stoi(argv[7]);
        problem_size.StrideB = std::stoi(argv[8]);
        problem_size.StrideC = std::stoi(argv[9]);

        // Each optional argument is guarded on its own: with exactly 11
        // arguments argv[11] is the terminating null pointer and must not be
        // handed to std::stoi.
        if(argc >= 11)
        {
            problem_size.Streamk_sel = std::stoi(argv[10]);
        }
        if(argc >= 12)
        {
            problem_size.Grid_size = std::stoi(argv[11]);
        }
    }
    else
    {
        std::cerr
            << "arg1: verification (0=no, 1=yes)" << std::endl
            << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
            << "arg3: time kernel (0=no, 1=yes)" << std::endl
            << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
            << "arg10: stream-k select (-1: default config, 0: all DP, 1: 1-tile SK, 2: 2-tile SK)"
            << "\narg11: Grid_size(-1 for max occupancy)" << std::endl;
        return false;
    }

    return true;
}
template <>
bool parse_cmd_args<ProblemSizeStreamK>(int argc,
char* argv[],
@@ -165,7 +229,8 @@ bool parse_cmd_args<ProblemSizeStreamK>(int argc,
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
<< "arg10: NumSKBlocks(optional)" << std::endl;
<< "arg10: stream-k select (0: all DP, 1: 1-tile SK, 2: 2-tile SK)"
<< "\narg11: Grid_size(-1 for max occupancy)" << std::endl;
return false;
}