diff --git a/script/analyze_build/README.md b/script/analyze_build/README.md
index 582a01f91d..4a1bcca50c 100644
--- a/script/analyze_build/README.md
+++ b/script/analyze_build/README.md
@@ -6,10 +6,17 @@ Simple, fast tools for analyzing Clang `-ftime-trace` build performance data.
This directory provides straightforward Python tools for analyzing the JSON trace files generated during compilation with `-ftime-trace`. The focus is on simplicity and speed - no caching, no complexity, just fast parallel I/O and pandas DataFrames.
-**Key principle: Fresh analysis every time is faster and simpler than managing caches.**
-
## Quick Start
+Configure a build directory named `build-trace`, then edit its `CMakeCache.txt` to add `-ftime-trace` to `CMAKE_CXX_FLAGS`. With `-ftime-trace` enabled, Clang generates a `.json` trace file alongside each compiled object file. These trace files contain detailed timing information about:
+
+- Template instantiations
+- Function parsing
+- Code generation phases
+- Optimization passes
+
+These JSON files are what the analysis tools in `script/analyze_build/` are designed to process.
+
```bash
# Analyze all trace files in a directory
cd script/analyze_build/examples
@@ -57,6 +64,7 @@ python examples/analyze_build.py ../../build-trace
```
This will:
+
- Find all `.json` files recursively
- Process them in parallel using all CPU cores
- Display comprehensive build statistics
@@ -93,6 +101,7 @@ print(f"Template time: {templates_df['dur'].sum() / 1e6:.2f}s")
For interactive analysis, see the comprehensive example notebook:
**[notebooks/comprehensive_example.ipynb](notebooks/comprehensive_example.ipynb)** - Complete guide covering:
+
- Single file analysis with detailed explanations
- Multi-file parallel processing
- Build-wide statistics and template analysis
@@ -130,11 +139,13 @@ print(event_totals.head(10))
## Performance
**Typical performance on 4,484 trace files (~46 GB):**
+
- Parsing: ~26 seconds (174 files/sec)
- Memory: ~1-2 GB
- Throughput: I/O limited (uses all CPU cores)
**Why no caching?**
+
- Fresh analysis is faster than cache management overhead
- Simpler code (60% less code than cached version)
- No cache invalidation issues
@@ -164,6 +175,7 @@ The trace files use the [Chrome Trace Event Format](https://docs.google.com/docu
```
**Key fields:**
+
- `name`: Event type (e.g., "InstantiateClass", "ParseFunctionDefinition")
- `dur`: Duration in microseconds
- `ts`: Timestamp in microseconds
@@ -248,7 +260,7 @@ template_time = templates_df['dur'].sum()
print(f"Template time: {(template_time / total_time) * 100:.1f}%")
```
-## Tips
+## Build-Time Analysis Philosophy
- **Use all CPU cores**: The tools automatically use all available cores for parallel processing
- **Memory is cheap**: 1-2GB for 4,484 files is acceptable on modern systems
diff --git a/script/analyze_build/notebooks/comprehensive_example.ipynb b/script/analyze_build/notebooks/comprehensive_example.ipynb
index a48ac0fbbb..491e373f07 100644
--- a/script/analyze_build/notebooks/comprehensive_example.ipynb
+++ b/script/analyze_build/notebooks/comprehensive_example.ipynb
@@ -40,9 +40,19 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Note: Install plotly for visualizations: pip install plotly\n",
+ "Using 384 CPU cores for parallel processing\n",
+ "Pandas version: 2.3.3\n"
+ ]
+ }
+ ],
"source": [
"from importlib.util import find_spec\n",
"from multiprocessing import cpu_count\n",
@@ -88,9 +98,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Found 1,253 trace files\n",
+ "\n",
+ "Using sample file: device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp.json\n",
+ "File size: 11652.5 KB\n"
+ ]
+ }
+ ],
"source": [
"# Configure the path to your trace files\n",
"TRACE_DIR = Path(\"../../../build-trace\")\n",
@@ -127,9 +148,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Parsed 15,110 events in 0.043s\n",
+ "Transformed to Pandas tables in 1.115s\n",
+ "\n",
+ "Pandas DataFrames:\n",
+ " templates : 8,703 rows, 8.10 MB | template_id, template_name, full_signature, depth, arg_count\n",
+ " instantiations : 9,838 rows, 0.31 MB | instantiation_id, template_id, file_id, dur_us, ts_us, event_type\n",
+ " template_args : 51,474 rows, 8.73 MB | parent_template_id, arg_position, arg_template_id, arg_type, arg_text\n",
+ " events : 15,110 rows, 0.53 MB | name, dur, ts, pid, tid, ph, ts_absolute_us\n"
+ ]
+ }
+ ],
"source": [
"if sample_files:\n",
" # Parse the trace file\n",
@@ -172,9 +208,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Compilation Unit Summary:\n",
+ " Trace file: device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp.json\n",
+ " Trace file size: 11652.5 KB\n",
+ " Start time: 2026-01-03 22:51:51.980489\n",
+ " Total compilation time: 178.01s\n",
+ " Total events: 15,110\n"
+ ]
+ }
+ ],
"source": [
"if sample_files:\n",
" print(\"Compilation Unit Summary:\")\n",
@@ -196,9 +245,187 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Templates DataFrame Schema:\n",
+ "Column Type Memory (MB) % of Total\n",
+ "-------------------------------------------------------------------\n",
+ "template_id int32 0.03 0.4%\n",
+ "template_name category 0.03 0.4%\n",
+ "full_signature object 8.02 99.0%\n",
+ "depth int8 0.01 0.1%\n",
+ "arg_count int8 0.01 0.1%\n",
+ "Index RangeIndex 0.00 0.0%\n",
+ "-------------------------------------------------------------------\n",
+ "TOTAL 8.10 100.0%\n",
+ "\n",
+ "Total templates: 8,703\n",
+ "CK templates: 6,331 (72.7%)\n",
+ "Other templates: 2,372\n",
+ "\n",
+ "Sample CK templates:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " template_id | \n",
+ " template_name | \n",
+ " full_signature | \n",
+ " depth | \n",
+ " arg_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 45 | \n",
+ " 45 | \n",
+ " ck::Tuple | \n",
+ " ck::Tuple<ck::integral_constant<int, 2>, _BitInt(4)> | \n",
+ " 2 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 46 | \n",
+ " 46 | \n",
+ " ck::Tuple | \n",
+ " ck::Tuple<ck::integral_constant<int, 16>, _BitInt(6)> | \n",
+ " 2 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 47 | \n",
+ " 47 | \n",
+ " ck::Tuple | \n",
+ " ck::Tuple<float, float> | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 48 | \n",
+ " 48 | \n",
+ " ck::vector_type | \n",
+ " ck::vector_type<float, 2>::(unnamed union at /home/AMD/jshumway/composable_k... | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 49 | \n",
+ " 49 | \n",
+ " ck::vector_type | \n",
+ " ck::vector_type<float, 2> | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 50 | \n",
+ " 50 | \n",
+ " ck::Tuple | \n",
+ " ck::Tuple<float, float, float, float> | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 51 | \n",
+ " 51 | \n",
+ " ck::vector_type | \n",
+ " ck::vector_type<float, 4>::(unnamed union at /home/AMD/jshumway/composable_k... | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 52 | \n",
+ " 52 | \n",
+ " ck::vector_type | \n",
+ " ck::vector_type<float, 4> | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 53 | \n",
+ " 53 | \n",
+ " ck::Tuple | \n",
+ " ck::Tuple<float, float, float, float, float, float, float, float> | \n",
+ " 1 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 54 | \n",
+ " 54 | \n",
+ " ck::detail::TupleElementKeyData | \n",
+ " ck::detail::TupleElementKeyData<ck::detail::TupleElementKey<3>, float __attr... | \n",
+ " 2 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " template_id template_name \\\n",
+ "45 45 ck::Tuple \n",
+ "46 46 ck::Tuple \n",
+ "47 47 ck::Tuple \n",
+ "48 48 ck::vector_type \n",
+ "49 49 ck::vector_type \n",
+ "50 50 ck::Tuple \n",
+ "51 51 ck::vector_type \n",
+ "52 52 ck::vector_type \n",
+ "53 53 ck::Tuple \n",
+ "54 54 ck::detail::TupleElementKeyData \n",
+ "\n",
+ " full_signature \\\n",
+ "45 ck::Tuple, _BitInt(4)> \n",
+ "46 ck::Tuple, _BitInt(6)> \n",
+ "47 ck::Tuple \n",
+ "48 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n",
+ "49 ck::vector_type \n",
+ "50 ck::Tuple \n",
+ "51 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n",
+ "52 ck::vector_type \n",
+ "53 ck::Tuple \n",
+ "54 ck::detail::TupleElementKeyData, float __attr... \n",
+ "\n",
+ " depth arg_count \n",
+ "45 2 2 \n",
+ "46 2 2 \n",
+ "47 1 2 \n",
+ "48 1 2 \n",
+ "49 1 2 \n",
+ "50 1 4 \n",
+ "51 1 2 \n",
+ "52 1 2 \n",
+ "53 1 8 \n",
+ "54 2 2 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"if sample_files:\n",
" templates_df = tables[\"templates\"]\n",
@@ -250,9 +477,166 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Template Instantiation Summary:\n",
+ " Unique templates: 8,703\n",
+ " Total instantiations: 9,838\n",
+ " Template time: 39.65s\n",
+ " Percentage of build: 22.3%\n",
+ " Avg per instantiation: 4.03 ms\n",
+ "\n",
+ "Top 10 Templates by Total Time:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " signature | \n",
+ " count | \n",
+ " total_ms | \n",
+ " mean_ms | \n",
+ " depth | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 670 | \n",
+ " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
+ " 12 | \n",
+ " 444.005 | \n",
+ " 37.000417 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 669 | \n",
+ " ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... | \n",
+ " 12 | \n",
+ " 442.665 | \n",
+ " 36.888750 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1411 | \n",
+ " ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10>::operator()<(lamb... | \n",
+ " 34 | \n",
+ " 365.628 | \n",
+ " 10.753765 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 6424 | \n",
+ " ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12>::operator... | \n",
+ " 2 | \n",
+ " 346.742 | \n",
+ " 173.371000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4079 | \n",
+ " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
+ " 4 | \n",
+ " 330.928 | \n",
+ " 82.732000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4078 | \n",
+ " ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... | \n",
+ " 4 | \n",
+ " 330.380 | \n",
+ " 82.595000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4859 | \n",
+ " ck::transform_tensor_descriptor<ck::TensorDescriptor<ck::Tuple<ck::UnMerge<c... | \n",
+ " 1 | \n",
+ " 327.756 | \n",
+ " 327.756000 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 1470 | \n",
+ " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
+ " 6 | \n",
+ " 311.490 | \n",
+ " 51.915000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1469 | \n",
+ " ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... | \n",
+ " 6 | \n",
+ " 310.729 | \n",
+ " 51.788167 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2362 | \n",
+ " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
+ " 5 | \n",
+ " 295.890 | \n",
+ " 59.178000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " signature \\\n",
+ "670 ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... \n",
+ "669 ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... \n",
+ "1411 ck::detail::applier::operator()<(lamb... \n",
+ "6424 ck::detail::applier::operator... \n",
+ "4079 ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... \n",
+ "4078 ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... \n",
+ "4859 ck::transform_tensor_descriptor 0:\n",
" # Join instantiations with templates\n",
@@ -316,9 +700,112 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Template Instantiation by Nesting Depth:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " depth | \n",
+ " count | \n",
+ " total_ms | \n",
+ " mean_ms | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 1628 | \n",
+ " 8510.435 | \n",
+ " 5.227540 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " 1962 | \n",
+ " 5433.074 | \n",
+ " 2.769151 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 1311 | \n",
+ " 5042.420 | \n",
+ " 3.846240 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " 2548 | \n",
+ " 7730.989 | \n",
+ " 3.034140 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " 2112 | \n",
+ " 10514.270 | \n",
+ " 4.978348 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 6 | \n",
+ " 257 | \n",
+ " 2398.921 | \n",
+ " 9.334323 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 7 | \n",
+ " 20 | \n",
+ " 21.944 | \n",
+ " 1.097200 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " depth count total_ms mean_ms\n",
+ "0 1 1628 8510.435 5.227540\n",
+ "1 2 1962 5433.074 2.769151\n",
+ "2 3 1311 5042.420 3.846240\n",
+ "3 4 2548 7730.989 3.034140\n",
+ "4 5 2112 10514.270 4.978348\n",
+ "5 6 257 2398.921 9.334323\n",
+ "6 7 20 21.944 1.097200"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"if sample_files and len(tables[\"templates\"]) > 0:\n",
" depth_stats = (\n",
@@ -347,9 +834,17 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Parallel processing function defined\n"
+ ]
+ }
+ ],
"source": [
"from concurrent.futures import ProcessPoolExecutor, as_completed\n",
"\n",
@@ -380,9 +875,18 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Found 1,253 trace files\n",
+ "Total size: 18.79 GB\n"
+ ]
+ }
+ ],
"source": [
"# Find all trace files\n",
"json_files = find_trace_files(TRACE_DIR)\n",
@@ -408,9 +912,47 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Processing 1,253 files with 384 workers...\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/AMD/jshumway/composable_kernel/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "Processing: 100%|██████████| 1253/1253 [00:46<00:00, 27.13files/s] \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Parsing complete in 48.54s (25.8 files/sec)\n",
+ "\n",
+ "Combining results...\n",
+ "Combined in 4.73s\n",
+ "\n",
+ "Total analysis time: 53.27s\n",
+ "\n",
+ "Combined Tables:\n",
+ " Templates: 11,751,350 rows\n",
+ " Instantiations: 17,437,000 rows\n",
+ " Template Args: 155,281,743 rows\n",
+ " Events: 24,229,252 rows\n",
+ " Total memory: 34.06 GB\n"
+ ]
+ }
+ ],
"source": [
"if json_files:\n",
" print(f\"Processing {len(json_files):,} files with {cpu_count()} workers...\\n\")\n",
@@ -515,9 +1057,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "================================================================================\n",
+ "BUILD-WIDE STATISTICS\n",
+ "================================================================================\n",
+ "Files processed: 1,253\n",
+ "Total events: 24,229,252\n",
+ "Total build time: 5915.00 minutes\n",
+ "Unique templates: 11,751,350\n",
+ "Template instantiations: 17,437,000\n",
+ "Template time: 2495.47 minutes (42.2%)\n",
+ "================================================================================\n"
+ ]
+ }
+ ],
"source": [
"if json_files and len(events_df) > 0:\n",
" total_build_time_us = events_df[\"dur\"].sum()\n",
@@ -550,9 +1109,326 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Aggregating template statistics...\n",
+ "Completed in 3.42s\n",
+ "\n",
+ "Top 20 Templates by Total Time:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_signature | \n",
+ " count | \n",
+ " total_s | \n",
+ " mean_ms | \n",
+ " median_ms | \n",
+ " depth | \n",
+ " pct_template_time | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 11751348 | \n",
+ " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
+ " 2 | \n",
+ " 135.700625 | \n",
+ " 67850.312500 | \n",
+ " 67850.3125 | \n",
+ " 2 | \n",
+ " 0.090631 | \n",
+ "
\n",
+ " \n",
+ " | 9135552 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 6 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135553 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 7 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135554 | \n",
+ " std::_TupleConstraints<true, ck::tensor_operation::device::DeviceConvNdBwdDa... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 4 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135555 | \n",
+ " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 3 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135556 | \n",
+ " ck::detail::static_ford_impl<ck::Sequence<16>, ck::Sequence<0, 1>>::operator... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 2 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135557 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<1, 16>, ck::tensor... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 6 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135558 | \n",
+ " std::get<1UL, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 4 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135559 | \n",
+ " std::_Tuple_impl<0, ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Inp... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 4 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135560 | \n",
+ " std::__uniq_ptr_impl<ck::tensor_operation::device::DeviceConv2dBwdDataXdl_In... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 4 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135561 | \n",
+ " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 6 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135562 | \n",
+ " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 6 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135563 | \n",
+ " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 4 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135564 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 5 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135565 | \n",
+ " std::tuple<ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Input_N_Hi_W... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 4 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135566 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 4>, ck::tensor... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 7 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135567 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 4>, ck::tensor_... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 7 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135568 | \n",
+ " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 64, 1, t... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 1 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135569 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 5 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ " | 9135570 | \n",
+ " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 3 | \n",
+ " 0.088275 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_signature \\\n",
+ "11751348 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n",
+ "9135552 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n",
+ "9135553 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
+ "9135554 std::_TupleConstraints, ck::Sequence<0, 1>>::operator... \n",
+ "9135557 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
+ "9135558 std::get<1UL, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_... \n",
+ "9135559 std::_Tuple_impl<0, ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Inp... \n",
+ "9135560 std::__uniq_ptr_impl,... \n",
+ "9135565 std::tuple, ck::tensor... \n",
+ "9135567 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n",
+ "9135568 ck::StaticBufferTupleOfVector 0:\n",
" # OPTIMIZATION: Aggregate FIRST, then join (much faster!)\n",
@@ -621,9 +1497,305 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Filtered to 9,433,574 CK templates (from 11,751,350 total)\n",
+ "CK template time: 66204864.83s\n",
+ "Percentage of total template time: 44216.7%\n",
+ "\n",
+ "Top 20 CK Templates by Total Time:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_signature | \n",
+ " count | \n",
+ " total_s | \n",
+ " mean_ms | \n",
+ " median_ms | \n",
+ " depth | \n",
+ " pct_ck_time | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 11751348 | \n",
+ " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
+ " 2 | \n",
+ " 135.700625 | \n",
+ " 67850.312500 | \n",
+ " 67850.3125 | \n",
+ " 2 | \n",
+ " 0.000205 | \n",
+ "
\n",
+ " \n",
+ " | 9135552 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 6 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135553 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 7 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135555 | \n",
+ " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 3 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135556 | \n",
+ " ck::detail::static_ford_impl<ck::Sequence<16>, ck::Sequence<0, 1>>::operator... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 2 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135557 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<1, 16>, ck::tensor... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 6 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135561 | \n",
+ " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 6 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135562 | \n",
+ " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 6 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135564 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 5 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135566 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 4>, ck::tensor... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 7 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135567 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 4>, ck::tensor_... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 7 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135568 | \n",
+ " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 64, 1, t... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 1 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135569 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 5 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135571 | \n",
+ " ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 2 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135577 | \n",
+ " ck::make_tensor_coordinate(const ck::TensorDescriptor<ck::Tuple<ck::UnMerge<... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 5 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135579 | \n",
+ " ck::Tuple<ck::vector_type<float, 1>, ck::vector_type<float, 1>, ck::vector_t... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 2 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135582 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 16>, ck::tenso... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 6 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135583 | \n",
+ " ck::Sequence<5, 6, 7, 8, 9, 10>::ReorderGivenNew2Old<0, 1, 2, 3, 4, 5> | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 1 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135584 | \n",
+ " ck::to_multi_index<ck::Sequence<4, 12>> | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 2 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ " | 9135585 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 16>, ck::tenso... | \n",
+ " 846 | \n",
+ " 132.173182 | \n",
+ " 156.233076 | \n",
+ " 1.0550 | \n",
+ " 6 | \n",
+ " 0.000200 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_signature \\\n",
+ "11751348 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n",
+ "9135552 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n",
+ "9135553 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
+ "9135555 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n",
+ "9135556 ck::detail::static_ford_impl, ck::Sequence<0, 1>>::operator... \n",
+ "9135557 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
+ "9135561 ck::StaticTensorTupleOfVectorBuffer,... \n",
+ "9135566 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
+ "9135567 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n",
+ "9135568 ck::StaticBufferTupleOfVector, ck::vector_type, ck::vector_t... \n",
+ "9135582 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tenso... \n",
+ "9135583 ck::Sequence<5, 6, 7, 8, 9, 10>::ReorderGivenNew2Old<0, 1, 2, 3, 4, 5> \n",
+ "9135584 ck::to_multi_index> \n",
+ "9135585 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tenso... \n",
+ "\n",
+ " count total_s mean_ms median_ms depth pct_ck_time \n",
+ "11751348 2 135.700625 67850.312500 67850.3125 2 0.000205 \n",
+ "9135552 846 132.173182 156.233076 1.0550 6 0.000200 \n",
+ "9135553 846 132.173182 156.233076 1.0550 7 0.000200 \n",
+ "9135555 846 132.173182 156.233076 1.0550 3 0.000200 \n",
+ "9135556 846 132.173182 156.233076 1.0550 2 0.000200 \n",
+ "9135557 846 132.173182 156.233076 1.0550 6 0.000200 \n",
+ "9135561 846 132.173182 156.233076 1.0550 6 0.000200 \n",
+ "9135562 846 132.173182 156.233076 1.0550 6 0.000200 \n",
+ "9135564 846 132.173182 156.233076 1.0550 5 0.000200 \n",
+ "9135566 846 132.173182 156.233076 1.0550 7 0.000200 \n",
+ "9135567 846 132.173182 156.233076 1.0550 7 0.000200 \n",
+ "9135568 846 132.173182 156.233076 1.0550 1 0.000200 \n",
+ "9135569 846 132.173182 156.233076 1.0550 5 0.000200 \n",
+ "9135571 846 132.173182 156.233076 1.0550 2 0.000200 \n",
+ "9135577 846 132.173182 156.233076 1.0550 5 0.000200 \n",
+ "9135579 846 132.173182 156.233076 1.0550 2 0.000200 \n",
+ "9135582 846 132.173182 156.233076 1.0550 6 0.000200 \n",
+ "9135583 846 132.173182 156.233076 1.0550 1 0.000200 \n",
+ "9135584 846 132.173182 156.233076 1.0550 2 0.000200 \n",
+ "9135585 846 132.173182 156.233076 1.0550 6 0.000200 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"if json_files and len(template_stats) > 0:\n",
" # Filter to only CK namespaces\n",
@@ -671,9 +1843,259 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top 20 Most Frequently Instantiated Templates:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_signature | \n",
+ " count | \n",
+ " total_s | \n",
+ " mean_ms | \n",
+ " depth | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 7243825 | \n",
+ " ck::Tuple<ck::PassThrough<int>, ck::UnMerge<ck::Tuple<int, int>, false>, ck:... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 7243826 | \n",
+ " std::__is_implicitly_default_constructible<ck::tensor_operation::device::Dev... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 7243827 | \n",
+ " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 4, 1, tr... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 7243828 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 7243829 | \n",
+ " ck::utility::launch_and_time_kernel_with_preprocess<false, ck::GridwiseGemmM... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 7243830 | \n",
+ " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 7243831 | \n",
+ " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 7243832 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 7243833 | \n",
+ " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 7243834 | \n",
+ " std::__uniq_ptr_data<ck::tensor_operation::device::DeviceGroupedConvFwdMulti... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 7243835 | \n",
+ " std::allocator_traits<std::allocator<std::_Rb_tree_node<std::pair<const ck::... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 7243836 | \n",
+ " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 7243837 | \n",
+ " std::__and_<std::__is_implicitly_default_constructible<ck::tensor_operation:... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 7243838 | \n",
+ " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 7243839 | \n",
+ " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 7243840 | \n",
+ " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 7243841 | \n",
+ " std::pair<const ck::BlockGemmPipelineScheduler, std::basic_string<char>>::pa... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 7243842 | \n",
+ " ck::make_tuple<ck::integral_constant<int, 1>, ck::integral_constant<int, 4>,... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 7243843 | \n",
+ " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, unsigned short, 64... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 7243844 | \n",
+ " std::is_nothrow_constructible<std::unique_ptr<ck::tensor_operation::device::... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_signature \\\n",
+ "7243825 ck::Tuple, ck::UnMerge, false>, ck:... \n",
+ "7243826 std::__is_implicitly_default_constructible, ck::tensor_... \n",
+ "7243829 ck::utility::launch_and_time_kernel_with_preprocess>::pa... \n",
+ "7243842 ck::make_tuple, ck::integral_constant,... \n",
+ "7243843 ck::StaticBufferTupleOfVector 0:\n",
" print(\"Top 20 Most Frequently Instantiated Templates:\")\n",
@@ -699,9 +2121,209 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top 15 Optimization Targets (High Frequency + High Cost):\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_signature | \n",
+ " count | \n",
+ " total_s | \n",
+ " mean_ms | \n",
+ " priority_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 7243825 | \n",
+ " ck::Tuple<ck::PassThrough<int>, ck::UnMerge<ck::Tuple<int, int>, false>, ck:... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243826 | \n",
+ " std::__is_implicitly_default_constructible<ck::tensor_operation::device::Dev... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243827 | \n",
+ " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 4, 1, tr... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243828 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243829 | \n",
+ " ck::utility::launch_and_time_kernel_with_preprocess<false, ck::GridwiseGemmM... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243830 | \n",
+ " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243831 | \n",
+ " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243832 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243833 | \n",
+ " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243834 | \n",
+ " std::__uniq_ptr_data<ck::tensor_operation::device::DeviceGroupedConvFwdMulti... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243835 | \n",
+ " std::allocator_traits<std::allocator<std::_Rb_tree_node<std::pair<const ck::... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243836 | \n",
+ " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243837 | \n",
+ " std::__and_<std::__is_implicitly_default_constructible<ck::tensor_operation:... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243838 | \n",
+ " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ " | 7243839 | \n",
+ " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
+ " 9406 | \n",
+ " 27.985313 | \n",
+ " 2.975262 | \n",
+ " 0.500016 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_signature \\\n",
+ "7243825 ck::Tuple, ck::UnMerge, false>, ck:... \n",
+ "7243826 std::__is_implicitly_default_constructible, ck::tensor_... \n",
+ "7243829 ck::utility::launch_and_time_kernel_with_preprocess 0:\n",
" # Normalize count and mean to 0-1 range\n",
@@ -736,9 +2358,158 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Template Instantiation by Nesting Depth:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " depth | \n",
+ " count | \n",
+ " total_s | \n",
+ " mean_ms | \n",
+ " median_ms | \n",
+ " pct_total | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 348050 | \n",
+ " 2.759851e+03 | \n",
+ " 8.315829 | \n",
+ " 1.22725 | \n",
+ " 1.843240 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 2007036528 | \n",
+ " 9.720831e+06 | \n",
+ " 6.516100 | \n",
+ " 1.22500 | \n",
+ " 6492.316278 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " 4212093106 | \n",
+ " 2.378498e+07 | \n",
+ " 7.811773 | \n",
+ " 1.28000 | \n",
+ " 15885.434415 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " 2137634038 | \n",
+ " 1.261997e+07 | \n",
+ " 7.344963 | \n",
+ " 1.27700 | \n",
+ " 8428.586130 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ " 2777069926 | \n",
+ " 1.801647e+07 | \n",
+ " 8.056019 | \n",
+ " 1.29000 | \n",
+ " 12032.783621 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 5 | \n",
+ " 1651653946 | \n",
+ " 1.169701e+07 | \n",
+ " 8.960966 | \n",
+ " 1.31100 | \n",
+ " 7812.159446 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 6 | \n",
+ " 781689120 | \n",
+ " 8.147449e+06 | \n",
+ " 16.840407 | \n",
+ " 1.76800 | \n",
+ " 5441.491328 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 7 | \n",
+ " 69349065 | \n",
+ " 7.653282e+05 | \n",
+ " 16.927265 | \n",
+ " 1.79050 | \n",
+ " 511.144843 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 8 | \n",
+ " 1516615 | \n",
+ " 1.622227e+04 | \n",
+ " 16.672371 | \n",
+ " 1.79625 | \n",
+ " 10.834476 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 9 | \n",
+ " 2150 | \n",
+ " 1.411392e+01 | \n",
+ " 7.276202 | \n",
+ " 2.63050 | \n",
+ " 0.009426 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " depth count total_s mean_ms median_ms pct_total\n",
+ "0 0 348050 2.759851e+03 8.315829 1.22725 1.843240\n",
+ "1 1 2007036528 9.720831e+06 6.516100 1.22500 6492.316278\n",
+ "2 2 4212093106 2.378498e+07 7.811773 1.28000 15885.434415\n",
+ "3 3 2137634038 1.261997e+07 7.344963 1.27700 8428.586130\n",
+ "4 4 2777069926 1.801647e+07 8.056019 1.29000 12032.783621\n",
+ "5 5 1651653946 1.169701e+07 8.960966 1.31100 7812.159446\n",
+ "6 6 781689120 8.147449e+06 16.840407 1.76800 5441.491328\n",
+ "7 7 69349065 7.653282e+05 16.927265 1.79050 511.144843\n",
+ "8 8 1516615 1.622227e+04 16.672371 1.79625 10.834476\n",
+ "9 9 2150 1.411392e+01 7.276202 2.63050 0.009426"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"if json_files and len(instantiations_df) > 0:\n",
" # OPTIMIZATION: Aggregate by depth directly from instantiations, then join\n",
@@ -790,9 +2561,133 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Template Argument Type Distribution:\n",
+ "Type Count Percentage\n",
+ "----------------------------------------\n",
+ "template 125,537,546 80.8%\n",
+ "primitive 19,045,151 12.3%\n",
+ "unknown 10,699,046 6.9%\n",
+ "\n",
+ "Templates with Most Arguments:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_signature | \n",
+ " arg_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 231420 | \n",
+ " ck::Tuple<ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8... | \n",
+ " 23716 | \n",
+ "
\n",
+ " \n",
+ " | 231421 | \n",
+ " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,... | \n",
+ " 23716 | \n",
+ "
\n",
+ " \n",
+ " | 231422 | \n",
+ " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,... | \n",
+ " 23716 | \n",
+ "
\n",
+ " \n",
+ " | 231423 | \n",
+ " ck::Tuple<_Float16 __attribute__((ext_vector_type(2))), _Float16 __attribute... | \n",
+ " 23716 | \n",
+ "
\n",
+ " \n",
+ " | 231424 | \n",
+ " ck::non_native_vector_base<ck::f8_fnuz_t, 64>::(unnamed union at /home/AMD/j... | \n",
+ " 23716 | \n",
+ "
\n",
+ " \n",
+ " | 231425 | \n",
+ " ck::Tuple<_Float16, _Float16, _Float16, _Float16, _Float16, _Float16, _Float... | \n",
+ " 23716 | \n",
+ "
\n",
+ " \n",
+ " | 231426 | \n",
+ " ck::non_native_vector_base<ck::f8_fnuz_t, 64>::(unnamed union at /home/AMD/j... | \n",
+ " 23716 | \n",
+ "
\n",
+ " \n",
+ " | 231427 | \n",
+ " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,... | \n",
+ " 23716 | \n",
+ "
\n",
+ " \n",
+ " | 231428 | \n",
+ " ck::vector_type<_Float16, 32>::(unnamed union at /home/AMD/jshumway/composab... | \n",
+ " 23716 | \n",
+ "
\n",
+ " \n",
+ " | 231429 | \n",
+ " ck::vector_type<signed char, 4>::(unnamed union at /home/AMD/jshumway/compos... | \n",
+ " 23716 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_signature \\\n",
+ "231420 ck::Tuple::(unnamed union at /home/AMD/j... \n",
+ "231425 ck::Tuple<_Float16, _Float16, _Float16, _Float16, _Float16, _Float16, _Float... \n",
+ "231426 ck::non_native_vector_base::(unnamed union at /home/AMD/j... \n",
+ "231427 ck::detail::TupleImpl::(unnamed union at /home/AMD/jshumway/composab... \n",
+ "231429 ck::vector_type::(unnamed union at /home/AMD/jshumway/compos... \n",
+ "\n",
+ " arg_count \n",
+ "231420 23716 \n",
+ "231421 23716 \n",
+ "231422 23716 \n",
+ "231423 23716 \n",
+ "231424 23716 \n",
+ "231425 23716 \n",
+ "231426 23716 \n",
+ "231427 23716 \n",
+ "231428 23716 \n",
+ "231429 23716 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"if json_files and len(template_args_df) > 0:\n",
" # Count argument types\n",
@@ -829,9 +2724,259 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top 20 Event Types by Total Duration:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " event_type | \n",
+ " count | \n",
+ " total_min | \n",
+ " mean_ms | \n",
+ " pct_total | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 47 | \n",
+ " InstantiateFunction | \n",
+ " 15006405 | \n",
+ " 2069.145884 | \n",
+ " 8.273051 | \n",
+ " 34.981333 | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " ExecuteCompiler | \n",
+ " 1251 | \n",
+ " 445.214199 | \n",
+ " 21353.198979 | \n",
+ " 7.526867 | \n",
+ "
\n",
+ " \n",
+ " | 143 | \n",
+ " Total ExecuteCompiler | \n",
+ " 1251 | \n",
+ " 445.214188 | \n",
+ " 21353.198487 | \n",
+ " 7.526867 | \n",
+ "
\n",
+ " \n",
+ " | 46 | \n",
+ " InstantiateClass | \n",
+ " 2430595 | \n",
+ " 426.324732 | \n",
+ " 10.523960 | \n",
+ " 7.207519 | \n",
+ "
\n",
+ " \n",
+ " | 147 | \n",
+ " Total Frontend | \n",
+ " 1251 | \n",
+ " 384.832270 | \n",
+ " 18457.183201 | \n",
+ " 6.506040 | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " Frontend | \n",
+ " 2419 | \n",
+ " 384.832095 | \n",
+ " 9545.235926 | \n",
+ " 6.506037 | \n",
+ "
\n",
+ " \n",
+ " | 241 | \n",
+ " Total Source | \n",
+ " 1251 | \n",
+ " 232.585137 | \n",
+ " 11155.162444 | \n",
+ " 3.932124 | \n",
+ "
\n",
+ " \n",
+ " | 168 | \n",
+ " Total InstantiateFunction | \n",
+ " 1251 | \n",
+ " 187.199754 | \n",
+ " 8978.405449 | \n",
+ " 3.164831 | \n",
+ "
\n",
+ " \n",
+ " | 75 | \n",
+ " PerformPendingInstantiations | \n",
+ " 1251 | \n",
+ " 109.696772 | \n",
+ " 5261.236084 | \n",
+ " 1.854552 | \n",
+ "
\n",
+ " \n",
+ " | 217 | \n",
+ " Total PerformPendingInstantiations | \n",
+ " 1251 | \n",
+ " 109.696762 | \n",
+ " 5261.235595 | \n",
+ " 1.854552 | \n",
+ "
\n",
+ " \n",
+ " | 167 | \n",
+ " Total InstantiateClass | \n",
+ " 1251 | \n",
+ " 101.970700 | \n",
+ " 4890.681049 | \n",
+ " 1.723934 | \n",
+ "
\n",
+ " \n",
+ " | 211 | \n",
+ " Total ParseClass | \n",
+ " 1251 | \n",
+ " 65.367502 | \n",
+ " 3135.131968 | \n",
+ " 1.105114 | \n",
+ "
\n",
+ " \n",
+ " | 69 | \n",
+ " ParseClass | \n",
+ " 615160 | \n",
+ " 60.689238 | \n",
+ " 5.919361 | \n",
+ " 1.026023 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Backend | \n",
+ " 1251 | \n",
+ " 57.471167 | \n",
+ " 2756.410890 | \n",
+ " 0.971617 | \n",
+ "
\n",
+ " \n",
+ " | 104 | \n",
+ " Total Backend | \n",
+ " 1251 | \n",
+ " 57.471157 | \n",
+ " 2756.410412 | \n",
+ " 0.971617 | \n",
+ "
\n",
+ " \n",
+ " | 70 | \n",
+ " ParseDeclarationOrFunctionDefinition | \n",
+ " 377890 | \n",
+ " 50.231873 | \n",
+ " 7.975634 | \n",
+ " 0.849229 | \n",
+ "
\n",
+ " \n",
+ " | 212 | \n",
+ " Total ParseDeclarationOrFunctionDefinition | \n",
+ " 1251 | \n",
+ " 44.632077 | \n",
+ " 2140.627196 | \n",
+ " 0.754557 | \n",
+ "
\n",
+ " \n",
+ " | 71 | \n",
+ " ParseFunctionDefinition | \n",
+ " 410465 | \n",
+ " 38.754508 | \n",
+ " 5.664967 | \n",
+ " 0.655190 | \n",
+ "
\n",
+ " \n",
+ " | 213 | \n",
+ " Total ParseFunctionDefinition | \n",
+ " 1251 | \n",
+ " 38.036852 | \n",
+ " 1824.309444 | \n",
+ " 0.643057 | \n",
+ "
\n",
+ " \n",
+ " | 68 | \n",
+ " Optimizer | \n",
+ " 1251 | \n",
+ " 37.729725 | \n",
+ " 1809.579141 | \n",
+ " 0.637865 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " event_type count total_min \\\n",
+ "47 InstantiateFunction 15006405 2069.145884 \n",
+ "30 ExecuteCompiler 1251 445.214199 \n",
+ "143 Total ExecuteCompiler 1251 445.214188 \n",
+ "46 InstantiateClass 2430595 426.324732 \n",
+ "147 Total Frontend 1251 384.832270 \n",
+ "32 Frontend 2419 384.832095 \n",
+ "241 Total Source 1251 232.585137 \n",
+ "168 Total InstantiateFunction 1251 187.199754 \n",
+ "75 PerformPendingInstantiations 1251 109.696772 \n",
+ "217 Total PerformPendingInstantiations 1251 109.696762 \n",
+ "167 Total InstantiateClass 1251 101.970700 \n",
+ "211 Total ParseClass 1251 65.367502 \n",
+ "69 ParseClass 615160 60.689238 \n",
+ "4 Backend 1251 57.471167 \n",
+ "104 Total Backend 1251 57.471157 \n",
+ "70 ParseDeclarationOrFunctionDefinition 377890 50.231873 \n",
+ "212 Total ParseDeclarationOrFunctionDefinition 1251 44.632077 \n",
+ "71 ParseFunctionDefinition 410465 38.754508 \n",
+ "213 Total ParseFunctionDefinition 1251 38.036852 \n",
+ "68 Optimizer 1251 37.729725 \n",
+ "\n",
+ " mean_ms pct_total \n",
+ "47 8.273051 34.981333 \n",
+ "30 21353.198979 7.526867 \n",
+ "143 21353.198487 7.526867 \n",
+ "46 10.523960 7.207519 \n",
+ "147 18457.183201 6.506040 \n",
+ "32 9545.235926 6.506037 \n",
+ "241 11155.162444 3.932124 \n",
+ "168 8978.405449 3.164831 \n",
+ "75 5261.236084 1.854552 \n",
+ "217 5261.235595 1.854552 \n",
+ "167 4890.681049 1.723934 \n",
+ "211 3135.131968 1.105114 \n",
+ "69 5.919361 1.026023 \n",
+ "4 2756.410890 0.971617 \n",
+ "104 2756.410412 0.971617 \n",
+ "70 7.975634 0.849229 \n",
+ "212 2140.627196 0.754557 \n",
+ "71 5.664967 0.655190 \n",
+ "213 1824.309444 0.643057 \n",
+ "68 1809.579141 0.637865 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"if json_files and len(events_df) > 0:\n",
" event_stats = (\n",
@@ -873,9 +3018,118 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Found ninja log: ../../../build-trace/.ninja_log\n",
+ "Parsed 2,579 build events in 0.004s\n",
+ "\n",
+ "Builds DataFrame: 2,579 rows\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " start_ms | \n",
+ " end_ms | \n",
+ " cmd_hash | \n",
+ " worker_id | \n",
+ " duration_ms | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " /home/AMD/jshumway/composable_kernel/build-trace/CMakeFiles/cmake.verify_globs | \n",
+ " 487 | \n",
+ " 511 | \n",
+ " 20b4a9dba5446149 | \n",
+ " -1 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " build.ninja | \n",
+ " 512 | \n",
+ " 159966 | \n",
+ " 80f2b673d9d1b995 | \n",
+ " -1 | \n",
+ " 159454 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi... | \n",
+ " 228 | \n",
+ " 9471 | \n",
+ " b37d14fd75e2d29c | \n",
+ " -1 | \n",
+ " 9243 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " _deps/gtest-build/googletest/CMakeFiles/gtest.dir/src/gtest-all.cc.o | \n",
+ " 103 | \n",
+ " 9527 | \n",
+ " 330bec9abe978cde | \n",
+ " -1 | \n",
+ " 9424 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi... | \n",
+ " 243 | \n",
+ " 9687 | \n",
+ " 6a5b0a35a99473e8 | \n",
+ " -1 | \n",
+ " 9444 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target \\\n",
+ "0 /home/AMD/jshumway/composable_kernel/build-trace/CMakeFiles/cmake.verify_globs \n",
+ "1 build.ninja \n",
+ "2 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi... \n",
+ "3 _deps/gtest-build/googletest/CMakeFiles/gtest.dir/src/gtest-all.cc.o \n",
+ "4 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi... \n",
+ "\n",
+ " start_ms end_ms cmd_hash worker_id duration_ms \n",
+ "0 487 511 20b4a9dba5446149 -1 24 \n",
+ "1 512 159966 80f2b673d9d1b995 -1 159454 \n",
+ "2 228 9471 b37d14fd75e2d29c -1 9243 \n",
+ "3 103 9527 330bec9abe978cde -1 9424 \n",
+ "4 243 9687 6a5b0a35a99473e8 -1 9444 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"from trace_analysis import NinjaLogParser\n",
"\n",