From 814b609476ef409298a2ef8de3ebd075ce6b6f9d Mon Sep 17 00:00:00 2001 From: John Shumway Date: Thu, 18 Dec 2025 19:48:34 -0500 Subject: [PATCH] Update build analyzer for better usability --- script/analyze_build/README.md | 18 +- .../notebooks/comprehensive_example.ipynb | 2330 ++++++++++++++++- 2 files changed, 2307 insertions(+), 41 deletions(-) diff --git a/script/analyze_build/README.md b/script/analyze_build/README.md index 582a01f91d..4a1bcca50c 100644 --- a/script/analyze_build/README.md +++ b/script/analyze_build/README.md @@ -6,10 +6,17 @@ Simple, fast tools for analyzing Clang `-ftime-trace` build performance data. This directory provides straightforward Python tools for analyzing the JSON trace files generated during compilation with `-ftime-trace`. The focus is on simplicity and speed - no caching, no complexity, just fast parallel I/O and pandas DataFrames. -**Key principle: Fresh analysis every time is faster and simpler than managing caches.** - ## Quick Start +Configure a build directory `build-trace` and edit CMakeCache to add `-ftime-trace` to the `CMAKE_CXX_FLAGS`. With `-ftime-trace` enabled, the clang compiler will generate `.json` trace files alongside each compiled object file. These trace files contain detailed timing information about: + +- Template instantiations +- Function parsing +- Code generation phases +- Optimization passes + +These JSON files are what the analysis tools in `script/analyze_build/` are designed to process. + ```bash # Analyze all trace files in a directory cd script/analyze_build/examples @@ -57,6 +64,7 @@ python examples/analyze_build.py ../../build-trace ``` This will: + - Find all `.json` files recursively - Process them in parallel using all CPU cores - Display comprehensive build statistics @@ -93,6 +101,7 @@ print(f"Template time: {templates_df['dur'].sum() / 1e6:.2f}s") For interactive analysis, see the comprehensive example notebook: **[notebooks/comprehensive_example.ipynb](notebooks/comprehensive_example.ipynb)** - Complete guide covering: + - Single file analysis with detailed explanations - Multi-file parallel processing - Build-wide statistics and template analysis @@ -130,11 +139,13 @@ print(event_totals.head(10)) ## Performance **Typical performance on 4,484 trace files (~46 GB):** + - Parsing: ~26 seconds (174 files/sec) - Memory: ~1-2 GB - Throughput: I/O limited (uses all CPU cores) **Why no caching?** + - Fresh analysis is faster than cache management overhead - Simpler code (60% less code than cached version) - No cache invalidation issues @@ -164,6 +175,7 @@ The trace files use the [Chrome Trace Event Format](https://docs.google.com/docu ``` **Key fields:** + - `name`: Event type (e.g., "InstantiateClass", "ParseFunctionDefinition") - `dur`: Duration in microseconds - `ts`: Timestamp in microseconds @@ -248,7 +260,7 @@ template_time = templates_df['dur'].sum() print(f"Template time: {(template_time / total_time) * 100:.1f}%") ``` -## Tips +## Build time analysis philosophy - **Use all CPU cores**: The tools automatically use all available cores for parallel processing - **Memory is cheap**: 1-2GB for 4,484 files is acceptable on modern systems diff --git a/script/analyze_build/notebooks/comprehensive_example.ipynb b/script/analyze_build/notebooks/comprehensive_example.ipynb index a48ac0fbbb..491e373f07 100644 --- a/script/analyze_build/notebooks/comprehensive_example.ipynb +++ b/script/analyze_build/notebooks/comprehensive_example.ipynb @@ -40,9 +40,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: Install plotly for visualizations: pip install plotly\n", + "Using 384 CPU cores for parallel processing\n", + "Pandas version: 2.3.3\n" + ] + } + ], "source": [ "from importlib.util import find_spec\n", "from multiprocessing import cpu_count\n", @@ -88,9 +98,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 1,253 trace files\n", + "\n", + "Using sample file: device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp.json\n", + "File size: 11652.5 KB\n" + ] + } + ], "source": [ "# Configure the path to your trace files\n", "TRACE_DIR = Path(\"../../../build-trace\")\n", @@ -127,9 +148,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parsed 15,110 events in 0.043s\n", + "Transformed to Pandas tables in 1.115s\n", + "\n", + "Pandas DataFrames:\n", + " templates : 8,703 rows, 8.10 MB | template_id, template_name, full_signature, depth, arg_count\n", + " instantiations : 9,838 rows, 0.31 MB | instantiation_id, template_id, file_id, dur_us, ts_us, event_type\n", + " template_args : 51,474 rows, 8.73 MB | parent_template_id, arg_position, arg_template_id, arg_type, arg_text\n", + " events : 15,110 rows, 0.53 MB | name, dur, ts, pid, tid, ph, ts_absolute_us\n" + ] + } + ], "source": [ "if sample_files:\n", " # Parse the trace file\n", @@ -172,9 +208,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Compilation Unit Summary:\n", + " Trace file: device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp.json\n", + " Trace file size: 11652.5 KB\n", + " Start time: 2026-01-03 22:51:51.980489\n", + " Total compilation time: 178.01s\n", + " Total events: 15,110\n" + ] + } + ], "source": [ "if sample_files:\n", " print(\"Compilation Unit Summary:\")\n", @@ -196,9 +245,187 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Templates DataFrame Schema:\n", + "Column Type Memory (MB) % of Total\n", + "-------------------------------------------------------------------\n", + "template_id int32 0.03 0.4%\n", + "template_name category 0.03 0.4%\n", + "full_signature object 8.02 99.0%\n", + "depth int8 0.01 0.1%\n", + "arg_count int8 0.01 0.1%\n", + "Index RangeIndex 0.00 0.0%\n", + "-------------------------------------------------------------------\n", + "TOTAL 8.10 100.0%\n", + "\n", + "Total templates: 8,703\n", + "CK templates: 6,331 (72.7%)\n", + "Other templates: 2,372\n", + "\n", + "Sample CK templates:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
template_idtemplate_namefull_signaturedeptharg_count
4545ck::Tupleck::Tuple<ck::integral_constant<int, 2>, _BitInt(4)>22
4646ck::Tupleck::Tuple<ck::integral_constant<int, 16>, _BitInt(6)>22
4747ck::Tupleck::Tuple<float, float>12
4848ck::vector_typeck::vector_type<float, 2>::(unnamed union at /home/AMD/jshumway/composable_k...12
4949ck::vector_typeck::vector_type<float, 2>12
5050ck::Tupleck::Tuple<float, float, float, float>14
5151ck::vector_typeck::vector_type<float, 4>::(unnamed union at /home/AMD/jshumway/composable_k...12
5252ck::vector_typeck::vector_type<float, 4>12
5353ck::Tupleck::Tuple<float, float, float, float, float, float, float, float>18
5454ck::detail::TupleElementKeyDatack::detail::TupleElementKeyData<ck::detail::TupleElementKey<3>, float __attr...22
\n", + "
" + ], + "text/plain": [ + " template_id template_name \\\n", + "45 45 ck::Tuple \n", + "46 46 ck::Tuple \n", + "47 47 ck::Tuple \n", + "48 48 ck::vector_type \n", + "49 49 ck::vector_type \n", + "50 50 ck::Tuple \n", + "51 51 ck::vector_type \n", + "52 52 ck::vector_type \n", + "53 53 ck::Tuple \n", + "54 54 ck::detail::TupleElementKeyData \n", + "\n", + " full_signature \\\n", + "45 ck::Tuple, _BitInt(4)> \n", + "46 ck::Tuple, _BitInt(6)> \n", + "47 ck::Tuple \n", + "48 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n", + "49 ck::vector_type \n", + "50 ck::Tuple \n", + "51 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n", + "52 ck::vector_type \n", + "53 ck::Tuple \n", + "54 ck::detail::TupleElementKeyData, float __attr... \n", + "\n", + " depth arg_count \n", + "45 2 2 \n", + "46 2 2 \n", + "47 1 2 \n", + "48 1 2 \n", + "49 1 2 \n", + "50 1 4 \n", + "51 1 2 \n", + "52 1 2 \n", + "53 1 8 \n", + "54 2 2 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "if sample_files:\n", " templates_df = tables[\"templates\"]\n", @@ -250,9 +477,166 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Template Instantiation Summary:\n", + " Unique templates: 8,703\n", + " Total instantiations: 9,838\n", + " Template time: 39.65s\n", + " Percentage of build: 22.3%\n", + " Avg per instantiation: 4.03 ms\n", + "\n", + "Top 10 Templates by Total Time:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
signaturecounttotal_msmean_msdepth
670ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...12444.00537.0004171
669ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu...12442.66536.8887501
1411ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10>::operator()<(lamb...34365.62810.7537651
6424ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12>::operator...2346.742173.3710001
4079ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...4330.92882.7320001
4078ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu...4330.38082.5950001
4859ck::transform_tensor_descriptor<ck::TensorDescriptor<ck::Tuple<ck::UnMerge<c...1327.756327.7560006
1470ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...6311.49051.9150001
1469ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu...6310.72951.7881671
2362ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...5295.89059.1780001
\n", + "
" + ], + "text/plain": [ + " signature \\\n", + "670 ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... \n", + "669 ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... \n", + "1411 ck::detail::applier::operator()<(lamb... \n", + "6424 ck::detail::applier::operator... \n", + "4079 ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... \n", + "4078 ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... \n", + "4859 ck::transform_tensor_descriptor 0:\n", " # Join instantiations with templates\n", @@ -316,9 +700,112 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Template Instantiation by Nesting Depth:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
depthcounttotal_msmean_ms
0116288510.4355.227540
1219625433.0742.769151
2313115042.4203.846240
3425487730.9893.034140
45211210514.2704.978348
562572398.9219.334323
672021.9441.097200
\n", + "
" + ], + "text/plain": [ + " depth count total_ms mean_ms\n", + "0 1 1628 8510.435 5.227540\n", + "1 2 1962 5433.074 2.769151\n", + "2 3 1311 5042.420 3.846240\n", + "3 4 2548 7730.989 3.034140\n", + "4 5 2112 10514.270 4.978348\n", + "5 6 257 2398.921 9.334323\n", + "6 7 20 21.944 1.097200" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "if sample_files and len(tables[\"templates\"]) > 0:\n", " depth_stats = (\n", @@ -347,9 +834,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parallel processing function defined\n" + ] + } + ], "source": [ "from concurrent.futures import ProcessPoolExecutor, as_completed\n", "\n", @@ -380,9 +875,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 1,253 trace files\n", + "Total size: 18.79 GB\n" + ] + } + ], "source": [ "# Find all trace files\n", "json_files = find_trace_files(TRACE_DIR)\n", @@ -408,9 +912,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing 1,253 files with 384 workers...\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/AMD/jshumway/composable_kernel/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Processing: 100%|██████████| 1253/1253 [00:46<00:00, 27.13files/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Parsing complete in 48.54s (25.8 files/sec)\n", + "\n", + "Combining results...\n", + "Combined in 4.73s\n", + "\n", + "Total analysis time: 53.27s\n", + "\n", + "Combined Tables:\n", + " Templates: 11,751,350 rows\n", + " Instantiations: 17,437,000 rows\n", + " Template Args: 155,281,743 rows\n", + " Events: 24,229,252 rows\n", + " Total memory: 34.06 GB\n" + ] + } + ], "source": [ "if json_files:\n", " print(f\"Processing {len(json_files):,} files with {cpu_count()} workers...\\n\")\n", @@ -515,9 +1057,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "================================================================================\n", + "BUILD-WIDE STATISTICS\n", + "================================================================================\n", + "Files processed: 1,253\n", + "Total events: 24,229,252\n", + "Total build time: 5915.00 minutes\n", + "Unique templates: 11,751,350\n", + "Template instantiations: 17,437,000\n", + "Template time: 2495.47 minutes (42.2%)\n", + "================================================================================\n" + ] + } + ], "source": [ "if json_files and len(events_df) > 0:\n", " total_build_time_us = events_df[\"dur\"].sum()\n", @@ -550,9 +1109,326 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aggregating template statistics...\n", + "Completed in 3.42s\n", + "\n", + "Top 20 Templates by Total Time:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_signaturecounttotal_smean_msmedian_msdepthpct_template_time
11751348ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...2135.70062567850.31250067850.312520.090631
9135552ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_...846132.173182156.2330761.055060.088275
9135553ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor...846132.173182156.2330761.055070.088275
9135554std::_TupleConstraints<true, ck::tensor_operation::device::DeviceConvNdBwdDa...846132.173182156.2330761.055040.088275
9135555ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...846132.173182156.2330761.055030.088275
9135556ck::detail::static_ford_impl<ck::Sequence<16>, ck::Sequence<0, 1>>::operator...846132.173182156.2330761.055020.088275
9135557ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<1, 16>, ck::tensor...846132.173182156.2330761.055060.088275
9135558std::get<1UL, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_...846132.173182156.2330761.055040.088275
9135559std::_Tuple_impl<0, ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Inp...846132.173182156.2330761.055040.088275
9135560std::__uniq_ptr_impl<ck::tensor_operation::device::DeviceConv2dBwdDataXdl_In...846132.173182156.2330761.055040.088275
9135561ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho...846132.173182156.2330761.055060.088275
9135562ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho...846132.173182156.2330761.055060.088275
9135563std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl...846132.173182156.2330761.055040.088275
9135564ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,...846132.173182156.2330761.055050.088275
9135565std::tuple<ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Input_N_Hi_W...846132.173182156.2330761.055040.088275
9135566ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 4>, ck::tensor...846132.173182156.2330761.055070.088275
9135567ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 4>, ck::tensor_...846132.173182156.2330761.055070.088275
9135568ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 64, 1, t...846132.173182156.2330761.055010.088275
9135569ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...846132.173182156.2330761.055050.088275
9135570std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...846132.173182156.2330761.055030.088275
\n", + "
" + ], + "text/plain": [ + " full_signature \\\n", + "11751348 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n", + "9135552 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n", + "9135553 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", + "9135554 std::_TupleConstraints, ck::Sequence<0, 1>>::operator... \n", + "9135557 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", + "9135558 std::get<1UL, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_... \n", + "9135559 std::_Tuple_impl<0, ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Inp... \n", + "9135560 std::__uniq_ptr_impl,... \n", + "9135565 std::tuple, ck::tensor... \n", + "9135567 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n", + "9135568 ck::StaticBufferTupleOfVector 0:\n", " # OPTIMIZATION: Aggregate FIRST, then join (much faster!)\n", @@ -621,9 +1497,305 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filtered to 9,433,574 CK templates (from 11,751,350 total)\n", + "CK template time: 66204864.83s\n", + "Percentage of total template time: 44216.7%\n", + "\n", + "Top 20 CK Templates by Total Time:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_signaturecounttotal_smean_msmedian_msdepthpct_ck_time
11751348ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...2135.70062567850.31250067850.312520.000205
9135552ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_...846132.173182156.2330761.055060.000200
9135553ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor...846132.173182156.2330761.055070.000200
9135555ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...846132.173182156.2330761.055030.000200
9135556ck::detail::static_ford_impl<ck::Sequence<16>, ck::Sequence<0, 1>>::operator...846132.173182156.2330761.055020.000200
9135557ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<1, 16>, ck::tensor...846132.173182156.2330761.055060.000200
9135561ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho...846132.173182156.2330761.055060.000200
9135562ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho...846132.173182156.2330761.055060.000200
9135564ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,...846132.173182156.2330761.055050.000200
9135566ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 4>, ck::tensor...846132.173182156.2330761.055070.000200
9135567ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 4>, ck::tensor_...846132.173182156.2330761.055070.000200
9135568ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 64, 1, t...846132.173182156.2330761.055010.000200
9135569ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...846132.173182156.2330761.055050.000200
9135571ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle...846132.173182156.2330761.055020.000200
9135577ck::make_tensor_coordinate(const ck::TensorDescriptor<ck::Tuple<ck::UnMerge<...846132.173182156.2330761.055050.000200
9135579ck::Tuple<ck::vector_type<float, 1>, ck::vector_type<float, 1>, ck::vector_t...846132.173182156.2330761.055020.000200
9135582ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 16>, ck::tenso...846132.173182156.2330761.055060.000200
9135583ck::Sequence<5, 6, 7, 8, 9, 10>::ReorderGivenNew2Old<0, 1, 2, 3, 4, 5>846132.173182156.2330761.055010.000200
9135584ck::to_multi_index<ck::Sequence<4, 12>>846132.173182156.2330761.055020.000200
9135585ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 16>, ck::tenso...846132.173182156.2330761.055060.000200
\n", + "
" + ], + "text/plain": [ + " full_signature \\\n", + "11751348 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n", + "9135552 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n", + "9135553 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", + "9135555 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n", + "9135556 ck::detail::static_ford_impl, ck::Sequence<0, 1>>::operator... \n", + "9135557 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", + "9135561 ck::StaticTensorTupleOfVectorBuffer,... \n", + "9135566 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", + "9135567 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n", + "9135568 ck::StaticBufferTupleOfVector, ck::vector_type, ck::vector_t... \n", + "9135582 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tenso... \n", + "9135583 ck::Sequence<5, 6, 7, 8, 9, 10>::ReorderGivenNew2Old<0, 1, 2, 3, 4, 5> \n", + "9135584 ck::to_multi_index> \n", + "9135585 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tenso... \n", + "\n", + " count total_s mean_ms median_ms depth pct_ck_time \n", + "11751348 2 135.700625 67850.312500 67850.3125 2 0.000205 \n", + "9135552 846 132.173182 156.233076 1.0550 6 0.000200 \n", + "9135553 846 132.173182 156.233076 1.0550 7 0.000200 \n", + "9135555 846 132.173182 156.233076 1.0550 3 0.000200 \n", + "9135556 846 132.173182 156.233076 1.0550 2 0.000200 \n", + "9135557 846 132.173182 156.233076 1.0550 6 0.000200 \n", + "9135561 846 132.173182 156.233076 1.0550 6 0.000200 \n", + "9135562 846 132.173182 156.233076 1.0550 6 0.000200 \n", + "9135564 846 132.173182 156.233076 1.0550 5 0.000200 \n", + "9135566 846 132.173182 156.233076 1.0550 7 0.000200 \n", + "9135567 846 132.173182 156.233076 1.0550 7 0.000200 \n", + "9135568 846 132.173182 156.233076 1.0550 1 0.000200 \n", + "9135569 846 132.173182 156.233076 1.0550 5 0.000200 \n", + "9135571 846 132.173182 156.233076 1.0550 2 0.000200 \n", + "9135577 846 132.173182 156.233076 1.0550 5 0.000200 \n", + "9135579 846 132.173182 156.233076 1.0550 2 0.000200 \n", + "9135582 846 132.173182 156.233076 1.0550 6 0.000200 \n", + "9135583 846 132.173182 156.233076 1.0550 1 0.000200 \n", + "9135584 846 132.173182 156.233076 1.0550 2 0.000200 \n", + "9135585 846 132.173182 156.233076 1.0550 6 0.000200 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "if json_files and len(template_stats) > 0:\n", " # Filter to only CK namespaces\n", @@ -671,9 +1843,259 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top 20 Most Frequently Instantiated Templates:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_signaturecounttotal_smean_msdepth
7243825ck::Tuple<ck::PassThrough<int>, ck::UnMerge<ck::Tuple<int, int>, false>, ck:...940627.9853132.9752623
7243826std::__is_implicitly_default_constructible<ck::tensor_operation::device::Dev...940627.9853132.9752623
7243827ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 4, 1, tr...940627.9853132.9752621
7243828ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_...940627.9853132.9752626
7243829ck::utility::launch_and_time_kernel_with_preprocess<false, ck::GridwiseGemmM...940627.9853132.9752625
7243830std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...940627.9853132.9752624
7243831std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl...940627.9853132.9752624
7243832ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...940627.9853132.9752625
7243833ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...940627.9853132.9752621
7243834std::__uniq_ptr_data<ck::tensor_operation::device::DeviceGroupedConvFwdMulti...940627.9853132.9752624
7243835std::allocator_traits<std::allocator<std::_Rb_tree_node<std::pair<const ck::...940627.9853132.9752625
7243836std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...940627.9853132.9752623
7243837std::__and_<std::__is_implicitly_default_constructible<ck::tensor_operation:...940627.9853132.9752625
7243838std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl...940627.9853132.9752624
7243839ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...940627.9853132.9752625
7243840std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...940627.9853132.9752624
7243841std::pair<const ck::BlockGemmPipelineScheduler, std::basic_string<char>>::pa...940627.9853132.9752622
7243842ck::make_tuple<ck::integral_constant<int, 1>, ck::integral_constant<int, 4>,...940627.9853132.9752622
7243843ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, unsigned short, 64...940627.9853132.9752621
7243844std::is_nothrow_constructible<std::unique_ptr<ck::tensor_operation::device::...940627.9853132.9752624
\n", + "
" + ], + "text/plain": [ + " full_signature \\\n", + "7243825 ck::Tuple, ck::UnMerge, false>, ck:... \n", + "7243826 std::__is_implicitly_default_constructible, ck::tensor_... \n", + "7243829 ck::utility::launch_and_time_kernel_with_preprocess>::pa... \n", + "7243842 ck::make_tuple, ck::integral_constant,... \n", + "7243843 ck::StaticBufferTupleOfVector 0:\n", " print(\"Top 20 Most Frequently Instantiated Templates:\")\n", @@ -699,9 +2121,209 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top 15 Optimization Targets (High Frequency + High Cost):\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_signaturecounttotal_smean_mspriority_score
7243825ck::Tuple<ck::PassThrough<int>, ck::UnMerge<ck::Tuple<int, int>, false>, ck:...940627.9853132.9752620.500016
7243826std::__is_implicitly_default_constructible<ck::tensor_operation::device::Dev...940627.9853132.9752620.500016
7243827ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 4, 1, tr...940627.9853132.9752620.500016
7243828ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_...940627.9853132.9752620.500016
7243829ck::utility::launch_and_time_kernel_with_preprocess<false, ck::GridwiseGemmM...940627.9853132.9752620.500016
7243830std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...940627.9853132.9752620.500016
7243831std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl...940627.9853132.9752620.500016
7243832ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...940627.9853132.9752620.500016
7243833ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...940627.9853132.9752620.500016
7243834std::__uniq_ptr_data<ck::tensor_operation::device::DeviceGroupedConvFwdMulti...940627.9853132.9752620.500016
7243835std::allocator_traits<std::allocator<std::_Rb_tree_node<std::pair<const ck::...940627.9853132.9752620.500016
7243836std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...940627.9853132.9752620.500016
7243837std::__and_<std::__is_implicitly_default_constructible<ck::tensor_operation:...940627.9853132.9752620.500016
7243838std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl...940627.9853132.9752620.500016
7243839ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...940627.9853132.9752620.500016
\n", + "
" + ], + "text/plain": [ + " full_signature \\\n", + "7243825 ck::Tuple, ck::UnMerge, false>, ck:... \n", + "7243826 std::__is_implicitly_default_constructible, ck::tensor_... \n", + "7243829 ck::utility::launch_and_time_kernel_with_preprocess 0:\n", " # Normalize count and mean to 0-1 range\n", @@ -736,9 +2358,158 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Template Instantiation by Nesting Depth:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
depthcounttotal_smean_msmedian_mspct_total
003480502.759851e+038.3158291.227251.843240
1120070365289.720831e+066.5161001.225006492.316278
2242120931062.378498e+077.8117731.2800015885.434415
3321376340381.261997e+077.3449631.277008428.586130
4427770699261.801647e+078.0560191.2900012032.783621
5516516539461.169701e+078.9609661.311007812.159446
667816891208.147449e+0616.8404071.768005441.491328
77693490657.653282e+0516.9272651.79050511.144843
8815166151.622227e+0416.6723711.7962510.834476
9921501.411392e+017.2762022.630500.009426
\n", + "
" + ], + "text/plain": [ + " depth count total_s mean_ms median_ms pct_total\n", + "0 0 348050 2.759851e+03 8.315829 1.22725 1.843240\n", + "1 1 2007036528 9.720831e+06 6.516100 1.22500 6492.316278\n", + "2 2 4212093106 2.378498e+07 7.811773 1.28000 15885.434415\n", + "3 3 2137634038 1.261997e+07 7.344963 1.27700 8428.586130\n", + "4 4 2777069926 1.801647e+07 8.056019 1.29000 12032.783621\n", + "5 5 1651653946 1.169701e+07 8.960966 1.31100 7812.159446\n", + "6 6 781689120 8.147449e+06 16.840407 1.76800 5441.491328\n", + "7 7 69349065 7.653282e+05 16.927265 1.79050 511.144843\n", + "8 8 1516615 1.622227e+04 16.672371 1.79625 10.834476\n", + "9 9 2150 1.411392e+01 7.276202 2.63050 0.009426" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "if json_files and len(instantiations_df) > 0:\n", " # OPTIMIZATION: Aggregate by depth directly from instantiations, then join\n", @@ -790,9 +2561,133 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Template Argument Type Distribution:\n", + "Type Count Percentage\n", + "----------------------------------------\n", + "template 125,537,546 80.8%\n", + "primitive 19,045,151 12.3%\n", + "unknown 10,699,046 6.9%\n", + "\n", + "Templates with Most Arguments:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_signaturearg_count
231420ck::Tuple<ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8...23716
231421ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...23716
231422ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...23716
231423ck::Tuple<_Float16 __attribute__((ext_vector_type(2))), _Float16 __attribute...23716
231424ck::non_native_vector_base<ck::f8_fnuz_t, 64>::(unnamed union at /home/AMD/j...23716
231425ck::Tuple<_Float16, _Float16, _Float16, _Float16, _Float16, _Float16, _Float...23716
231426ck::non_native_vector_base<ck::f8_fnuz_t, 64>::(unnamed union at /home/AMD/j...23716
231427ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...23716
231428ck::vector_type<_Float16, 32>::(unnamed union at /home/AMD/jshumway/composab...23716
231429ck::vector_type<signed char, 4>::(unnamed union at /home/AMD/jshumway/compos...23716
\n", + "
" + ], + "text/plain": [ + " full_signature \\\n", + "231420 ck::Tuple::(unnamed union at /home/AMD/j... \n", + "231425 ck::Tuple<_Float16, _Float16, _Float16, _Float16, _Float16, _Float16, _Float... \n", + "231426 ck::non_native_vector_base::(unnamed union at /home/AMD/j... \n", + "231427 ck::detail::TupleImpl::(unnamed union at /home/AMD/jshumway/composab... \n", + "231429 ck::vector_type::(unnamed union at /home/AMD/jshumway/compos... \n", + "\n", + " arg_count \n", + "231420 23716 \n", + "231421 23716 \n", + "231422 23716 \n", + "231423 23716 \n", + "231424 23716 \n", + "231425 23716 \n", + "231426 23716 \n", + "231427 23716 \n", + "231428 23716 \n", + "231429 23716 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "if json_files and len(template_args_df) > 0:\n", " # Count argument types\n", @@ -829,9 +2724,259 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top 20 Event Types by Total Duration:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typecounttotal_minmean_mspct_total
47InstantiateFunction150064052069.1458848.27305134.981333
30ExecuteCompiler1251445.21419921353.1989797.526867
143Total ExecuteCompiler1251445.21418821353.1984877.526867
46InstantiateClass2430595426.32473210.5239607.207519
147Total Frontend1251384.83227018457.1832016.506040
32Frontend2419384.8320959545.2359266.506037
241Total Source1251232.58513711155.1624443.932124
168Total InstantiateFunction1251187.1997548978.4054493.164831
75PerformPendingInstantiations1251109.6967725261.2360841.854552
217Total PerformPendingInstantiations1251109.6967625261.2355951.854552
167Total InstantiateClass1251101.9707004890.6810491.723934
211Total ParseClass125165.3675023135.1319681.105114
69ParseClass61516060.6892385.9193611.026023
4Backend125157.4711672756.4108900.971617
104Total Backend125157.4711572756.4104120.971617
70ParseDeclarationOrFunctionDefinition37789050.2318737.9756340.849229
212Total ParseDeclarationOrFunctionDefinition125144.6320772140.6271960.754557
71ParseFunctionDefinition41046538.7545085.6649670.655190
213Total ParseFunctionDefinition125138.0368521824.3094440.643057
68Optimizer125137.7297251809.5791410.637865
\n", + "
" + ], + "text/plain": [ + " event_type count total_min \\\n", + "47 InstantiateFunction 15006405 2069.145884 \n", + "30 ExecuteCompiler 1251 445.214199 \n", + "143 Total ExecuteCompiler 1251 445.214188 \n", + "46 InstantiateClass 2430595 426.324732 \n", + "147 Total Frontend 1251 384.832270 \n", + "32 Frontend 2419 384.832095 \n", + "241 Total Source 1251 232.585137 \n", + "168 Total InstantiateFunction 1251 187.199754 \n", + "75 PerformPendingInstantiations 1251 109.696772 \n", + "217 Total PerformPendingInstantiations 1251 109.696762 \n", + "167 Total InstantiateClass 1251 101.970700 \n", + "211 Total ParseClass 1251 65.367502 \n", + "69 ParseClass 615160 60.689238 \n", + "4 Backend 1251 57.471167 \n", + "104 Total Backend 1251 57.471157 \n", + "70 ParseDeclarationOrFunctionDefinition 377890 50.231873 \n", + "212 Total ParseDeclarationOrFunctionDefinition 1251 44.632077 \n", + "71 ParseFunctionDefinition 410465 38.754508 \n", + "213 Total ParseFunctionDefinition 1251 38.036852 \n", + "68 Optimizer 1251 37.729725 \n", + "\n", + " mean_ms pct_total \n", + "47 8.273051 34.981333 \n", + "30 21353.198979 7.526867 \n", + "143 21353.198487 7.526867 \n", + "46 10.523960 7.207519 \n", + "147 18457.183201 6.506040 \n", + "32 9545.235926 6.506037 \n", + "241 11155.162444 3.932124 \n", + "168 8978.405449 3.164831 \n", + "75 5261.236084 1.854552 \n", + "217 5261.235595 1.854552 \n", + "167 4890.681049 1.723934 \n", + "211 3135.131968 1.105114 \n", + "69 5.919361 1.026023 \n", + "4 2756.410890 0.971617 \n", + "104 2756.410412 0.971617 \n", + "70 7.975634 0.849229 \n", + "212 2140.627196 0.754557 \n", + "71 5.664967 0.655190 \n", + "213 1824.309444 0.643057 \n", + "68 1809.579141 0.637865 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "if json_files and len(events_df) > 0:\n", " event_stats = (\n", @@ -873,9 +3018,118 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found ninja log: ../../../build-trace/.ninja_log\n", + "Parsed 2,579 build events in 0.004s\n", + "\n", + "Builds DataFrame: 2,579 rows\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetstart_msend_mscmd_hashworker_idduration_ms
0/home/AMD/jshumway/composable_kernel/build-trace/CMakeFiles/cmake.verify_globs48751120b4a9dba5446149-124
1build.ninja51215996680f2b673d9d1b995-1159454
2library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi...2289471b37d14fd75e2d29c-19243
3_deps/gtest-build/googletest/CMakeFiles/gtest.dir/src/gtest-all.cc.o1039527330bec9abe978cde-19424
4library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi...24396876a5b0a35a99473e8-19444
\n", + "
" + ], + "text/plain": [ + " target \\\n", + "0 /home/AMD/jshumway/composable_kernel/build-trace/CMakeFiles/cmake.verify_globs \n", + "1 build.ninja \n", + "2 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi... \n", + "3 _deps/gtest-build/googletest/CMakeFiles/gtest.dir/src/gtest-all.cc.o \n", + "4 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi... \n", + "\n", + " start_ms end_ms cmd_hash worker_id duration_ms \n", + "0 487 511 20b4a9dba5446149 -1 24 \n", + "1 512 159966 80f2b673d9d1b995 -1 159454 \n", + "2 228 9471 b37d14fd75e2d29c -1 9243 \n", + "3 103 9527 330bec9abe978cde -1 9424 \n", + "4 243 9687 6a5b0a35a99473e8 -1 9444 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from trace_analysis import NinjaLogParser\n", "\n",