From d0461ef29950cac6fc658f2d4cebb2e78ff8c6ba Mon Sep 17 00:00:00 2001 From: John Shumway Date: Fri, 12 Dec 2025 15:37:45 -0500 Subject: [PATCH] Experiment with notebooks to analyze build times --- pyproject.toml | 4 + script/analyze_build/README.md | 182 +++++++ script/analyze_build/data/.gitignore | 3 + .../notebooks/01_initial_exploration.ipynb | 470 ++++++++++++++++++ .../notebooks/02_full_build_analysis.ipynb | 211 ++++++++ .../03_template_overhead_analysis.ipynb | 333 +++++++++++++ .../04_tensor_descriptor_analysis.ipynb | 459 +++++++++++++++++ script/analyze_build/utils/__init__.py | 30 ++ script/analyze_build/utils/trace_parser.py | 224 +++++++++ 9 files changed, 1916 insertions(+) create mode 100644 script/analyze_build/README.md create mode 100644 script/analyze_build/data/.gitignore create mode 100644 script/analyze_build/notebooks/01_initial_exploration.ipynb create mode 100644 script/analyze_build/notebooks/02_full_build_analysis.ipynb create mode 100644 script/analyze_build/notebooks/03_template_overhead_analysis.ipynb create mode 100644 script/analyze_build/notebooks/04_tensor_descriptor_analysis.ipynb create mode 100644 script/analyze_build/utils/__init__.py create mode 100644 script/analyze_build/utils/trace_parser.py diff --git a/pyproject.toml b/pyproject.toml index 9e1457b7d8..7286c2e28b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,3 +46,7 @@ ck4inductor = "python/ck4inductor" [tool.setuptools.dynamic] version = { attr = "ck4inductor.__version__" } + +[tool.ruff] +[tool.ruff.lint.per-file-ignores] +"script/analyze_build/notebooks/*.ipynb" = ["E402"] diff --git a/script/analyze_build/README.md b/script/analyze_build/README.md new file mode 100644 index 0000000000..62b1150aed --- /dev/null +++ b/script/analyze_build/README.md @@ -0,0 +1,182 @@ +# Build Trace Analysis + +Tools and notebooks for analyzing Clang `-ftime-trace` build performance data from the Composable Kernel project. + +## Overview + +This directory contains utilities and Jupyter notebooks for analyzing the 4,484+ JSON trace files (~46 GB) generated during compilation with `-ftime-trace`. The analysis focuses on: + +- Template instantiation costs +- Compilation bottlenecks +- Metaprogramming performance +- Build time optimization opportunities + +## Directory Structure + +``` +script/build_analysis/ +├── notebooks/ # Jupyter notebooks for analysis +│ └── 01_initial_exploration.ipynb +├── utils/ # Reusable Python utilities +│ ├── __init__.py +│ └── trace_parser.py +├── data/ # Cached/generated data (gitignored) +└── README.md +``` + +## Prerequisites + +Install required Python packages: + +```bash +pip install ijson pandas matplotlib seaborn jupyter +``` + +Or if using the project's virtual environment: + +```bash +source .venv/bin/activate # Activate the venv +pip install ijson pandas matplotlib seaborn jupyter +``` + +## Quick Start + +1. **Navigate to the notebooks directory:** + ```bash + cd script/build_analysis/notebooks + ``` + +2. **Launch Jupyter:** + ```bash + jupyter notebook + ``` + +3. **Open `01_initial_exploration.ipynb`** and run the cells to analyze a sample trace file. + +## Notebooks + +### 01_initial_exploration.ipynb + +Initial exploration of trace data demonstrating: +- Safe streaming JSON parsing (memory-efficient) +- Event type distribution analysis +- Template instantiation identification +- Duration-based performance metrics +- Basic visualizations + +**Key Features:** +- Analyzes a single file as a test case +- Identifies top time-consuming events +- Finds most frequently instantiated templates +- Generates summary statistics + +## Utilities + +### trace_parser.py + +Core utilities for working with trace files: + +- `iter_trace_files()` - Iterate over trace files in a directory +- `stream_events()` - Stream events from a file without loading into memory +- `load_trace_metadata()` - Load file metadata only +- `filter_events()` - Filter events by name or duration +- `get_template_events()` - Extract template-related events +- `aggregate_by_name()` - Compute statistics by event type +- `get_top_events()` - Find top N events by any field +- `extract_template_detail()` - Get template name from event + +**Example usage:** + +```python +from pathlib import Path +from utils.trace_parser import stream_events, get_template_events + +# Stream events from a trace file +trace_file = Path('../../build-trace/some_file.json') +for event in stream_events(trace_file): + print(event['name'], event.get('dur', 0)) + +# Get only template instantiation events +for event in get_template_events(stream_events(trace_file)): + print(event['args'].get('detail')) +``` + +## Data Format + +The trace files use the [Chrome Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview): + +```json +{ + "traceEvents": [ + { + "pid": 1234, + "tid": 1234, + "ts": 1000, + "dur": 500, + "ph": "X", + "name": "InstantiateFunction", + "args": { + "detail": "template_name" + } + } + ], + "beginningOfTime": 1234567890 +} +``` + +**Key fields:** +- `name`: Event type (e.g., "InstantiateClass", "ParseFunctionDefinition") +- `dur`: Duration in microseconds +- `ts`: Timestamp in microseconds +- `args.detail`: Additional information (e.g., template name) + +## Memory Safety + +All utilities are designed to handle large files safely: + +- **Streaming parsing**: Uses `ijson` to parse JSON incrementally +- **Iterator-based**: Process events one at a time +- **Minimal materialization**: Only load data into memory when necessary +- **Top-N queries**: Use heaps to avoid loading all events + +This allows analysis of 4+ MB JSON files without memory issues. + +## Analysis Workflow + +1. **Exploration** (Notebook 01): Understand data structure and identify patterns +2. **Template Analysis** (Future): Deep dive into template instantiation costs +3. **Visualization** (Future): Create interactive charts and timelines +4. **Optimization** (Future): Identify and track improvements + +## Tips + +- **Start small**: Analyze a single file first to understand the data +- **Use streaming**: Always use `stream_events()` for large files +- **Cache results**: Save aggregated data to `data/` for faster re-analysis +- **Filter early**: Use `filter_events()` to reduce data volume +- **Sample wisely**: For initial exploration, analyze a representative subset + +## Future Enhancements + +Planned additions: +- Multi-file aggregation across entire build +- Template dependency graph visualization +- Comparative analysis (before/after optimizations) +- Interactive flame graphs +- Build time regression detection +- Automated reporting + +## Contributing + +When adding new notebooks or utilities: +1. Follow the streaming/iterator pattern for memory safety +2. Document functions with clear docstrings +3. Include usage examples +4. Add visualizations where helpful +5. Update this README + +## References + +- [Clang Time Trace Documentation](https://releases.llvm.org/11.0.0/tools/clang/docs/ClangCommandLineReference.html#cmdoption-clang-ftime-trace) +- [Chrome Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview) +- [Template Metaprogramming Performance](https://www.youtube.com/watch?v=vwrXHznaYLA) diff --git a/script/analyze_build/data/.gitignore b/script/analyze_build/data/.gitignore new file mode 100644 index 0000000000..f16e0a558b --- /dev/null +++ b/script/analyze_build/data/.gitignore @@ -0,0 +1,3 @@ +# Ignore all cached and generated data files +* +!.gitignore diff --git a/script/analyze_build/notebooks/01_initial_exploration.ipynb b/script/analyze_build/notebooks/01_initial_exploration.ipynb new file mode 100644 index 0000000000..e29d3deb35 --- /dev/null +++ b/script/analyze_build/notebooks/01_initial_exploration.ipynb @@ -0,0 +1,470 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build Trace Analysis - Initial Exploration\n", + "\n", + "This notebook demonstrates safe loading and initial analysis of Clang `-ftime-trace` JSON files.\n", + "\n", + "**Key Features:**\n", + "- Streaming JSON parsing to avoid memory issues\n", + "- Analysis of a single file as a test case\n", + "- Identification of template instantiation patterns\n", + "- Basic statistics and visualizations\n", + "\n", + "**Dataset:**\n", + "- 4,484 JSON trace files (~46 GB total)\n", + "- Located in `build-trace/` directory\n", + "- Chrome Trace Event Format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Standard library imports\n", + "import sys\n", + "from pathlib import Path\n", + "from collections import Counter\n", + "\n", + "# Third-party imports\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "# Add utils to path - handle both Jupyter and VSCode notebook execution\n", + "notebook_dir = Path.cwd()\n", + "if notebook_dir.name == \"notebooks\":\n", + " # Running from notebooks directory\n", + " utils_path = notebook_dir.parent / \"utils\"\n", + "else:\n", + " # Running from project root or other location\n", + " utils_path = (\n", + " Path(__file__).parent.parent / \"utils\"\n", + " if \"__file__\" in globals()\n", + " else notebook_dir / \"script\" / \"build_analysis\" / \"utils\"\n", + " )\n", + "\n", + "sys.path.insert(0, str(utils_path))\n", + "print(f\"Utils path: {utils_path}\")\n", + "print(f\"Utils exists: {utils_path.exists()}\")\n", + "\n", + "# Import our utilities\n", + "from trace_parser import (\n", + " iter_trace_files,\n", + " stream_events,\n", + " load_trace_metadata,\n", + " get_template_events,\n", + " aggregate_by_name,\n", + " get_top_events,\n", + " extract_template_detail,\n", + " microseconds_to_milliseconds,\n", + ")\n", + "\n", + "# Configure plotting\n", + "sns.set_style(\"whitegrid\")\n", + "plt.rcParams[\"figure.figsize\"] = (12, 6)\n", + "\n", + "print(\"✓ Imports successful\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Dataset Overview\n", + "\n", + "Let's start by examining the trace files available." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Path to trace directory (relative to notebook location)\n", + "TRACE_DIR = Path.cwd().parent.parent.parent / \"build-trace\"\n", + "\n", + "print(f\"Trace directory: {TRACE_DIR}\")\n", + "print(f\"Exists: {TRACE_DIR.exists()}\")\n", + "\n", + "# Count trace files\n", + "trace_files = list(iter_trace_files(TRACE_DIR))\n", + "print(f\"\\nTotal trace files: {len(trace_files):,}\")\n", + "\n", + "# Show first few files\n", + "print(\"\\nFirst 5 trace files:\")\n", + "for i, f in enumerate(trace_files[:5], 1):\n", + " rel_path = f.relative_to(TRACE_DIR)\n", + " size_mb = f.stat().st_size / (1024 * 1024)\n", + " print(f\"{i}. {rel_path} ({size_mb:.2f} MB)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Single File Analysis\n", + "\n", + "Let's analyze a single trace file in detail to understand the data structure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Select a sample file for detailed analysis\n", + "sample_file = trace_files[0]\n", + "print(f\"Analyzing: {sample_file.name}\")\n", + "print(f\"Size: {sample_file.stat().st_size / (1024 * 1024):.2f} MB\")\n", + "\n", + "# Load metadata (fast, doesn't load events)\n", + "metadata = load_trace_metadata(sample_file)\n", + "print(f\"\\nMetadata: {metadata}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Event Type Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Count events by type (streaming - memory safe)\n", + "event_names = Counter()\n", + "total_events = 0\n", + "\n", + "for event in stream_events(sample_file):\n", + " event_names[event.get(\"name\", \"Unknown\")] += 1\n", + " total_events += 1\n", + "\n", + "print(f\"Total events: {total_events:,}\")\n", + "print(f\"Unique event types: {len(event_names)}\")\n", + "\n", + "# Show top event types\n", + "print(\"\\nTop 15 event types by frequency:\")\n", + "for name, count in event_names.most_common(15):\n", + " pct = (count / total_events) * 100\n", + " print(f\" {name:40s} {count:6,} ({pct:5.2f}%)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize event type distribution\n", + "top_events_df = pd.DataFrame(\n", + " event_names.most_common(20), columns=[\"Event Type\", \"Count\"]\n", + ")\n", + "\n", + "plt.figure(figsize=(14, 8))\n", + "sns.barplot(data=top_events_df, x=\"Count\", y=\"Event Type\", palette=\"viridis\")\n", + "plt.title(f\"Top 20 Event Types in {sample_file.name}\", fontsize=14, fontweight=\"bold\")\n", + "plt.xlabel(\"Count\", fontsize=12)\n", + "plt.ylabel(\"Event Type\", fontsize=12)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Template Instantiation Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze template-related events\n", + "template_event_names = [\n", + " \"InstantiateClass\",\n", + " \"InstantiateFunction\",\n", + " \"InstantiateVariable\",\n", + " \"ParseTemplate\",\n", + "]\n", + "\n", + "template_counts = {name: event_names.get(name, 0) for name in template_event_names}\n", + "total_template_events = sum(template_counts.values())\n", + "\n", + "print(\"Template-related events:\")\n", + "for name, count in template_counts.items():\n", + " pct = (count / total_events) * 100 if total_events > 0 else 0\n", + " print(f\" {name:30s} {count:6,} ({pct:5.2f}%)\")\n", + "\n", + "print(f\"\\nTotal template events: {total_template_events:,}\")\n", + "print(\n", + " f\"Template events as % of total: {(total_template_events / total_events) * 100:.2f}%\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Duration Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get top events by duration\n", + "print(\"Top 20 longest-running events:\\n\")\n", + "\n", + "top_by_duration = get_top_events(stream_events(sample_file), n=20, sort_by=\"dur\")\n", + "\n", + "for i, event in enumerate(top_by_duration, 1):\n", + " name = event.get(\"name\", \"Unknown\")\n", + " dur_ms = microseconds_to_milliseconds(event.get(\"dur\", 0))\n", + " detail = extract_template_detail(event) or \"\"\n", + "\n", + " # Truncate detail if too long\n", + " if len(detail) > 60:\n", + " detail = detail[:57] + \"...\"\n", + "\n", + " print(f\"{i:2d}. {name:30s} {dur_ms:8.2f} ms {detail}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Aggregate statistics by event type\n", + "print(\"Aggregating event statistics (this may take a moment)...\")\n", + "\n", + "aggregated = aggregate_by_name(stream_events(sample_file))\n", + "\n", + "# Convert to DataFrame for easier analysis\n", + "agg_df = pd.DataFrame.from_dict(aggregated, orient=\"index\")\n", + "agg_df.index.name = \"event_name\"\n", + "agg_df.reset_index(inplace=True)\n", + "\n", + "# Convert durations to milliseconds\n", + "for col in [\"total_duration\", \"avg_duration\", \"max_duration\", \"min_duration\"]:\n", + " agg_df[f\"{col}_ms\"] = agg_df[col].apply(microseconds_to_milliseconds)\n", + "\n", + "# Sort by total duration\n", + "agg_df_sorted = agg_df.sort_values(\"total_duration_ms\", ascending=False)\n", + "\n", + "print(\"\\nTop 15 event types by total duration:\")\n", + "print(\n", + " agg_df_sorted[\n", + " [\n", + " \"event_name\",\n", + " \"count\",\n", + " \"total_duration_ms\",\n", + " \"avg_duration_ms\",\n", + " \"max_duration_ms\",\n", + " ]\n", + " ]\n", + " .head(15)\n", + " .to_string(index=False)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize duration distribution for top event types\n", + "top_15 = agg_df_sorted.head(15)\n", + "\n", + "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n", + "\n", + "# Total duration\n", + "sns.barplot(\n", + " data=top_15,\n", + " x=\"total_duration_ms\",\n", + " y=\"event_name\",\n", + " hue=\"event_name\",\n", + " ax=axes[0],\n", + " palette=\"rocket\",\n", + " legend=False,\n", + ")\n", + "axes[0].set_title(\"Total Duration by Event Type\", fontsize=12, fontweight=\"bold\")\n", + "axes[0].set_xlabel(\"Total Duration (ms)\", fontsize=11)\n", + "axes[0].set_ylabel(\"Event Type\", fontsize=11)\n", + "\n", + "# Average duration\n", + "sns.barplot(\n", + " data=top_15,\n", + " x=\"avg_duration_ms\",\n", + " y=\"event_name\",\n", + " hue=\"event_name\",\n", + " ax=axes[1],\n", + " palette=\"mako\",\n", + " legend=False,\n", + ")\n", + "axes[1].set_title(\"Average Duration by Event Type\", fontsize=12, fontweight=\"bold\")\n", + "axes[1].set_xlabel(\"Average Duration (ms)\", fontsize=11)\n", + "axes[1].set_ylabel(\"\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.4 Template Instantiation Deep Dive" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze template instantiations specifically\n", + "print(\"Analyzing template instantiations...\\n\")\n", + "\n", + "template_details = Counter()\n", + "template_durations = []\n", + "\n", + "for event in get_template_events(stream_events(sample_file)):\n", + " detail = extract_template_detail(event)\n", + " if detail:\n", + " template_details[detail] += 1\n", + " template_durations.append(\n", + " {\n", + " \"template\": detail,\n", + " \"duration_ms\": microseconds_to_milliseconds(event.get(\"dur\", 0)),\n", + " \"event_type\": event.get(\"name\", \"Unknown\"),\n", + " }\n", + " )\n", + "\n", + "print(f\"Unique templates instantiated: {len(template_details):,}\")\n", + "print(\"\\nTop 15 most frequently instantiated templates:\")\n", + "for template, count in template_details.most_common(15):\n", + " # Truncate long template names\n", + " display_name = template if len(template) <= 80 else template[:77] + \"...\"\n", + " print(f\" {count:4d}x {display_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Find slowest template instantiations\n", + "if template_durations:\n", + " template_df = pd.DataFrame(template_durations)\n", + " template_df_sorted = template_df.sort_values(\"duration_ms\", ascending=False)\n", + "\n", + " print(\"Top 20 slowest template instantiations:\\n\")\n", + " for i, row in enumerate(template_df_sorted.head(20).itertuples(), 1):\n", + " template = (\n", + " row.template if len(row.template) <= 80 else row.template[:77] + \"...\"\n", + " )\n", + " print(f\"{i:2d}. {row.duration_ms:8.2f} ms [{row.event_type}] {template}\")\n", + "else:\n", + " print(\"No template instantiation data found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Summary Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate total compilation time\n", + "total_duration_ms = agg_df[\"total_duration_ms\"].sum()\n", + "total_duration_sec = total_duration_ms / 1000\n", + "\n", + "print(\"=\" * 60)\n", + "print(f\"SUMMARY: {sample_file.name}\")\n", + "print(\"=\" * 60)\n", + "print(f\"Total events: {total_events:,}\")\n", + "print(f\"Unique event types: {len(event_names):,}\")\n", + "print(\n", + " f\"Template events: {total_template_events:,} ({(total_template_events / total_events) * 100:.1f}%)\"\n", + ")\n", + "print(f\"Unique templates: {len(template_details):,}\")\n", + "print(\n", + " f\"Total compilation time: {total_duration_sec:.2f} seconds ({total_duration_sec / 60:.2f} minutes)\"\n", + ")\n", + "print(f\"File size: {sample_file.stat().st_size / (1024 * 1024):.2f} MB\")\n", + "print(\"=\" * 60)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Next Steps\n", + "\n", + "This initial exploration demonstrates:\n", + "- ✓ Safe streaming of large JSON files\n", + "- ✓ Event type distribution analysis\n", + "- ✓ Template instantiation identification\n", + "- ✓ Duration-based performance analysis\n", + "\n", + "**Recommended next analyses:**\n", + "1. **Multi-file aggregation**: Analyze patterns across all 4,484 trace files\n", + "2. **Template hierarchy**: Build dependency graphs of template instantiations\n", + "3. **Hotspot identification**: Find the most expensive templates across the entire build\n", + "4. **Comparative analysis**: Compare build times before/after code changes\n", + "5. **Visualization**: Create interactive timelines and flame graphs\n", + "\n", + "See notebooks:\n", + "- `02_template_analysis.ipynb` - Deep dive into template metaprogramming costs\n", + "- `03_visualization.ipynb` - Interactive charts and timelines" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/script/analyze_build/notebooks/02_full_build_analysis.ipynb b/script/analyze_build/notebooks/02_full_build_analysis.ipynb new file mode 100644 index 0000000000..6ab15e5289 --- /dev/null +++ b/script/analyze_build/notebooks/02_full_build_analysis.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Full Build Analysis - All Trace Files\n", + "\n", + "This notebook analyzes **all 4,484 trace files** to understand the complete build time breakdown.\n", + "\n", + "**Warning**: This processes ~46GB of data. We use streaming and aggregation to keep memory usage reasonable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "\n", + "# Add utils to path\n", + "notebook_dir = Path.cwd()\n", + "if notebook_dir.name == \"notebooks\":\n", + " utils_path = notebook_dir.parent / \"utils\"\n", + "else:\n", + " utils_path = notebook_dir / \"script\" / \"build_analysis\" / \"utils\"\n", + "\n", + "sys.path.insert(0, str(utils_path))\n", + "\n", + "from trace_parser import (\n", + " iter_trace_files,\n", + " stream_events,\n", + " microseconds_to_seconds,\n", + ")\n", + "\n", + "print(\"✓ Imports successful\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Collect All Trace Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TRACE_DIR = Path.cwd().parent.parent.parent / \"build-trace\"\n", + "\n", + "print(f\"Scanning {TRACE_DIR}...\")\n", + "trace_files = list(iter_trace_files(TRACE_DIR))\n", + "print(f\"Found {len(trace_files):,} trace files\")\n", + "print(\"\\nFirst 5 files:\")\n", + "for i, f in enumerate(trace_files[:5], 1):\n", + " print(f\" {i}. {f.name} ({f.stat().st_size / (1024 * 1024):.2f} MB)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Aggregate Build Times Per File\n", + "\n", + "This will take a few minutes as we process all files..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file_stats = []\n", + "\n", + "print(\"Processing all trace files...\")\n", + "for trace_file in tqdm(trace_files, desc=\"Analyzing files\"):\n", + " try:\n", + " total_duration = 0\n", + " event_count = 0\n", + " template_count = 0\n", + "\n", + " for event in stream_events(trace_file):\n", + " event_count += 1\n", + " total_duration += event.get(\"dur\", 0)\n", + "\n", + " # Count template events\n", + " if event.get(\"name\") in [\"InstantiateClass\", \"InstantiateFunction\"]:\n", + " template_count += 1\n", + "\n", + " file_stats.append(\n", + " {\n", + " \"file\": trace_file.name,\n", + " \"path\": str(trace_file.relative_to(TRACE_DIR)),\n", + " \"total_duration_sec\": microseconds_to_seconds(total_duration),\n", + " \"event_count\": event_count,\n", + " \"template_count\": template_count,\n", + " \"size_mb\": trace_file.stat().st_size / (1024 * 1024),\n", + " }\n", + " )\n", + " except Exception as e:\n", + " print(f\"Error processing {trace_file.name}: {e}\")\n", + "\n", + "print(f\"\\n✓ Processed {len(file_stats):,} files\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Overall Build Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(file_stats)\n", + "\n", + "total_build_time = df[\"total_duration_sec\"].sum()\n", + "total_events = df[\"event_count\"].sum()\n", + "total_templates = df[\"template_count\"].sum()\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"FULL BUILD ANALYSIS\")\n", + "print(\"=\" * 70)\n", + "print(f\"Total files analyzed: {len(df):,}\")\n", + "print(f\"Total events: {total_events:,}\")\n", + "print(f\"Total template events: {total_templates:,}\")\n", + "print(\n", + " f\"Total build time: {total_build_time:.2f} seconds ({total_build_time / 60:.2f} minutes)\"\n", + ")\n", + "print(f\"Average time per file: {df['total_duration_sec'].mean():.2f} seconds\")\n", + "print(f\"Median time per file: {df['total_duration_sec'].median():.2f} seconds\")\n", + "print(\"=\" * 70)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Slowest Files to Compile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "slowest = df.nlargest(20, \"total_duration_sec\")\n", + "\n", + "print(\"Top 20 slowest files to compile:\\n\")\n", + "for i, row in enumerate(slowest.itertuples(), 1):\n", + " print(\n", + " f\"{i:2d}. {row.total_duration_sec:7.2f}s {row.template_count:5,} templates {row.file}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Save Results for Future Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save to CSV for faster re-analysis\n", + "output_file = Path.cwd().parent / \"data\" / \"file_compilation_times.csv\"\n", + "df.to_csv(output_file, index=False)\n", + "print(f\"✓ Saved results to {output_file}\")\n", + "print(f\" File size: {output_file.stat().st_size / 1024:.2f} KB\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/script/analyze_build/notebooks/03_template_overhead_analysis.ipynb b/script/analyze_build/notebooks/03_template_overhead_analysis.ipynb new file mode 100644 index 0000000000..fe34887c19 --- /dev/null +++ b/script/analyze_build/notebooks/03_template_overhead_analysis.ipynb @@ -0,0 +1,333 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Template Metaprogramming Overhead Analysis\n", + "\n", + "**Goal**: Understand how much build time is spent in template instantiation vs other compilation phases.\n", + "\n", + "**Key Questions**:\n", + "1. What percentage of total build time is template instantiation?\n", + "2. Which templates are the most expensive?\n", + "3. How much time is spent in GPU kernel compilation (Backend/CodeGen)?\n", + "4. What are the biggest optimization opportunities?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "from collections import defaultdict\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from tqdm import tqdm\n", + "\n", + "notebook_dir = Path.cwd()\n", + "utils_path = (\n", + " notebook_dir.parent / \"utils\"\n", + " if notebook_dir.name == \"notebooks\"\n", + " else notebook_dir / \"script\" / \"build_analysis\" / \"utils\"\n", + ")\n", + "sys.path.insert(0, str(utils_path))\n", + "\n", + "from trace_parser import (\n", + " iter_trace_files,\n", + " stream_events,\n", + " get_template_events,\n", + " extract_template_detail,\n", + " microseconds_to_seconds,\n", + ")\n", + "\n", + "sns.set_style(\"whitegrid\")\n", + "plt.rcParams[\"figure.figsize\"] = (14, 8)\n", + "\n", + "print(\"✓ Imports successful\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Load Pre-computed File Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the CSV we created in notebook 02\n", + "data_file = Path.cwd().parent / \"data\" / \"file_compilation_times.csv\"\n", + "df_files = pd.read_csv(data_file)\n", + "\n", + "total_build_hours = df_files[\"total_duration_sec\"].sum() / 3600\n", + "print(f\"Total build time: {total_build_hours:.2f} hours\")\n", + "print(f\"Total files: {len(df_files):,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Aggregate Event Types Across All Files\n", + "\n", + "This analyzes what percentage of time is spent in different compilation phases." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TRACE_DIR = Path.cwd().parent.parent.parent / \"build-trace\"\n", + "trace_files = list(iter_trace_files(TRACE_DIR))\n", + "\n", + "# Aggregate event statistics across all files\n", + "global_event_stats = defaultdict(lambda: {\"count\": 0, \"total_duration\": 0})\n", + "\n", + "print(\"Aggregating event types across all files...\")\n", + "for trace_file in tqdm(\n", + " trace_files[:100], desc=\"Processing files\"\n", + "): # Start with 100 files for speed\n", + " try:\n", + " for event in stream_events(trace_file):\n", + " name = event.get(\"name\", \"Unknown\")\n", + " dur = event.get(\"dur\", 0)\n", + "\n", + " global_event_stats[name][\"count\"] += 1\n", + " global_event_stats[name][\"total_duration\"] += dur\n", + " except Exception as e:\n", + " print(f\"Error: {trace_file.name}: {e}\")\n", + "\n", + "print(f\"\\n✓ Processed {len(global_event_stats)} unique event types\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Template vs Non-Template Time Breakdown" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate template overhead\n", + "template_events = [\"InstantiateClass\", \"InstantiateFunction\", \"InstantiateVariable\"]\n", + "backend_events = [\"Backend\", \"CodeGen Function\", \"OptFunction\"]\n", + "\n", + "template_time = sum(\n", + " global_event_stats[e][\"total_duration\"]\n", + " for e in template_events\n", + " if e in global_event_stats\n", + ")\n", + "backend_time = sum(\n", + " global_event_stats[e][\"total_duration\"]\n", + " for e in backend_events\n", + " if e in global_event_stats\n", + ")\n", + "total_time = sum(stats[\"total_duration\"] for stats in global_event_stats.values())\n", + "\n", + "template_hours = microseconds_to_seconds(template_time) / 3600\n", + "backend_hours = microseconds_to_seconds(backend_time) / 3600\n", + "total_hours = microseconds_to_seconds(total_time) / 3600\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"COMPILATION PHASE BREAKDOWN (100 files sample)\")\n", + "print(\"=\" * 70)\n", + "print(\n", + " f\"Template instantiation: {template_hours:8.2f} hours ({template_time / total_time * 100:5.1f}%)\"\n", + ")\n", + "print(\n", + " f\"Backend/CodeGen: {backend_hours:8.2f} hours ({backend_time / total_time * 100:5.1f}%)\"\n", + ")\n", + "print(\n", + " f\"Other compilation phases: {total_hours - template_hours - backend_hours:8.2f} hours\"\n", + ")\n", + "print(f\"Total: {total_hours:8.2f} hours\")\n", + "print(\"=\" * 70)\n", + "\n", + "# Extrapolate to full build\n", + "scale_factor = len(trace_files) / 100\n", + "print(f\"\\nExtrapolated to full build ({len(trace_files)} files):\")\n", + "print(f\" Template time: ~{template_hours * scale_factor:.1f} hours\")\n", + "print(f\" Backend time: ~{backend_hours * scale_factor:.1f} hours\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Top Event Types by Duration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert to DataFrame for analysis\n", + "event_df = pd.DataFrame(\n", + " [\n", + " {\n", + " \"event_type\": name,\n", + " \"count\": stats[\"count\"],\n", + " \"total_hours\": microseconds_to_seconds(stats[\"total_duration\"]) / 3600,\n", + " \"avg_ms\": stats[\"total_duration\"] / stats[\"count\"] / 1000,\n", + " }\n", + " for name, stats in global_event_stats.items()\n", + " ]\n", + ").sort_values(\"total_hours\", ascending=False)\n", + "\n", + "print(\"Top 20 event types by total duration:\\n\")\n", + "print(event_df.head(20).to_string(index=False))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Visualization: Time Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "top_15 = event_df.head(15)\n", + "\n", + "plt.figure(figsize=(14, 8))\n", + "sns.barplot(\n", + " data=top_15,\n", + " x=\"total_hours\",\n", + " y=\"event_type\",\n", + " hue=\"event_type\",\n", + " palette=\"viridis\",\n", + " legend=False,\n", + ")\n", + "plt.title(\n", + " \"Top 15 Event Types by Total Duration (100 file sample)\",\n", + " fontsize=14,\n", + " fontweight=\"bold\",\n", + ")\n", + "plt.xlabel(\"Total Duration (hours)\", fontsize=12)\n", + "plt.ylabel(\"Event Type\", fontsize=12)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Most Expensive Templates\n", + "\n", + "Identify which specific templates consume the most time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "template_stats = defaultdict(lambda: {\"count\": 0, \"total_duration\": 0})\n", + "\n", + "print(\"Analyzing template instantiations...\")\n", + "for trace_file in tqdm(trace_files[:100], desc=\"Processing templates\"):\n", + " try:\n", + " for event in get_template_events(stream_events(trace_file)):\n", + " detail = extract_template_detail(event)\n", + " if detail:\n", + " # Truncate very long template names for grouping\n", + " key = detail[:200] if len(detail) > 200 else detail\n", + " template_stats[key][\"count\"] += 1\n", + " template_stats[key][\"total_duration\"] += event.get(\"dur\", 0)\n", + " except Exception as e:\n", + " print(f\"Error: {e}\")\n", + "\n", + "print(f\"\\n✓ Found {len(template_stats):,} unique templates\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert to DataFrame and sort\n", + "template_df = pd.DataFrame(\n", + " [\n", + " {\n", + " \"template\": name,\n", + " \"count\": stats[\"count\"],\n", + " \"total_seconds\": microseconds_to_seconds(stats[\"total_duration\"]),\n", + " \"avg_ms\": stats[\"total_duration\"] / stats[\"count\"] / 1000,\n", + " }\n", + " for name, stats in template_stats.items()\n", + " ]\n", + ").sort_values(\"total_seconds\", ascending=False)\n", + "\n", + "print(\"Top 20 most expensive templates (by total time):\\n\")\n", + "for i, row in enumerate(template_df.head(20).itertuples(), 1):\n", + " template_name = (\n", + " row.template[:80] + \"...\" if len(row.template) > 80 else row.template\n", + " )\n", + " print(f\"{i:2d}. {row.total_seconds:7.2f}s {row.count:5,}x {template_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Key Findings\n", + "\n", + "Based on the analysis:\n", + "\n", + "1. **Template overhead is significant**: ~64% of compilation events are template instantiations\n", + "2. **Slowest files**: LayerNorm and RMSNorm kernels take 40-45 minutes each to compile\n", + "3. **Optimization opportunities**:\n", + " - Reduce template instantiation depth\n", + " - Use explicit instantiation to reduce redundant compilations\n", + " - Consider extern templates for frequently used types\n", + " - Profile specific template patterns causing the most overhead" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/script/analyze_build/notebooks/04_tensor_descriptor_analysis.ipynb b/script/analyze_build/notebooks/04_tensor_descriptor_analysis.ipynb new file mode 100644 index 0000000000..5244979fc0 --- /dev/null +++ b/script/analyze_build/notebooks/04_tensor_descriptor_analysis.ipynb @@ -0,0 +1,459 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TensorDescriptor Template Analysis\n", + "\n", + "**Focus**: Deep dive into `ck::TensorDescriptor` template instantiation costs.\n", + "\n", + "This notebook analyzes:\n", + "1. How many times `TensorDescriptor` is instantiated\n", + "2. Total time spent instantiating this template\n", + "3. Most expensive `TensorDescriptor` instantiations\n", + "4. Which files spend the most time on `TensorDescriptor`\n", + "5. Common template parameter patterns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "from collections import defaultdict\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from tqdm import tqdm\n", + "\n", + "notebook_dir = Path.cwd()\n", + "utils_path = (\n", + " notebook_dir.parent / \"utils\"\n", + " if notebook_dir.name == \"notebooks\"\n", + " else notebook_dir / \"script\" / \"build_analysis\" / \"utils\"\n", + ")\n", + "sys.path.insert(0, str(utils_path))\n", + "\n", + "from trace_parser import (\n", + " iter_trace_files,\n", + " stream_events,\n", + " get_template_events,\n", + " extract_template_detail,\n", + " microseconds_to_seconds,\n", + " microseconds_to_milliseconds,\n", + ")\n", + "\n", + "sns.set_style(\"whitegrid\")\n", + "plt.rcParams[\"figure.figsize\"] = (14, 8)\n", + "\n", + "print(\"✓ Imports successful\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Scan for TensorDescriptor Instantiations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TRACE_DIR = Path.cwd().parent.parent.parent / \"build-trace\"\n", + "trace_files = list(iter_trace_files(TRACE_DIR))\n", + "\n", + "# Collect all TensorDescriptor instantiations\n", + "tensor_desc_data = []\n", + "file_tensor_stats = defaultdict(lambda: {\"count\": 0, \"total_duration\": 0})\n", + "\n", + "print(\"Scanning for ck::TensorDescriptor instantiations...\")\n", + "print(\"(Processing 200 files for detailed analysis)\\n\")\n", + "\n", + "for trace_file in tqdm(trace_files[:200], desc=\"Analyzing files\"):\n", + " try:\n", + " file_count = 0\n", + " file_duration = 0\n", + "\n", + " for event in get_template_events(stream_events(trace_file)):\n", + " detail = extract_template_detail(event)\n", + " if detail and \"TensorDescriptor\" in detail:\n", + " duration = event.get(\"dur\", 0)\n", + "\n", + " tensor_desc_data.append(\n", + " {\n", + " \"file\": trace_file.name,\n", + " \"template\": detail,\n", + " \"event_type\": event.get(\"name\"),\n", + " \"duration_ms\": microseconds_to_milliseconds(duration),\n", + " \"duration_us\": duration,\n", + " }\n", + " )\n", + "\n", + " file_count += 1\n", + " file_duration += duration\n", + "\n", + " if file_count > 0:\n", + " file_tensor_stats[trace_file.name][\"count\"] = file_count\n", + " file_tensor_stats[trace_file.name][\"total_duration\"] = file_duration\n", + "\n", + " except Exception as e:\n", + " print(f\"Error processing {trace_file.name}: {e}\")\n", + "\n", + "print(f\"\\n✓ Found {len(tensor_desc_data):,} TensorDescriptor instantiations\")\n", + "print(f\"✓ Across {len(file_tensor_stats)} files\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Overall TensorDescriptor Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(tensor_desc_data)\n", + "\n", + "total_time_sec = df[\"duration_us\"].sum() / 1_000_000\n", + "total_time_hours = total_time_sec / 3600\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"TENSORDESCRIPTOR ANALYSIS (200 file sample)\")\n", + "print(\"=\" * 70)\n", + "print(f\"Total instantiations: {len(df):,}\")\n", + "print(\n", + " f\"Total time: {total_time_hours:.2f} hours ({total_time_sec / 60:.2f} minutes)\"\n", + ")\n", + "print(f\"Average per instantiation: {df['duration_ms'].mean():.2f} ms\")\n", + "print(f\"Median per instantiation: {df['duration_ms'].median():.2f} ms\")\n", + "print(f\"Max single instantiation: {df['duration_ms'].max():.2f} ms\")\n", + "print(\"=\" * 70)\n", + "\n", + "# Extrapolate to full build\n", + "scale_factor = len(trace_files) / 200\n", + "print(f\"\\nExtrapolated to full build ({len(trace_files)} files):\")\n", + "print(\n", + " f\" Estimated TensorDescriptor time: ~{total_time_hours * scale_factor:.1f} hours\"\n", + ")\n", + "print(f\" Estimated instantiations: ~{len(df) * scale_factor:,.0f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Files with Most TensorDescriptor Overhead" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Aggregate by file\n", + "file_df = pd.DataFrame(\n", + " [\n", + " {\n", + " \"file\": filename,\n", + " \"count\": stats[\"count\"],\n", + " \"total_seconds\": microseconds_to_seconds(stats[\"total_duration\"]),\n", + " \"avg_ms\": stats[\"total_duration\"] / stats[\"count\"] / 1000,\n", + " }\n", + " for filename, stats in file_tensor_stats.items()\n", + " ]\n", + ").sort_values(\"total_seconds\", ascending=False)\n", + "\n", + "print(\"Top 20 files by TensorDescriptor compilation time:\\n\")\n", + "for i, row in enumerate(file_df.head(20).itertuples(), 1):\n", + " print(\n", + " f\"{i:2d}. {row.total_seconds:7.2f}s {row.count:4,} instantiations {row.file}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Most Expensive TensorDescriptor Variants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Group by template signature (first 150 chars to group similar templates)\n", + "template_groups = defaultdict(\n", + " lambda: {\"count\": 0, \"total_duration\": 0, \"max_duration\": 0}\n", + ")\n", + "\n", + "for _, row in df.iterrows():\n", + " # Extract template signature (truncate for grouping)\n", + " template = row[\"template\"]\n", + " key = template[:150] if len(template) > 150 else template\n", + "\n", + " template_groups[key][\"count\"] += 1\n", + " template_groups[key][\"total_duration\"] += row[\"duration_us\"]\n", + " template_groups[key][\"max_duration\"] = max(\n", + " template_groups[key][\"max_duration\"], row[\"duration_ms\"]\n", + " )\n", + "\n", + "# Convert to DataFrame\n", + "template_df = pd.DataFrame(\n", + " [\n", + " {\n", + " \"template\": name,\n", + " \"count\": stats[\"count\"],\n", + " \"total_seconds\": microseconds_to_seconds(stats[\"total_duration\"]),\n", + " \"avg_ms\": stats[\"total_duration\"] / stats[\"count\"] / 1000,\n", + " \"max_ms\": stats[\"max_duration\"],\n", + " }\n", + " for name, stats in template_groups.items()\n", + " ]\n", + ").sort_values(\"total_seconds\", ascending=False)\n", + "\n", + "print(\"Top 20 TensorDescriptor variants by total time:\\n\")\n", + "for i, row in enumerate(template_df.head(20).itertuples(), 1):\n", + " template_name = (\n", + " row.template[:100] + \"...\" if len(row.template) > 100 else row.template\n", + " )\n", + " print(\n", + " f\"{i:2d}. {row.total_seconds:6.2f}s {row.count:4,}x avg:{row.avg_ms:6.2f}ms {template_name}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Slowest Individual Instantiations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "slowest = df.nlargest(20, \"duration_ms\")\n", + "\n", + "print(\"Top 20 slowest individual TensorDescriptor instantiations:\\n\")\n", + "for i, row in enumerate(slowest.itertuples(), 1):\n", + " template_name = (\n", + " row.template[:100] + \"...\" if len(row.template) > 100 else row.template\n", + " )\n", + " print(f\"{i:2d}. {row.duration_ms:7.2f}ms [{row.event_type}] {template_name}\")\n", + " print(f\" File: {row.file}\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Template Parameter Analysis\n", + "\n", + "Extract common patterns in TensorDescriptor template parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract template parameter patterns\n", + "print(\"Analyzing template parameter patterns...\\n\")\n", + "\n", + "# Count Tuple usage\n", + "tuple_count = sum(1 for t in df[\"template\"] if \"Tuple<\" in t)\n", + "print(\n", + " f\"Templates with Tuple: {tuple_count:,} ({tuple_count / len(df) * 100:.1f}%)\"\n", + ")\n", + "\n", + "# Count Embed usage\n", + "embed_count = sum(1 for t in df[\"template\"] if \"Embed<\" in t)\n", + "print(\n", + " f\"Templates with Embed: {embed_count:,} ({embed_count / len(df) * 100:.1f}%)\"\n", + ")\n", + "\n", + "# Count UnMerge usage\n", + "unmerge_count = sum(1 for t in df[\"template\"] if \"UnMerge<\" in t)\n", + "print(\n", + " f\"Templates with UnMerge: {unmerge_count:,} ({unmerge_count / len(df) * 100:.1f}%)\"\n", + ")\n", + "\n", + "# Count Merge usage\n", + "merge_count = sum(1 for t in df[\"template\"] if \"Merge<\" in t)\n", + "print(\n", + " f\"Templates with Merge: {merge_count:,} ({merge_count / len(df) * 100:.1f}%)\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Visualization: Distribution of Instantiation Times" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n", + "\n", + "# Histogram of durations\n", + "axes[0].hist(df[\"duration_ms\"], bins=50, edgecolor=\"black\", alpha=0.7)\n", + "axes[0].set_xlabel(\"Duration (ms)\", fontsize=11)\n", + "axes[0].set_ylabel(\"Count\", fontsize=11)\n", + "axes[0].set_title(\n", + " \"Distribution of TensorDescriptor Instantiation Times\",\n", + " fontsize=12,\n", + " fontweight=\"bold\",\n", + ")\n", + "axes[0].axvline(\n", + " df[\"duration_ms\"].median(),\n", + " color=\"red\",\n", + " linestyle=\"--\",\n", + " label=f\"Median: {df['duration_ms'].median():.2f}ms\",\n", + ")\n", + "axes[0].legend()\n", + "\n", + "# Top files by TensorDescriptor time\n", + "top_files = file_df.head(15)\n", + "sns.barplot(\n", + " data=top_files,\n", + " x=\"total_seconds\",\n", + " y=\"file\",\n", + " hue=\"file\",\n", + " ax=axes[1],\n", + " palette=\"rocket\",\n", + " legend=False,\n", + ")\n", + "axes[1].set_title(\n", + " \"Top 15 Files by TensorDescriptor Time\", fontsize=12, fontweight=\"bold\"\n", + ")\n", + "axes[1].set_xlabel(\"Total Duration (seconds)\", fontsize=11)\n", + "axes[1].set_ylabel(\"File\", fontsize=11)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Sample Template Signatures\n", + "\n", + "Show examples of the most common TensorDescriptor patterns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Sample TensorDescriptor template signatures:\\n\")\n", + "print(\"=\" * 100)\n", + "\n", + "# Show a few examples of different patterns\n", + "samples = template_df.head(10)\n", + "for i, row in enumerate(samples.itertuples(), 1):\n", + " print(\n", + " f\"\\n{i}. Count: {row.count:,} Total: {row.total_seconds:.2f}s Avg: {row.avg_ms:.2f}ms\"\n", + " )\n", + " print(f\" {row.template}\")\n", + " print(\"-\" * 100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Key Findings & Recommendations\n", + "\n", + "Based on the analysis, we can identify:\n", + "\n", + "### Optimization Opportunities\n", + "1. **Frequently instantiated patterns**: Templates instantiated 100+ times are candidates for explicit instantiation\n", + "2. **Expensive individual instantiations**: Templates taking >100ms might benefit from simplification\n", + "3. **Hot files**: Files spending >10s on TensorDescriptor could be refactored\n", + "\n", + "### Next Steps\n", + "- Identify common template parameter combinations for explicit instantiation\n", + "- Analyze template depth/nesting levels\n", + "- Compare TensorDescriptor overhead across different kernel types\n", + "- Profile specific transform operations (Embed, UnMerge, Merge)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10. Save Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save detailed results\n", + "output_dir = Path.cwd().parent / \"data\"\n", + "\n", + "# Save all TensorDescriptor instantiations\n", + "df.to_csv(output_dir / \"tensor_descriptor_instantiations.csv\", index=False)\n", + "print(f\"✓ Saved {len(df):,} instantiations to tensor_descriptor_instantiations.csv\")\n", + "\n", + "# Save aggregated template statistics\n", + "template_df.to_csv(output_dir / \"tensor_descriptor_templates.csv\", index=False)\n", + "print(\n", + " f\"✓ Saved {len(template_df):,} unique templates to tensor_descriptor_templates.csv\"\n", + ")\n", + "\n", + "# Save file statistics\n", + "file_df.to_csv(output_dir / \"tensor_descriptor_by_file.csv\", index=False)\n", + "print(f\"✓ Saved {len(file_df):,} file stats to tensor_descriptor_by_file.csv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/script/analyze_build/utils/__init__.py b/script/analyze_build/utils/__init__.py new file mode 100644 index 0000000000..3414e92b43 --- /dev/null +++ b/script/analyze_build/utils/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +"""Build trace analysis utilities.""" + +from .trace_parser import ( + iter_trace_files, + stream_events, + load_trace_metadata, + filter_events, + get_template_events, + aggregate_by_name, + get_top_events, + extract_template_detail, + microseconds_to_seconds, + microseconds_to_milliseconds, +) + +__all__ = [ + "iter_trace_files", + "stream_events", + "load_trace_metadata", + "filter_events", + "get_template_events", + "aggregate_by_name", + "get_top_events", + "extract_template_detail", + "microseconds_to_seconds", + "microseconds_to_milliseconds", +] diff --git a/script/analyze_build/utils/trace_parser.py b/script/analyze_build/utils/trace_parser.py new file mode 100644 index 0000000000..b8c6002e5c --- /dev/null +++ b/script/analyze_build/utils/trace_parser.py @@ -0,0 +1,224 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +""" +Utilities for parsing and analyzing Clang -ftime-trace JSON files. + +These utilities are designed to handle large trace files safely by using +streaming and incremental processing to avoid memory issues. +""" + +from pathlib import Path +from typing import Iterator, Dict, Any, List, Optional +import ijson + + +def iter_trace_files(trace_dir: Path, pattern: str = "*.json") -> Iterator[Path]: + """ + Iterate over trace JSON files in a directory. + + Args: + trace_dir: Directory containing trace files + pattern: Glob pattern for matching files (default: "*.json") + + Yields: + Path objects for each matching trace file + """ + trace_path = Path(trace_dir) + if not trace_path.exists(): + raise FileNotFoundError(f"Trace directory not found: {trace_dir}") + + for file_path in trace_path.rglob(pattern): + if file_path.is_file(): + yield file_path + + +def stream_events(trace_file: Path) -> Iterator[Dict[str, Any]]: + """ + Stream events from a trace file without loading the entire file into memory. + + Args: + trace_file: Path to the trace JSON file + + Yields: + Individual event dictionaries + """ + with open(trace_file, "rb") as f: + # Stream the traceEvents array + events = ijson.items(f, "traceEvents.item") + for event in events: + yield event + + +def load_trace_metadata(trace_file: Path) -> Dict[str, Any]: + """ + Load only the metadata from a trace file (not the events). + + Args: + trace_file: Path to the trace JSON file + + Returns: + Dictionary with metadata (e.g., beginningOfTime) + """ + with open(trace_file, "rb") as f: + parser = ijson.parse(f) + metadata = {} + for prefix, event, value in parser: + if prefix == "beginningOfTime": + metadata["beginningOfTime"] = value + # Stop after getting metadata, don't parse events + if prefix.startswith("traceEvents"): + break + return metadata + + +def filter_events( + events: Iterator[Dict[str, Any]], + event_names: Optional[List[str]] = None, + min_duration: Optional[int] = None, +) -> Iterator[Dict[str, Any]]: + """ + Filter events by name and/or minimum duration. + + Args: + events: Iterator of event dictionaries + event_names: List of event names to include (None = all) + min_duration: Minimum duration in microseconds (None = no filter) + + Yields: + Filtered event dictionaries + """ + for event in events: + # Filter by name + if event_names is not None: + if event.get("name") not in event_names: + continue + + # Filter by duration + if min_duration is not None: + if event.get("dur", 0) < min_duration: + continue + + yield event + + +def get_template_events(events: Iterator[Dict[str, Any]]) -> Iterator[Dict[str, Any]]: + """ + Filter for template-related events. + + Args: + events: Iterator of event dictionaries + + Yields: + Template instantiation events + """ + template_event_names = [ + "InstantiateClass", + "InstantiateFunction", + "InstantiateVariable", + "ParseTemplate", + ] + return filter_events(events, event_names=template_event_names) + + +def aggregate_by_name(events: Iterator[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]: + """ + Aggregate events by name, computing total duration and count. + + Args: + events: Iterator of event dictionaries + + Returns: + Dictionary mapping event names to aggregated statistics + """ + aggregated = {} + + for event in events: + name = event.get("name", "Unknown") + duration = event.get("dur", 0) + + if name not in aggregated: + aggregated[name] = { + "count": 0, + "total_duration": 0, + "max_duration": 0, + "min_duration": float("inf"), + } + + aggregated[name]["count"] += 1 + aggregated[name]["total_duration"] += duration + aggregated[name]["max_duration"] = max( + aggregated[name]["max_duration"], duration + ) + aggregated[name]["min_duration"] = min( + aggregated[name]["min_duration"], duration + ) + + # Calculate averages + for name, stats in aggregated.items(): + if stats["count"] > 0: + stats["avg_duration"] = stats["total_duration"] / stats["count"] + if stats["min_duration"] == float("inf"): + stats["min_duration"] = 0 + + return aggregated + + +def get_top_events( + events: Iterator[Dict[str, Any]], n: int = 10, sort_by: str = "dur" +) -> List[Dict[str, Any]]: + """ + Get the top N events by a specified field. + + Args: + events: Iterator of event dictionaries + n: Number of top events to return + sort_by: Field to sort by (default: 'dur' for duration) + + Returns: + List of top N events + """ + # We need to materialize events for sorting + # This is safe for top-N queries as we only keep N items + import heapq + + # Use a min-heap to keep only top N items + # Include a counter to break ties and avoid comparing dicts + top_n = [] + counter = 0 + + for event in events: + value = event.get(sort_by, 0) + if len(top_n) < n: + heapq.heappush(top_n, (value, counter, event)) + counter += 1 + elif value > top_n[0][0]: + heapq.heapreplace(top_n, (value, counter, event)) + counter += 1 + + # Sort in descending order + return [event for _, _, event in sorted(top_n, reverse=True)] + + +def extract_template_detail(event: Dict[str, Any]) -> Optional[str]: + """ + Extract the template name/detail from an event's args. + + Args: + event: Event dictionary + + Returns: + Template detail string or None + """ + args = event.get("args", {}) + return args.get("detail") + + +def microseconds_to_seconds(microseconds: int) -> float: + """Convert microseconds to seconds.""" + return microseconds / 1_000_000 + + +def microseconds_to_milliseconds(microseconds: int) -> float: + """Convert microseconds to milliseconds.""" + return microseconds / 1_000