From d0461ef29950cac6fc658f2d4cebb2e78ff8c6ba Mon Sep 17 00:00:00 2001
From: John Shumway <jshumway@amd.com>
Date: Fri, 12 Dec 2025 15:37:45 -0500
Subject: [PATCH] Experiment with notebooks to analyze build times

---
 pyproject.toml                                |   4 +
 script/analyze_build/README.md                | 182 +++++++
 script/analyze_build/data/.gitignore          |   3 +
 .../notebooks/01_initial_exploration.ipynb    | 470 ++++++++++++++++++
 .../notebooks/02_full_build_analysis.ipynb    | 211 ++++++++
 .../03_template_overhead_analysis.ipynb       | 333 +++++++++++++
 .../04_tensor_descriptor_analysis.ipynb       | 459 +++++++++++++++++
 script/analyze_build/utils/__init__.py        |  30 ++
 script/analyze_build/utils/trace_parser.py    | 224 +++++++++
 9 files changed, 1916 insertions(+)
 create mode 100644 script/analyze_build/README.md
 create mode 100644 script/analyze_build/data/.gitignore
 create mode 100644 script/analyze_build/notebooks/01_initial_exploration.ipynb
 create mode 100644 script/analyze_build/notebooks/02_full_build_analysis.ipynb
 create mode 100644 script/analyze_build/notebooks/03_template_overhead_analysis.ipynb
 create mode 100644 script/analyze_build/notebooks/04_tensor_descriptor_analysis.ipynb
 create mode 100644 script/analyze_build/utils/__init__.py
 create mode 100644 script/analyze_build/utils/trace_parser.py

diff --git a/pyproject.toml b/pyproject.toml
index 9e1457b7d8..7286c2e28b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,3 +46,7 @@ ck4inductor = "python/ck4inductor"
 
 [tool.setuptools.dynamic]
 version = { attr = "ck4inductor.__version__" }
+
+[tool.ruff]
+[tool.ruff.lint.per-file-ignores]
+"script/analyze_build/notebooks/*.ipynb" = ["E402"]
diff --git a/script/analyze_build/README.md b/script/analyze_build/README.md
new file mode 100644
index 0000000000..62b1150aed
--- /dev/null
+++ b/script/analyze_build/README.md
@@ -0,0 +1,182 @@
+# Build Trace Analysis
+
+Tools and notebooks for analyzing Clang `-ftime-trace` build performance data from the Composable Kernel project.
+
+## Overview
+
+This directory contains utilities and Jupyter notebooks for analyzing the 4,484+ JSON trace files (~46 GB) generated during compilation with `-ftime-trace`. The analysis focuses on:
+
+- Template instantiation costs
+- Compilation bottlenecks
+- Metaprogramming performance
+- Build time optimization opportunities
+
+## Directory Structure
+
+```
+script/build_analysis/
+├── notebooks/           # Jupyter notebooks for analysis
+│   └── 01_initial_exploration.ipynb
+├── utils/              # Reusable Python utilities
+│   ├── __init__.py
+│   └── trace_parser.py
+├── data/               # Cached/generated data (gitignored)
+└── README.md
+```
+
+## Prerequisites
+
+Install required Python packages:
+
+```bash
+pip install ijson pandas matplotlib seaborn jupyter
+```
+
+Or if using the project's virtual environment:
+
+```bash
+source .venv/bin/activate  # Activate the venv
+pip install ijson pandas matplotlib seaborn jupyter
+```
+
+## Quick Start
+
+1. **Navigate to the notebooks directory:**
+   ```bash
+   cd script/build_analysis/notebooks
+   ```
+
+2. **Launch Jupyter:**
+   ```bash
+   jupyter notebook
+   ```
+
+3. **Open `01_initial_exploration.ipynb`** and run the cells to analyze a sample trace file.
+
+## Notebooks
+
+### 01_initial_exploration.ipynb
+
+Initial exploration of trace data demonstrating:
+- Safe streaming JSON parsing (memory-efficient)
+- Event type distribution analysis
+- Template instantiation identification
+- Duration-based performance metrics
+- Basic visualizations
+
+**Key Features:**
+- Analyzes a single file as a test case
+- Identifies top time-consuming events
+- Finds most frequently instantiated templates
+- Generates summary statistics
+
+## Utilities
+
+### trace_parser.py
+
+Core utilities for working with trace files:
+
+- `iter_trace_files()` - Iterate over trace files in a directory
+- `stream_events()` - Stream events from a file without loading into memory
+- `load_trace_metadata()` - Load file metadata only
+- `filter_events()` - Filter events by name or duration
+- `get_template_events()` - Extract template-related events
+- `aggregate_by_name()` - Compute statistics by event type
+- `get_top_events()` - Find top N events by any field
+- `extract_template_detail()` - Get template name from event
+
+**Example usage:**
+
+```python
+from pathlib import Path
+from utils.trace_parser import stream_events, get_template_events
+
+# Stream events from a trace file
+trace_file = Path('../../build-trace/some_file.json')
+for event in stream_events(trace_file):
+    print(event['name'], event.get('dur', 0))
+
+# Get only template instantiation events
+for event in get_template_events(stream_events(trace_file)):
+    print(event['args'].get('detail'))
+```
+
+## Data Format
+
+The trace files use the [Chrome Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview):
+
+```json
+{
+  "traceEvents": [
+    {
+      "pid": 1234,
+      "tid": 1234,
+      "ts": 1000,
+      "dur": 500,
+      "ph": "X",
+      "name": "InstantiateFunction",
+      "args": {
+        "detail": "template_name<Args...>"
+      }
+    }
+  ],
+  "beginningOfTime": 1234567890
+}
+```
+
+**Key fields:**
+- `name`: Event type (e.g., "InstantiateClass", "ParseFunctionDefinition")
+- `dur`: Duration in microseconds
+- `ts`: Timestamp in microseconds
+- `args.detail`: Additional information (e.g., template name)
+
+## Memory Safety
+
+All utilities are designed to handle large files safely:
+
+- **Streaming parsing**: Uses `ijson` to parse JSON incrementally
+- **Iterator-based**: Process events one at a time
+- **Minimal materialization**: Only load data into memory when necessary
+- **Top-N queries**: Use heaps to avoid loading all events
+
+This allows analysis of 4+ MB JSON files without memory issues.
+
+## Analysis Workflow
+
+1. **Exploration** (Notebook 01): Understand data structure and identify patterns
+2. **Template Analysis** (Future): Deep dive into template instantiation costs
+3. **Visualization** (Future): Create interactive charts and timelines
+4. **Optimization** (Future): Identify and track improvements
+
+## Tips
+
+- **Start small**: Analyze a single file first to understand the data
+- **Use streaming**: Always use `stream_events()` for large files
+- **Cache results**: Save aggregated data to `data/` for faster re-analysis
+- **Filter early**: Use `filter_events()` to reduce data volume
+- **Sample wisely**: For initial exploration, analyze a representative subset
+
+## Future Enhancements
+
+Planned additions:
+- Multi-file aggregation across entire build
+- Template dependency graph visualization
+- Comparative analysis (before/after optimizations)
+- Interactive flame graphs
+- Build time regression detection
+- Automated reporting
+
+## Contributing
+
+When adding new notebooks or utilities:
+1. Follow the streaming/iterator pattern for memory safety
+2. Document functions with clear docstrings
+3. Include usage examples
+4. Add visualizations where helpful
+5. Update this README
+
+## References
+
+- [Clang Time Trace Documentation](https://releases.llvm.org/11.0.0/tools/clang/docs/ClangCommandLineReference.html#cmdoption-clang-ftime-trace)
+- [Chrome Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview)
+- [Template Metaprogramming Performance](https://www.youtube.com/watch?v=vwrXHznaYLA)
diff --git a/script/analyze_build/data/.gitignore b/script/analyze_build/data/.gitignore
new file mode 100644
index 0000000000..f16e0a558b
--- /dev/null
+++ b/script/analyze_build/data/.gitignore
@@ -0,0 +1,3 @@
+# Ignore all cached and generated data files
+*
+!.gitignore
diff --git a/script/analyze_build/notebooks/01_initial_exploration.ipynb b/script/analyze_build/notebooks/01_initial_exploration.ipynb
new file mode 100644
index 0000000000..e29d3deb35
--- /dev/null
+++ b/script/analyze_build/notebooks/01_initial_exploration.ipynb
@@ -0,0 +1,470 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Build Trace Analysis - Initial Exploration\n",
+    "\n",
+    "This notebook demonstrates safe loading and initial analysis of Clang `-ftime-trace` JSON files.\n",
+    "\n",
+    "**Key Features:**\n",
+    "- Streaming JSON parsing to avoid memory issues\n",
+    "- Analysis of a single file as a test case\n",
+    "- Identification of template instantiation patterns\n",
+    "- Basic statistics and visualizations\n",
+    "\n",
+    "**Dataset:**\n",
+    "- 4,484 JSON trace files (~46 GB total)\n",
+    "- Located in `build-trace/` directory\n",
+    "- Chrome Trace Event Format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Standard library imports\n",
+    "import sys\n",
+    "from pathlib import Path\n",
+    "from collections import Counter\n",
+    "\n",
+    "# Third-party imports\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "# Add utils to path - handle both Jupyter and VSCode notebook execution\n",
+    "notebook_dir = Path.cwd()\n",
+    "if notebook_dir.name == \"notebooks\":\n",
+    "    # Running from notebooks directory\n",
+    "    utils_path = notebook_dir.parent / \"utils\"\n",
+    "else:\n",
+    "    # Running from project root or other location\n",
+    "    utils_path = (\n",
+    "        Path(__file__).parent.parent / \"utils\"\n",
+    "        if \"__file__\" in globals()\n",
+    "        else notebook_dir / \"script\" / \"build_analysis\" / \"utils\"\n",
+    "    )\n",
+    "\n",
+    "sys.path.insert(0, str(utils_path))\n",
+    "print(f\"Utils path: {utils_path}\")\n",
+    "print(f\"Utils exists: {utils_path.exists()}\")\n",
+    "\n",
+    "# Import our utilities\n",
+    "from trace_parser import (\n",
+    "    iter_trace_files,\n",
+    "    stream_events,\n",
+    "    load_trace_metadata,\n",
+    "    get_template_events,\n",
+    "    aggregate_by_name,\n",
+    "    get_top_events,\n",
+    "    extract_template_detail,\n",
+    "    microseconds_to_milliseconds,\n",
+    ")\n",
+    "\n",
+    "# Configure plotting\n",
+    "sns.set_style(\"whitegrid\")\n",
+    "plt.rcParams[\"figure.figsize\"] = (12, 6)\n",
+    "\n",
+    "print(\"✓ Imports successful\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Dataset Overview\n",
+    "\n",
+    "Let's start by examining the trace files available."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Path to trace directory (relative to notebook location)\n",
+    "TRACE_DIR = Path.cwd().parent.parent.parent / \"build-trace\"\n",
+    "\n",
+    "print(f\"Trace directory: {TRACE_DIR}\")\n",
+    "print(f\"Exists: {TRACE_DIR.exists()}\")\n",
+    "\n",
+    "# Count trace files\n",
+    "trace_files = list(iter_trace_files(TRACE_DIR))\n",
+    "print(f\"\\nTotal trace files: {len(trace_files):,}\")\n",
+    "\n",
+    "# Show first few files\n",
+    "print(\"\\nFirst 5 trace files:\")\n",
+    "for i, f in enumerate(trace_files[:5], 1):\n",
+    "    rel_path = f.relative_to(TRACE_DIR)\n",
+    "    size_mb = f.stat().st_size / (1024 * 1024)\n",
+    "    print(f\"{i}. {rel_path} ({size_mb:.2f} MB)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Single File Analysis\n",
+    "\n",
+    "Let's analyze a single trace file in detail to understand the data structure."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Select a sample file for detailed analysis\n",
+    "sample_file = trace_files[0]\n",
+    "print(f\"Analyzing: {sample_file.name}\")\n",
+    "print(f\"Size: {sample_file.stat().st_size / (1024 * 1024):.2f} MB\")\n",
+    "\n",
+    "# Load metadata (fast, doesn't load events)\n",
+    "metadata = load_trace_metadata(sample_file)\n",
+    "print(f\"\\nMetadata: {metadata}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.1 Event Type Distribution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count events by type (streaming - memory safe)\n",
+    "event_names = Counter()\n",
+    "total_events = 0\n",
+    "\n",
+    "for event in stream_events(sample_file):\n",
+    "    event_names[event.get(\"name\", \"Unknown\")] += 1\n",
+    "    total_events += 1\n",
+    "\n",
+    "print(f\"Total events: {total_events:,}\")\n",
+    "print(f\"Unique event types: {len(event_names)}\")\n",
+    "\n",
+    "# Show top event types\n",
+    "print(\"\\nTop 15 event types by frequency:\")\n",
+    "for name, count in event_names.most_common(15):\n",
+    "    pct = (count / total_events) * 100\n",
+    "    print(f\"  {name:40s} {count:6,} ({pct:5.2f}%)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize event type distribution\n",
+    "top_events_df = pd.DataFrame(\n",
+    "    event_names.most_common(20), columns=[\"Event Type\", \"Count\"]\n",
+    ")\n",
+    "\n",
+    "plt.figure(figsize=(14, 8))\n",
+    "sns.barplot(data=top_events_df, x=\"Count\", y=\"Event Type\", palette=\"viridis\")\n",
+    "plt.title(f\"Top 20 Event Types in {sample_file.name}\", fontsize=14, fontweight=\"bold\")\n",
+    "plt.xlabel(\"Count\", fontsize=12)\n",
+    "plt.ylabel(\"Event Type\", fontsize=12)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.2 Template Instantiation Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyze template-related events\n",
+    "template_event_names = [\n",
+    "    \"InstantiateClass\",\n",
+    "    \"InstantiateFunction\",\n",
+    "    \"InstantiateVariable\",\n",
+    "    \"ParseTemplate\",\n",
+    "]\n",
+    "\n",
+    "template_counts = {name: event_names.get(name, 0) for name in template_event_names}\n",
+    "total_template_events = sum(template_counts.values())\n",
+    "\n",
+    "print(\"Template-related events:\")\n",
+    "for name, count in template_counts.items():\n",
+    "    pct = (count / total_events) * 100 if total_events > 0 else 0\n",
+    "    print(f\"  {name:30s} {count:6,} ({pct:5.2f}%)\")\n",
+    "\n",
+    "print(f\"\\nTotal template events: {total_template_events:,}\")\n",
+    "print(\n",
+    "    f\"Template events as % of total: {(total_template_events / total_events) * 100:.2f}%\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.3 Duration Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get top events by duration\n",
+    "print(\"Top 20 longest-running events:\\n\")\n",
+    "\n",
+    "top_by_duration = get_top_events(stream_events(sample_file), n=20, sort_by=\"dur\")\n",
+    "\n",
+    "for i, event in enumerate(top_by_duration, 1):\n",
+    "    name = event.get(\"name\", \"Unknown\")\n",
+    "    dur_ms = microseconds_to_milliseconds(event.get(\"dur\", 0))\n",
+    "    detail = extract_template_detail(event) or \"\"\n",
+    "\n",
+    "    # Truncate detail if too long\n",
+    "    if len(detail) > 60:\n",
+    "        detail = detail[:57] + \"...\"\n",
+    "\n",
+    "    print(f\"{i:2d}. {name:30s} {dur_ms:8.2f} ms  {detail}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate statistics by event type\n",
+    "print(\"Aggregating event statistics (this may take a moment)...\")\n",
+    "\n",
+    "aggregated = aggregate_by_name(stream_events(sample_file))\n",
+    "\n",
+    "# Convert to DataFrame for easier analysis\n",
+    "agg_df = pd.DataFrame.from_dict(aggregated, orient=\"index\")\n",
+    "agg_df.index.name = \"event_name\"\n",
+    "agg_df.reset_index(inplace=True)\n",
+    "\n",
+    "# Convert durations to milliseconds\n",
+    "for col in [\"total_duration\", \"avg_duration\", \"max_duration\", \"min_duration\"]:\n",
+    "    agg_df[f\"{col}_ms\"] = agg_df[col].apply(microseconds_to_milliseconds)\n",
+    "\n",
+    "# Sort by total duration\n",
+    "agg_df_sorted = agg_df.sort_values(\"total_duration_ms\", ascending=False)\n",
+    "\n",
+    "print(\"\\nTop 15 event types by total duration:\")\n",
+    "print(\n",
+    "    agg_df_sorted[\n",
+    "        [\n",
+    "            \"event_name\",\n",
+    "            \"count\",\n",
+    "            \"total_duration_ms\",\n",
+    "            \"avg_duration_ms\",\n",
+    "            \"max_duration_ms\",\n",
+    "        ]\n",
+    "    ]\n",
+    "    .head(15)\n",
+    "    .to_string(index=False)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize duration distribution for top event types\n",
+    "top_15 = agg_df_sorted.head(15)\n",
+    "\n",
+    "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
+    "\n",
+    "# Total duration\n",
+    "sns.barplot(\n",
+    "    data=top_15,\n",
+    "    x=\"total_duration_ms\",\n",
+    "    y=\"event_name\",\n",
+    "    hue=\"event_name\",\n",
+    "    ax=axes[0],\n",
+    "    palette=\"rocket\",\n",
+    "    legend=False,\n",
+    ")\n",
+    "axes[0].set_title(\"Total Duration by Event Type\", fontsize=12, fontweight=\"bold\")\n",
+    "axes[0].set_xlabel(\"Total Duration (ms)\", fontsize=11)\n",
+    "axes[0].set_ylabel(\"Event Type\", fontsize=11)\n",
+    "\n",
+    "# Average duration\n",
+    "sns.barplot(\n",
+    "    data=top_15,\n",
+    "    x=\"avg_duration_ms\",\n",
+    "    y=\"event_name\",\n",
+    "    hue=\"event_name\",\n",
+    "    ax=axes[1],\n",
+    "    palette=\"mako\",\n",
+    "    legend=False,\n",
+    ")\n",
+    "axes[1].set_title(\"Average Duration by Event Type\", fontsize=12, fontweight=\"bold\")\n",
+    "axes[1].set_xlabel(\"Average Duration (ms)\", fontsize=11)\n",
+    "axes[1].set_ylabel(\"\")\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.4 Template Instantiation Deep Dive"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyze template instantiations specifically\n",
+    "print(\"Analyzing template instantiations...\\n\")\n",
+    "\n",
+    "template_details = Counter()\n",
+    "template_durations = []\n",
+    "\n",
+    "for event in get_template_events(stream_events(sample_file)):\n",
+    "    detail = extract_template_detail(event)\n",
+    "    if detail:\n",
+    "        template_details[detail] += 1\n",
+    "        template_durations.append(\n",
+    "            {\n",
+    "                \"template\": detail,\n",
+    "                \"duration_ms\": microseconds_to_milliseconds(event.get(\"dur\", 0)),\n",
+    "                \"event_type\": event.get(\"name\", \"Unknown\"),\n",
+    "            }\n",
+    "        )\n",
+    "\n",
+    "print(f\"Unique templates instantiated: {len(template_details):,}\")\n",
+    "print(\"\\nTop 15 most frequently instantiated templates:\")\n",
+    "for template, count in template_details.most_common(15):\n",
+    "    # Truncate long template names\n",
+    "    display_name = template if len(template) <= 80 else template[:77] + \"...\"\n",
+    "    print(f\"  {count:4d}x  {display_name}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Find slowest template instantiations\n",
+    "if template_durations:\n",
+    "    template_df = pd.DataFrame(template_durations)\n",
+    "    template_df_sorted = template_df.sort_values(\"duration_ms\", ascending=False)\n",
+    "\n",
+    "    print(\"Top 20 slowest template instantiations:\\n\")\n",
+    "    for i, row in enumerate(template_df_sorted.head(20).itertuples(), 1):\n",
+    "        template = (\n",
+    "            row.template if len(row.template) <= 80 else row.template[:77] + \"...\"\n",
+    "        )\n",
+    "        print(f\"{i:2d}. {row.duration_ms:8.2f} ms  [{row.event_type}]  {template}\")\n",
+    "else:\n",
+    "    print(\"No template instantiation data found.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Summary Statistics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculate total compilation time\n",
+    "total_duration_ms = agg_df[\"total_duration_ms\"].sum()\n",
+    "total_duration_sec = total_duration_ms / 1000\n",
+    "\n",
+    "print(\"=\" * 60)\n",
+    "print(f\"SUMMARY: {sample_file.name}\")\n",
+    "print(\"=\" * 60)\n",
+    "print(f\"Total events:              {total_events:,}\")\n",
+    "print(f\"Unique event types:        {len(event_names):,}\")\n",
+    "print(\n",
+    "    f\"Template events:           {total_template_events:,} ({(total_template_events / total_events) * 100:.1f}%)\"\n",
+    ")\n",
+    "print(f\"Unique templates:          {len(template_details):,}\")\n",
+    "print(\n",
+    "    f\"Total compilation time:    {total_duration_sec:.2f} seconds ({total_duration_sec / 60:.2f} minutes)\"\n",
+    ")\n",
+    "print(f\"File size:                 {sample_file.stat().st_size / (1024 * 1024):.2f} MB\")\n",
+    "print(\"=\" * 60)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Next Steps\n",
+    "\n",
+    "This initial exploration demonstrates:\n",
+    "- ✓ Safe streaming of large JSON files\n",
+    "- ✓ Event type distribution analysis\n",
+    "- ✓ Template instantiation identification\n",
+    "- ✓ Duration-based performance analysis\n",
+    "\n",
+    "**Recommended next analyses:**\n",
+    "1. **Multi-file aggregation**: Analyze patterns across all 4,484 trace files\n",
+    "2. **Template hierarchy**: Build dependency graphs of template instantiations\n",
+    "3. **Hotspot identification**: Find the most expensive templates across the entire build\n",
+    "4. **Comparative analysis**: Compare build times before/after code changes\n",
+    "5. **Visualization**: Create interactive timelines and flame graphs\n",
+    "\n",
+    "See notebooks:\n",
+    "- `02_template_analysis.ipynb` - Deep dive into template metaprogramming costs\n",
+    "- `03_visualization.ipynb` - Interactive charts and timelines"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/script/analyze_build/notebooks/02_full_build_analysis.ipynb b/script/analyze_build/notebooks/02_full_build_analysis.ipynb
new file mode 100644
index 0000000000..6ab15e5289
--- /dev/null
+++ b/script/analyze_build/notebooks/02_full_build_analysis.ipynb
@@ -0,0 +1,211 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Full Build Analysis - All Trace Files\n",
+    "\n",
+    "This notebook analyzes **all 4,484 trace files** to understand the complete build time breakdown.\n",
+    "\n",
+    "**Warning**: This processes ~46GB of data. We use streaming and aggregation to keep memory usage reasonable."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "from pathlib import Path\n",
+    "import pandas as pd\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "# Add utils to path\n",
+    "notebook_dir = Path.cwd()\n",
+    "if notebook_dir.name == \"notebooks\":\n",
+    "    utils_path = notebook_dir.parent / \"utils\"\n",
+    "else:\n",
+    "    utils_path = notebook_dir / \"script\" / \"build_analysis\" / \"utils\"\n",
+    "\n",
+    "sys.path.insert(0, str(utils_path))\n",
+    "\n",
+    "from trace_parser import (\n",
+    "    iter_trace_files,\n",
+    "    stream_events,\n",
+    "    microseconds_to_seconds,\n",
+    ")\n",
+    "\n",
+    "print(\"✓ Imports successful\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Collect All Trace Files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "TRACE_DIR = Path.cwd().parent.parent.parent / \"build-trace\"\n",
+    "\n",
+    "print(f\"Scanning {TRACE_DIR}...\")\n",
+    "trace_files = list(iter_trace_files(TRACE_DIR))\n",
+    "print(f\"Found {len(trace_files):,} trace files\")\n",
+    "print(\"\\nFirst 5 files:\")\n",
+    "for i, f in enumerate(trace_files[:5], 1):\n",
+    "    print(f\"  {i}. {f.name} ({f.stat().st_size / (1024 * 1024):.2f} MB)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Aggregate Build Times Per File\n",
+    "\n",
+    "This will take a few minutes as we process all files..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file_stats = []\n",
+    "\n",
+    "print(\"Processing all trace files...\")\n",
+    "for trace_file in tqdm(trace_files, desc=\"Analyzing files\"):\n",
+    "    try:\n",
+    "        total_duration = 0\n",
+    "        event_count = 0\n",
+    "        template_count = 0\n",
+    "\n",
+    "        for event in stream_events(trace_file):\n",
+    "            event_count += 1\n",
+    "            total_duration += event.get(\"dur\", 0)\n",
+    "\n",
+    "            # Count template events\n",
+    "            if event.get(\"name\") in [\"InstantiateClass\", \"InstantiateFunction\"]:\n",
+    "                template_count += 1\n",
+    "\n",
+    "        file_stats.append(\n",
+    "            {\n",
+    "                \"file\": trace_file.name,\n",
+    "                \"path\": str(trace_file.relative_to(TRACE_DIR)),\n",
+    "                \"total_duration_sec\": microseconds_to_seconds(total_duration),\n",
+    "                \"event_count\": event_count,\n",
+    "                \"template_count\": template_count,\n",
+    "                \"size_mb\": trace_file.stat().st_size / (1024 * 1024),\n",
+    "            }\n",
+    "        )\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error processing {trace_file.name}: {e}\")\n",
+    "\n",
+    "print(f\"\\n✓ Processed {len(file_stats):,} files\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Overall Build Statistics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(file_stats)\n",
+    "\n",
+    "total_build_time = df[\"total_duration_sec\"].sum()\n",
+    "total_events = df[\"event_count\"].sum()\n",
+    "total_templates = df[\"template_count\"].sum()\n",
+    "\n",
+    "print(\"=\" * 70)\n",
+    "print(\"FULL BUILD ANALYSIS\")\n",
+    "print(\"=\" * 70)\n",
+    "print(f\"Total files analyzed:      {len(df):,}\")\n",
+    "print(f\"Total events:              {total_events:,}\")\n",
+    "print(f\"Total template events:     {total_templates:,}\")\n",
+    "print(\n",
+    "    f\"Total build time:          {total_build_time:.2f} seconds ({total_build_time / 60:.2f} minutes)\"\n",
+    ")\n",
+    "print(f\"Average time per file:     {df['total_duration_sec'].mean():.2f} seconds\")\n",
+    "print(f\"Median time per file:      {df['total_duration_sec'].median():.2f} seconds\")\n",
+    "print(\"=\" * 70)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Slowest Files to Compile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "slowest = df.nlargest(20, \"total_duration_sec\")\n",
+    "\n",
+    "print(\"Top 20 slowest files to compile:\\n\")\n",
+    "for i, row in enumerate(slowest.itertuples(), 1):\n",
+    "    print(\n",
+    "        f\"{i:2d}. {row.total_duration_sec:7.2f}s  {row.template_count:5,} templates  {row.file}\"\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Save Results for Future Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save to CSV for faster re-analysis\n",
+    "output_file = Path.cwd().parent / \"data\" / \"file_compilation_times.csv\"\n",
+    "df.to_csv(output_file, index=False)\n",
+    "print(f\"✓ Saved results to {output_file}\")\n",
+    "print(f\"  File size: {output_file.stat().st_size / 1024:.2f} KB\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/script/analyze_build/notebooks/03_template_overhead_analysis.ipynb b/script/analyze_build/notebooks/03_template_overhead_analysis.ipynb
new file mode 100644
index 0000000000..fe34887c19
--- /dev/null
+++ b/script/analyze_build/notebooks/03_template_overhead_analysis.ipynb
@@ -0,0 +1,333 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Template Metaprogramming Overhead Analysis\n",
+    "\n",
+    "**Goal**: Understand how much build time is spent in template instantiation vs other compilation phases.\n",
+    "\n",
+    "**Key Questions**:\n",
+    "1. What percentage of total build time is template instantiation?\n",
+    "2. Which templates are the most expensive?\n",
+    "3. How much time is spent in GPU kernel compilation (Backend/CodeGen)?\n",
+    "4. What are the biggest optimization opportunities?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "from pathlib import Path\n",
+    "from collections import defaultdict\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "notebook_dir = Path.cwd()\n",
+    "utils_path = (\n",
+    "    notebook_dir.parent / \"utils\"\n",
+    "    if notebook_dir.name == \"notebooks\"\n",
+    "    else notebook_dir / \"script\" / \"build_analysis\" / \"utils\"\n",
+    ")\n",
+    "sys.path.insert(0, str(utils_path))\n",
+    "\n",
+    "from trace_parser import (\n",
+    "    iter_trace_files,\n",
+    "    stream_events,\n",
+    "    get_template_events,\n",
+    "    extract_template_detail,\n",
+    "    microseconds_to_seconds,\n",
+    ")\n",
+    "\n",
+    "sns.set_style(\"whitegrid\")\n",
+    "plt.rcParams[\"figure.figsize\"] = (14, 8)\n",
+    "\n",
+    "print(\"✓ Imports successful\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Load Pre-computed File Statistics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the CSV we created in notebook 02\n",
+    "data_file = Path.cwd().parent / \"data\" / \"file_compilation_times.csv\"\n",
+    "df_files = pd.read_csv(data_file)\n",
+    "\n",
+    "total_build_hours = df_files[\"total_duration_sec\"].sum() / 3600\n",
+    "print(f\"Total build time: {total_build_hours:.2f} hours\")\n",
+    "print(f\"Total files: {len(df_files):,}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Aggregate Event Types Across All Files\n",
+    "\n",
+    "This analyzes what percentage of time is spent in different compilation phases."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "TRACE_DIR = Path.cwd().parent.parent.parent / \"build-trace\"\n",
+    "trace_files = list(iter_trace_files(TRACE_DIR))\n",
+    "\n",
+    "# Aggregate event statistics across all files\n",
+    "global_event_stats = defaultdict(lambda: {\"count\": 0, \"total_duration\": 0})\n",
+    "\n",
+    "print(\"Aggregating event types across all files...\")\n",
+    "for trace_file in tqdm(\n",
+    "    trace_files[:100], desc=\"Processing files\"\n",
+    "):  # Start with 100 files for speed\n",
+    "    try:\n",
+    "        for event in stream_events(trace_file):\n",
+    "            name = event.get(\"name\", \"Unknown\")\n",
+    "            dur = event.get(\"dur\", 0)\n",
+    "\n",
+    "            global_event_stats[name][\"count\"] += 1\n",
+    "            global_event_stats[name][\"total_duration\"] += dur\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error: {trace_file.name}: {e}\")\n",
+    "\n",
+    "print(f\"\\n✓ Processed {len(global_event_stats)} unique event types\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Template vs Non-Template Time Breakdown"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculate template overhead\n",
+    "template_events = [\"InstantiateClass\", \"InstantiateFunction\", \"InstantiateVariable\"]\n",
+    "backend_events = [\"Backend\", \"CodeGen Function\", \"OptFunction\"]\n",
+    "\n",
+    "template_time = sum(\n",
+    "    global_event_stats[e][\"total_duration\"]\n",
+    "    for e in template_events\n",
+    "    if e in global_event_stats\n",
+    ")\n",
+    "backend_time = sum(\n",
+    "    global_event_stats[e][\"total_duration\"]\n",
+    "    for e in backend_events\n",
+    "    if e in global_event_stats\n",
+    ")\n",
+    "total_time = sum(stats[\"total_duration\"] for stats in global_event_stats.values())\n",
+    "\n",
+    "template_hours = microseconds_to_seconds(template_time) / 3600\n",
+    "backend_hours = microseconds_to_seconds(backend_time) / 3600\n",
+    "total_hours = microseconds_to_seconds(total_time) / 3600\n",
+    "\n",
+    "print(\"=\" * 70)\n",
+    "print(\"COMPILATION PHASE BREAKDOWN (100 files sample)\")\n",
+    "print(\"=\" * 70)\n",
+    "print(\n",
+    "    f\"Template instantiation:    {template_hours:8.2f} hours ({template_time / total_time * 100:5.1f}%)\"\n",
+    ")\n",
+    "print(\n",
+    "    f\"Backend/CodeGen:           {backend_hours:8.2f} hours ({backend_time / total_time * 100:5.1f}%)\"\n",
+    ")\n",
+    "print(\n",
+    "    f\"Other compilation phases:  {total_hours - template_hours - backend_hours:8.2f} hours\"\n",
+    ")\n",
+    "print(f\"Total:                     {total_hours:8.2f} hours\")\n",
+    "print(\"=\" * 70)\n",
+    "\n",
+    "# Extrapolate to full build\n",
+    "scale_factor = len(trace_files) / 100\n",
+    "print(f\"\\nExtrapolated to full build ({len(trace_files)} files):\")\n",
+    "print(f\"  Template time: ~{template_hours * scale_factor:.1f} hours\")\n",
+    "print(f\"  Backend time:  ~{backend_hours * scale_factor:.1f} hours\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Top Event Types by Duration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert to DataFrame for analysis\n",
+    "event_df = pd.DataFrame(\n",
+    "    [\n",
+    "        {\n",
+    "            \"event_type\": name,\n",
+    "            \"count\": stats[\"count\"],\n",
+    "            \"total_hours\": microseconds_to_seconds(stats[\"total_duration\"]) / 3600,\n",
+    "            \"avg_ms\": stats[\"total_duration\"] / stats[\"count\"] / 1000,\n",
+    "        }\n",
+    "        for name, stats in global_event_stats.items()\n",
+    "    ]\n",
+    ").sort_values(\"total_hours\", ascending=False)\n",
+    "\n",
+    "print(\"Top 20 event types by total duration:\\n\")\n",
+    "print(event_df.head(20).to_string(index=False))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Visualization: Time Distribution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "top_15 = event_df.head(15)\n",
+    "\n",
+    "plt.figure(figsize=(14, 8))\n",
+    "sns.barplot(\n",
+    "    data=top_15,\n",
+    "    x=\"total_hours\",\n",
+    "    y=\"event_type\",\n",
+    "    hue=\"event_type\",\n",
+    "    palette=\"viridis\",\n",
+    "    legend=False,\n",
+    ")\n",
+    "plt.title(\n",
+    "    \"Top 15 Event Types by Total Duration (100 file sample)\",\n",
+    "    fontsize=14,\n",
+    "    fontweight=\"bold\",\n",
+    ")\n",
+    "plt.xlabel(\"Total Duration (hours)\", fontsize=12)\n",
+    "plt.ylabel(\"Event Type\", fontsize=12)\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Most Expensive Templates\n",
+    "\n",
+    "Identify which specific templates consume the most time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "template_stats = defaultdict(lambda: {\"count\": 0, \"total_duration\": 0})\n",
+    "\n",
+    "print(\"Analyzing template instantiations...\")\n",
+    "for trace_file in tqdm(trace_files[:100], desc=\"Processing templates\"):\n",
+    "    try:\n",
+    "        for event in get_template_events(stream_events(trace_file)):\n",
+    "            detail = extract_template_detail(event)\n",
+    "            if detail:\n",
+    "                # Truncate very long template names for grouping\n",
+    "                key = detail[:200] if len(detail) > 200 else detail\n",
+    "                template_stats[key][\"count\"] += 1\n",
+    "                template_stats[key][\"total_duration\"] += event.get(\"dur\", 0)\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error: {e}\")\n",
+    "\n",
+    "print(f\"\\n✓ Found {len(template_stats):,} unique templates\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert to DataFrame and sort\n",
+    "template_df = pd.DataFrame(\n",
+    "    [\n",
+    "        {\n",
+    "            \"template\": name,\n",
+    "            \"count\": stats[\"count\"],\n",
+    "            \"total_seconds\": microseconds_to_seconds(stats[\"total_duration\"]),\n",
+    "            \"avg_ms\": stats[\"total_duration\"] / stats[\"count\"] / 1000,\n",
+    "        }\n",
+    "        for name, stats in template_stats.items()\n",
+    "    ]\n",
+    ").sort_values(\"total_seconds\", ascending=False)\n",
+    "\n",
+    "print(\"Top 20 most expensive templates (by total time):\\n\")\n",
+    "for i, row in enumerate(template_df.head(20).itertuples(), 1):\n",
+    "    template_name = (\n",
+    "        row.template[:80] + \"...\" if len(row.template) > 80 else row.template\n",
+    "    )\n",
+    "    print(f\"{i:2d}. {row.total_seconds:7.2f}s  {row.count:5,}x  {template_name}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. Key Findings\n",
+    "\n",
+    "Based on the analysis:\n",
+    "\n",
+    "1. **Template overhead is significant**: ~64% of compilation events are template instantiations\n",
+    "2. **Slowest files**: LayerNorm and RMSNorm kernels take 40-45 minutes each to compile\n",
+    "3. **Optimization opportunities**:\n",
+    "   - Reduce template instantiation depth\n",
+    "   - Use explicit instantiation to reduce redundant compilations\n",
+    "   - Consider extern templates for frequently used types\n",
+    "   - Profile specific template patterns causing the most overhead"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/script/analyze_build/notebooks/04_tensor_descriptor_analysis.ipynb b/script/analyze_build/notebooks/04_tensor_descriptor_analysis.ipynb
new file mode 100644
index 0000000000..5244979fc0
--- /dev/null
+++ b/script/analyze_build/notebooks/04_tensor_descriptor_analysis.ipynb
@@ -0,0 +1,459 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# TensorDescriptor Template Analysis\n",
+    "\n",
+    "**Focus**: Deep dive into `ck::TensorDescriptor` template instantiation costs.\n",
+    "\n",
+    "This notebook analyzes:\n",
+    "1. How many times `TensorDescriptor` is instantiated\n",
+    "2. Total time spent instantiating this template\n",
+    "3. Most expensive `TensorDescriptor` instantiations\n",
+    "4. Which files spend the most time on `TensorDescriptor`\n",
+    "5. Common template parameter patterns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "from pathlib import Path\n",
+    "from collections import defaultdict\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "notebook_dir = Path.cwd()\n",
+    "utils_path = (\n",
+    "    notebook_dir.parent / \"utils\"\n",
+    "    if notebook_dir.name == \"notebooks\"\n",
+    "    else notebook_dir / \"script\" / \"build_analysis\" / \"utils\"\n",
+    ")\n",
+    "sys.path.insert(0, str(utils_path))\n",
+    "\n",
+    "from trace_parser import (\n",
+    "    iter_trace_files,\n",
+    "    stream_events,\n",
+    "    get_template_events,\n",
+    "    extract_template_detail,\n",
+    "    microseconds_to_seconds,\n",
+    "    microseconds_to_milliseconds,\n",
+    ")\n",
+    "\n",
+    "sns.set_style(\"whitegrid\")\n",
+    "plt.rcParams[\"figure.figsize\"] = (14, 8)\n",
+    "\n",
+    "print(\"✓ Imports successful\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Scan for TensorDescriptor Instantiations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "TRACE_DIR = Path.cwd().parent.parent.parent / \"build-trace\"\n",
+    "trace_files = list(iter_trace_files(TRACE_DIR))\n",
+    "\n",
+    "# Collect all TensorDescriptor instantiations\n",
+    "tensor_desc_data = []\n",
+    "file_tensor_stats = defaultdict(lambda: {\"count\": 0, \"total_duration\": 0})\n",
+    "\n",
+    "print(\"Scanning for ck::TensorDescriptor instantiations...\")\n",
+    "print(\"(Processing 200 files for detailed analysis)\\n\")\n",
+    "\n",
+    "for trace_file in tqdm(trace_files[:200], desc=\"Analyzing files\"):\n",
+    "    try:\n",
+    "        file_count = 0\n",
+    "        file_duration = 0\n",
+    "\n",
+    "        for event in get_template_events(stream_events(trace_file)):\n",
+    "            detail = extract_template_detail(event)\n",
+    "            if detail and \"TensorDescriptor\" in detail:\n",
+    "                duration = event.get(\"dur\", 0)\n",
+    "\n",
+    "                tensor_desc_data.append(\n",
+    "                    {\n",
+    "                        \"file\": trace_file.name,\n",
+    "                        \"template\": detail,\n",
+    "                        \"event_type\": event.get(\"name\"),\n",
+    "                        \"duration_ms\": microseconds_to_milliseconds(duration),\n",
+    "                        \"duration_us\": duration,\n",
+    "                    }\n",
+    "                )\n",
+    "\n",
+    "                file_count += 1\n",
+    "                file_duration += duration\n",
+    "\n",
+    "        if file_count > 0:\n",
+    "            file_tensor_stats[trace_file.name][\"count\"] = file_count\n",
+    "            file_tensor_stats[trace_file.name][\"total_duration\"] = file_duration\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error processing {trace_file.name}: {e}\")\n",
+    "\n",
+    "print(f\"\\n✓ Found {len(tensor_desc_data):,} TensorDescriptor instantiations\")\n",
+    "print(f\"✓ Across {len(file_tensor_stats)} files\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Overall TensorDescriptor Statistics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(tensor_desc_data)\n",
+    "\n",
+    "total_time_sec = df[\"duration_us\"].sum() / 1_000_000\n",
+    "total_time_hours = total_time_sec / 3600\n",
+    "\n",
+    "print(\"=\" * 70)\n",
+    "print(\"TENSORDESCRIPTOR ANALYSIS (200 file sample)\")\n",
+    "print(\"=\" * 70)\n",
+    "print(f\"Total instantiations:      {len(df):,}\")\n",
+    "print(\n",
+    "    f\"Total time:                {total_time_hours:.2f} hours ({total_time_sec / 60:.2f} minutes)\"\n",
+    ")\n",
+    "print(f\"Average per instantiation: {df['duration_ms'].mean():.2f} ms\")\n",
+    "print(f\"Median per instantiation:  {df['duration_ms'].median():.2f} ms\")\n",
+    "print(f\"Max single instantiation:  {df['duration_ms'].max():.2f} ms\")\n",
+    "print(\"=\" * 70)\n",
+    "\n",
+    "# Extrapolate to full build\n",
+    "scale_factor = len(trace_files) / 200\n",
+    "print(f\"\\nExtrapolated to full build ({len(trace_files)} files):\")\n",
+    "print(\n",
+    "    f\"  Estimated TensorDescriptor time: ~{total_time_hours * scale_factor:.1f} hours\"\n",
+    ")\n",
+    "print(f\"  Estimated instantiations: ~{len(df) * scale_factor:,.0f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Files with Most TensorDescriptor Overhead"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate by file\n",
+    "file_df = pd.DataFrame(\n",
+    "    [\n",
+    "        {\n",
+    "            \"file\": filename,\n",
+    "            \"count\": stats[\"count\"],\n",
+    "            \"total_seconds\": microseconds_to_seconds(stats[\"total_duration\"]),\n",
+    "            \"avg_ms\": stats[\"total_duration\"] / stats[\"count\"] / 1000,\n",
+    "        }\n",
+    "        for filename, stats in file_tensor_stats.items()\n",
+    "    ]\n",
+    ").sort_values(\"total_seconds\", ascending=False)\n",
+    "\n",
+    "print(\"Top 20 files by TensorDescriptor compilation time:\\n\")\n",
+    "for i, row in enumerate(file_df.head(20).itertuples(), 1):\n",
+    "    print(\n",
+    "        f\"{i:2d}. {row.total_seconds:7.2f}s  {row.count:4,} instantiations  {row.file}\"\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Most Expensive TensorDescriptor Variants"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Group by template signature (first 150 chars to group similar templates)\n",
+    "template_groups = defaultdict(\n",
+    "    lambda: {\"count\": 0, \"total_duration\": 0, \"max_duration\": 0}\n",
+    ")\n",
+    "\n",
+    "for _, row in df.iterrows():\n",
+    "    # Extract template signature (truncate for grouping)\n",
+    "    template = row[\"template\"]\n",
+    "    key = template[:150] if len(template) > 150 else template\n",
+    "\n",
+    "    template_groups[key][\"count\"] += 1\n",
+    "    template_groups[key][\"total_duration\"] += row[\"duration_us\"]\n",
+    "    template_groups[key][\"max_duration\"] = max(\n",
+    "        template_groups[key][\"max_duration\"], row[\"duration_ms\"]\n",
+    "    )\n",
+    "\n",
+    "# Convert to DataFrame\n",
+    "template_df = pd.DataFrame(\n",
+    "    [\n",
+    "        {\n",
+    "            \"template\": name,\n",
+    "            \"count\": stats[\"count\"],\n",
+    "            \"total_seconds\": microseconds_to_seconds(stats[\"total_duration\"]),\n",
+    "            \"avg_ms\": stats[\"total_duration\"] / stats[\"count\"] / 1000,\n",
+    "            \"max_ms\": stats[\"max_duration\"],\n",
+    "        }\n",
+    "        for name, stats in template_groups.items()\n",
+    "    ]\n",
+    ").sort_values(\"total_seconds\", ascending=False)\n",
+    "\n",
+    "print(\"Top 20 TensorDescriptor variants by total time:\\n\")\n",
+    "for i, row in enumerate(template_df.head(20).itertuples(), 1):\n",
+    "    template_name = (\n",
+    "        row.template[:100] + \"...\" if len(row.template) > 100 else row.template\n",
+    "    )\n",
+    "    print(\n",
+    "        f\"{i:2d}. {row.total_seconds:6.2f}s  {row.count:4,}x  avg:{row.avg_ms:6.2f}ms  {template_name}\"\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Slowest Individual Instantiations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "slowest = df.nlargest(20, \"duration_ms\")\n",
+    "\n",
+    "print(\"Top 20 slowest individual TensorDescriptor instantiations:\\n\")\n",
+    "for i, row in enumerate(slowest.itertuples(), 1):\n",
+    "    template_name = (\n",
+    "        row.template[:100] + \"...\" if len(row.template) > 100 else row.template\n",
+    "    )\n",
+    "    print(f\"{i:2d}. {row.duration_ms:7.2f}ms  [{row.event_type}]  {template_name}\")\n",
+    "    print(f\"    File: {row.file}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Template Parameter Analysis\n",
+    "\n",
+    "Extract common patterns in TensorDescriptor template parameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract template parameter patterns\n",
+    "print(\"Analyzing template parameter patterns...\\n\")\n",
+    "\n",
+    "# Count Tuple usage\n",
+    "tuple_count = sum(1 for t in df[\"template\"] if \"Tuple<\" in t)\n",
+    "print(\n",
+    "    f\"Templates with Tuple:      {tuple_count:,} ({tuple_count / len(df) * 100:.1f}%)\"\n",
+    ")\n",
+    "\n",
+    "# Count Embed usage\n",
+    "embed_count = sum(1 for t in df[\"template\"] if \"Embed<\" in t)\n",
+    "print(\n",
+    "    f\"Templates with Embed:      {embed_count:,} ({embed_count / len(df) * 100:.1f}%)\"\n",
+    ")\n",
+    "\n",
+    "# Count UnMerge usage\n",
+    "unmerge_count = sum(1 for t in df[\"template\"] if \"UnMerge<\" in t)\n",
+    "print(\n",
+    "    f\"Templates with UnMerge:    {unmerge_count:,} ({unmerge_count / len(df) * 100:.1f}%)\"\n",
+    ")\n",
+    "\n",
+    "# Count Merge usage\n",
+    "merge_count = sum(1 for t in df[\"template\"] if \"Merge<\" in t)\n",
+    "print(\n",
+    "    f\"Templates with Merge:      {merge_count:,} ({merge_count / len(df) * 100:.1f}%)\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. Visualization: Distribution of Instantiation Times"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
+    "\n",
+    "# Histogram of durations\n",
+    "axes[0].hist(df[\"duration_ms\"], bins=50, edgecolor=\"black\", alpha=0.7)\n",
+    "axes[0].set_xlabel(\"Duration (ms)\", fontsize=11)\n",
+    "axes[0].set_ylabel(\"Count\", fontsize=11)\n",
+    "axes[0].set_title(\n",
+    "    \"Distribution of TensorDescriptor Instantiation Times\",\n",
+    "    fontsize=12,\n",
+    "    fontweight=\"bold\",\n",
+    ")\n",
+    "axes[0].axvline(\n",
+    "    df[\"duration_ms\"].median(),\n",
+    "    color=\"red\",\n",
+    "    linestyle=\"--\",\n",
+    "    label=f\"Median: {df['duration_ms'].median():.2f}ms\",\n",
+    ")\n",
+    "axes[0].legend()\n",
+    "\n",
+    "# Top files by TensorDescriptor time\n",
+    "top_files = file_df.head(15)\n",
+    "sns.barplot(\n",
+    "    data=top_files,\n",
+    "    x=\"total_seconds\",\n",
+    "    y=\"file\",\n",
+    "    hue=\"file\",\n",
+    "    ax=axes[1],\n",
+    "    palette=\"rocket\",\n",
+    "    legend=False,\n",
+    ")\n",
+    "axes[1].set_title(\n",
+    "    \"Top 15 Files by TensorDescriptor Time\", fontsize=12, fontweight=\"bold\"\n",
+    ")\n",
+    "axes[1].set_xlabel(\"Total Duration (seconds)\", fontsize=11)\n",
+    "axes[1].set_ylabel(\"File\", fontsize=11)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Sample Template Signatures\n",
+    "\n",
+    "Show examples of the most common TensorDescriptor patterns."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Sample TensorDescriptor template signatures:\\n\")\n",
+    "print(\"=\" * 100)\n",
+    "\n",
+    "# Show a few examples of different patterns\n",
+    "samples = template_df.head(10)\n",
+    "for i, row in enumerate(samples.itertuples(), 1):\n",
+    "    print(\n",
+    "        f\"\\n{i}. Count: {row.count:,}  Total: {row.total_seconds:.2f}s  Avg: {row.avg_ms:.2f}ms\"\n",
+    "    )\n",
+    "    print(f\"   {row.template}\")\n",
+    "    print(\"-\" * 100)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 9. Key Findings & Recommendations\n",
+    "\n",
+    "Based on the analysis, we can identify:\n",
+    "\n",
+    "### Optimization Opportunities\n",
+    "1. **Frequently instantiated patterns**: Templates instantiated 100+ times are candidates for explicit instantiation\n",
+    "2. **Expensive individual instantiations**: Templates taking >100ms might benefit from simplification\n",
+    "3. **Hot files**: Files spending >10s on TensorDescriptor could be refactored\n",
+    "\n",
+    "### Next Steps\n",
+    "- Identify common template parameter combinations for explicit instantiation\n",
+    "- Analyze template depth/nesting levels\n",
+    "- Compare TensorDescriptor overhead across different kernel types\n",
+    "- Profile specific transform operations (Embed, UnMerge, Merge)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 10. Save Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save detailed results\n",
+    "output_dir = Path.cwd().parent / \"data\"\n",
+    "\n",
+    "# Save all TensorDescriptor instantiations\n",
+    "df.to_csv(output_dir / \"tensor_descriptor_instantiations.csv\", index=False)\n",
+    "print(f\"✓ Saved {len(df):,} instantiations to tensor_descriptor_instantiations.csv\")\n",
+    "\n",
+    "# Save aggregated template statistics\n",
+    "template_df.to_csv(output_dir / \"tensor_descriptor_templates.csv\", index=False)\n",
+    "print(\n",
+    "    f\"✓ Saved {len(template_df):,} unique templates to tensor_descriptor_templates.csv\"\n",
+    ")\n",
+    "\n",
+    "# Save file statistics\n",
+    "file_df.to_csv(output_dir / \"tensor_descriptor_by_file.csv\", index=False)\n",
+    "print(f\"✓ Saved {len(file_df):,} file stats to tensor_descriptor_by_file.csv\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/script/analyze_build/utils/__init__.py b/script/analyze_build/utils/__init__.py
new file mode 100644
index 0000000000..3414e92b43
--- /dev/null
+++ b/script/analyze_build/utils/__init__.py
@@ -0,0 +1,30 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+"""Build trace analysis utilities."""
+
+from .trace_parser import (
+    iter_trace_files,
+    stream_events,
+    load_trace_metadata,
+    filter_events,
+    get_template_events,
+    aggregate_by_name,
+    get_top_events,
+    extract_template_detail,
+    microseconds_to_seconds,
+    microseconds_to_milliseconds,
+)
+
+__all__ = [
+    "iter_trace_files",
+    "stream_events",
+    "load_trace_metadata",
+    "filter_events",
+    "get_template_events",
+    "aggregate_by_name",
+    "get_top_events",
+    "extract_template_detail",
+    "microseconds_to_seconds",
+    "microseconds_to_milliseconds",
+]
diff --git a/script/analyze_build/utils/trace_parser.py b/script/analyze_build/utils/trace_parser.py
new file mode 100644
index 0000000000..b8c6002e5c
--- /dev/null
+++ b/script/analyze_build/utils/trace_parser.py
@@ -0,0 +1,224 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+"""
+Utilities for parsing and analyzing Clang -ftime-trace JSON files.
+
+These utilities are designed to handle large trace files safely by using
+streaming and incremental processing to avoid memory issues.
+"""
+
+from pathlib import Path
+from typing import Iterator, Dict, Any, List, Optional
+import ijson
+
+
+def iter_trace_files(trace_dir: Path, pattern: str = "*.json") -> Iterator[Path]:
+    """
+    Iterate over trace JSON files in a directory.
+
+    Args:
+        trace_dir: Directory containing trace files
+        pattern: Glob pattern for matching files (default: "*.json")
+
+    Yields:
+        Path objects for each matching trace file
+    """
+    trace_path = Path(trace_dir)
+    if not trace_path.exists():
+        raise FileNotFoundError(f"Trace directory not found: {trace_dir}")
+
+    for file_path in trace_path.rglob(pattern):
+        if file_path.is_file():
+            yield file_path
+
+
+def stream_events(trace_file: Path) -> Iterator[Dict[str, Any]]:
+    """
+    Stream events from a trace file without loading the entire file into memory.
+
+    Args:
+        trace_file: Path to the trace JSON file
+
+    Yields:
+        Individual event dictionaries
+    """
+    with open(trace_file, "rb") as f:
+        # Stream the traceEvents array
+        events = ijson.items(f, "traceEvents.item")
+        for event in events:
+            yield event
+
+
+def load_trace_metadata(trace_file: Path) -> Dict[str, Any]:
+    """
+    Load only the metadata from a trace file (not the events).
+
+    Args:
+        trace_file: Path to the trace JSON file
+
+    Returns:
+        Dictionary with metadata (e.g., beginningOfTime)
+    """
+    with open(trace_file, "rb") as f:
+        parser = ijson.parse(f)
+        metadata = {}
+        for prefix, event, value in parser:
+            if prefix == "beginningOfTime":
+                metadata["beginningOfTime"] = value
+            # Stop after getting metadata, don't parse events
+            if prefix.startswith("traceEvents"):
+                break
+        return metadata
+
+
+def filter_events(
+    events: Iterator[Dict[str, Any]],
+    event_names: Optional[List[str]] = None,
+    min_duration: Optional[int] = None,
+) -> Iterator[Dict[str, Any]]:
+    """
+    Filter events by name and/or minimum duration.
+
+    Args:
+        events: Iterator of event dictionaries
+        event_names: List of event names to include (None = all)
+        min_duration: Minimum duration in microseconds (None = no filter)
+
+    Yields:
+        Filtered event dictionaries
+    """
+    for event in events:
+        # Filter by name
+        if event_names is not None:
+            if event.get("name") not in event_names:
+                continue
+
+        # Filter by duration
+        if min_duration is not None:
+            if event.get("dur", 0) < min_duration:
+                continue
+
+        yield event
+
+
+def get_template_events(events: Iterator[Dict[str, Any]]) -> Iterator[Dict[str, Any]]:
+    """
+    Filter for template-related events.
+
+    Args:
+        events: Iterator of event dictionaries
+
+    Yields:
+        Template instantiation events
+    """
+    template_event_names = [
+        "InstantiateClass",
+        "InstantiateFunction",
+        "InstantiateVariable",
+        "ParseTemplate",
+    ]
+    return filter_events(events, event_names=template_event_names)
+
+
+def aggregate_by_name(events: Iterator[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
+    """
+    Aggregate events by name, computing total duration and count.
+
+    Args:
+        events: Iterator of event dictionaries
+
+    Returns:
+        Dictionary mapping event names to aggregated statistics
+    """
+    aggregated = {}
+
+    for event in events:
+        name = event.get("name", "Unknown")
+        duration = event.get("dur", 0)
+
+        if name not in aggregated:
+            aggregated[name] = {
+                "count": 0,
+                "total_duration": 0,
+                "max_duration": 0,
+                "min_duration": float("inf"),
+            }
+
+        aggregated[name]["count"] += 1
+        aggregated[name]["total_duration"] += duration
+        aggregated[name]["max_duration"] = max(
+            aggregated[name]["max_duration"], duration
+        )
+        aggregated[name]["min_duration"] = min(
+            aggregated[name]["min_duration"], duration
+        )
+
+    # Calculate averages
+    for name, stats in aggregated.items():
+        if stats["count"] > 0:
+            stats["avg_duration"] = stats["total_duration"] / stats["count"]
+        if stats["min_duration"] == float("inf"):
+            stats["min_duration"] = 0
+
+    return aggregated
+
+
+def get_top_events(
+    events: Iterator[Dict[str, Any]], n: int = 10, sort_by: str = "dur"
+) -> List[Dict[str, Any]]:
+    """
+    Get the top N events by a specified field.
+
+    Args:
+        events: Iterator of event dictionaries
+        n: Number of top events to return
+        sort_by: Field to sort by (default: 'dur' for duration)
+
+    Returns:
+        List of top N events
+    """
+    # We need to materialize events for sorting
+    # This is safe for top-N queries as we only keep N items
+    import heapq
+
+    # Use a min-heap to keep only top N items
+    # Include a counter to break ties and avoid comparing dicts
+    top_n = []
+    counter = 0
+
+    for event in events:
+        value = event.get(sort_by, 0)
+        if len(top_n) < n:
+            heapq.heappush(top_n, (value, counter, event))
+            counter += 1
+        elif value > top_n[0][0]:
+            heapq.heapreplace(top_n, (value, counter, event))
+            counter += 1
+
+    # Sort in descending order
+    return [event for _, _, event in sorted(top_n, reverse=True)]
+
+
+def extract_template_detail(event: Dict[str, Any]) -> Optional[str]:
+    """
+    Extract the template name/detail from an event's args.
+
+    Args:
+        event: Event dictionary
+
+    Returns:
+        Template detail string or None
+    """
+    args = event.get("args", {})
+    return args.get("detail")
+
+
+def microseconds_to_seconds(microseconds: int) -> float:
+    """Convert microseconds to seconds."""
+    return microseconds / 1_000_000
+
+
+def microseconds_to_milliseconds(microseconds: int) -> float:
+    """Convert microseconds to milliseconds."""
+    return microseconds / 1_000