diff --git a/.claude/skills/ck-build-analysis b/.claude/skills/ck-build-analysis
index 1543705a51..7113001772 100755
--- a/.claude/skills/ck-build-analysis
+++ b/.claude/skills/ck-build-analysis
@@ -183,9 +183,10 @@ import re
 import sys
 from collections import defaultdict
 from datetime import datetime
+from jinja2 import Environment, FileSystemLoader
 
 if len(sys.argv) < 4:
-    print("Usage: analyze.py <trace_file> <output_file> <target> <granularity> <build_time>")
+    print("Usage: analyze.py <trace_file> <output_file> <target> <granularity> <build_time> <template_dir>")
     sys.exit(1)
 
 trace_file = sys.argv[1]
@@ -193,6 +194,7 @@ output_file = sys.argv[2]
 target = sys.argv[3]
 granularity = sys.argv[4]
 build_time = sys.argv[5]
+template_dir = sys.argv[6]
 
 print(f'Loading trace file: {trace_file}')
 with open(trace_file, 'r') as f:
@@ -223,8 +225,7 @@ for event in data.get('traceEvents', []):
             template_stats[template_name]['count'] += 1
             template_stats[template_name]['total_dur'] += dur
 
-print('Sorting and generating report...')
-sorted_templates = sorted(template_stats.items(), key=lambda x: x[1]['total_dur'], reverse=True)
+print('Sorting data...')
 sorted_phases = sorted(phase_stats.items(), key=lambda x: x[1], reverse=True)
 top_individual.sort(key=lambda x: x['dur'], reverse=True)
 
@@ -233,126 +234,101 @@ total_trace_time = sum(phase_stats.values())
 total_events = len(data.get('traceEvents', []))
 total_inst = sum(s['count'] for s in template_stats.values())
 
-report = []
-report.append('# Composable Kernel Build Time Analysis Report')
-report.append('')
-report.append(f'**Generated:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
-report.append(f'**Target:** {target}')
-report.append(f'**Granularity:** {granularity}µs')
-report.append('')
-report.append('## Executive Summary')
-report.append('')
-report.append(f'- **Wall Clock Time:** {build_time} seconds')
-report.append(f'- **Trace Time:** {total_trace_time/1000:.1f} seconds')
-report.append(f'- **Template Instantiation Time:** {total_template_time/1000:.1f} seconds ({100*total_template_time/total_trace_time:.1f}% of trace)')
-report.append(f'- **Total Events Captured:** {total_events:,}')
-report.append(f'- **Total Template Instantiations:** {total_inst:,}')
-report.append(f'- **Unique Template Families:** {len(sorted_templates)}')
-report.append('')
-report.append('## Compilation Phase Breakdown')
-report.append('')
-report.append('| Phase | Time (ms) | Time (s) | % of Total |')
-report.append('|-------|-----------|----------|------------|')
-for phase, dur in sorted_phases[:20]:
-    pct = 100 * dur / total_trace_time
-    report.append(f'| {phase:<40} | {dur:>9.2f} | {dur/1000:>8.2f} | {pct:>9.1f}% |')
-report.append('')
-report.append('## Top 30 Most Expensive Individual Instantiations')
-report.append('')
-report.append('| Rank | Template | Type | Time (ms) |')
-report.append('|------|----------|------|-----------|')
-for i, inst in enumerate(top_individual[:30], 1):
-    detail = inst['detail'][:70] + '...' if len(inst['detail']) > 70 else inst['detail']
-    inst_type = 'Func' if inst['type'] == 'InstantiateFunction' else 'Class'
-    report.append(f'| {i:>4} | {detail:<70} | {inst_type:<5} | {inst["dur"]:>9.2f} |')
-report.append('')
-report.append('## Template Families by Total Time (Top 50)')
-report.append('')
-report.append('| Rank | Template Family | Count | Total (ms) | Avg (ms) | % of Total |')
-report.append('|------|-----------------|-------|------------|----------|------------|')
-for i, (name, stats) in enumerate(sorted_templates[:50], 1):
-    count = stats['count']
-    total = stats['total_dur']
-    avg = total / count if count > 0 else 0
-    pct = 100 * total / total_template_time if total_template_time > 0 else 0
-    display_name = name[:40] + '...' if len(name) > 40 else name
-    report.append(f'| {i:>4} | {display_name:<43} | {count:>5} | {total:>10.2f} | {avg:>8.2f} | {pct:>9.1f}% |')
-report.append('')
-report.append('## Template Families by Instantiation Count (Top 50)')
-report.append('')
-sorted_by_count = sorted(template_stats.items(), key=lambda x: x[1]['count'], reverse=True)
-report.append('| Rank | Template Family | Count | Total (ms) | Avg (ms) |')
-report.append('|------|-----------------|-------|------------|----------|')
-for i, (name, stats) in enumerate(sorted_by_count[:50], 1):
-    count = stats['count']
-    total = stats['total_dur']
-    avg = total / count if count > 0 else 0
-    display_name = name[:40] + '...' if len(name) > 40 else name
-    report.append(f'| {i:>4} | {display_name:<43} | {count:>5} | {total:>10.2f} | {avg:>8.2f} |')
-report.append('')
-report.append('## Key Insights')
-report.append('')
-report.append('### 1. Template Instantiation Impact')
-report.append(f'- Template instantiation accounts for {100*total_template_time/total_trace_time:.1f}% of total trace time')
-if len(sorted_templates) >= 10:
-    top10_pct = 100*sum(s[1]["total_dur"] for s in sorted_templates[:10])/total_template_time
-    report.append(f'- Top 10 template families account for {top10_pct:.1f}% of instantiation time')
-report.append('')
-report.append('### 2. Most Expensive Templates')
-if len(sorted_templates) > 0:
-    report.append(f'- **{sorted_templates[0][0]}**: {sorted_templates[0][1]["count"]:,} instantiations, {sorted_templates[0][1]["total_dur"]/1000:.2f}s total')
-if len(sorted_templates) > 1:
-    avg = sorted_templates[1][1]["total_dur"] / sorted_templates[1][1]["count"]
-    report.append(f'- **{sorted_templates[1][0]}**: {sorted_templates[1][1]["count"]:,} instantiations, {avg:.2f}ms average')
-report.append('')
-report.append('## Optimization Recommendations')
-report.append('')
-report.append('### Short Term')
-report.append('1. **Focus on High-Impact Templates**: Address top 10 families first')
-report.append('2. **Explicit Template Instantiation**: Pre-instantiate common configurations')
-report.append('3. **Extern Templates**: Mark frequently-used templates as extern in headers')
-report.append('')
-report.append('### Medium Term')
-report.append('1. **Precompiled Headers**: Include heavy templates in PCH')
-report.append('2. **Template Specialization**: Replace general templates with specialized versions')
-report.append('3. **Template Depth Reduction**: Simplify template hierarchies')
-report.append('')
-report.append('### Long Term')
-report.append('1. **Architectural Review**: Evaluate necessity of deep template metaprogramming')
-report.append('2. **C++20 Concepts**: Earlier constraint checking, fewer instantiations')
-report.append('3. **Build Caching**: Distributed build cache for template instantiations')
-report.append('')
-report.append('## Detailed Statistics')
-report.append('')
-report.append(f'- **Total Unique Templates:** {len(sorted_templates)}')
-report.append(f'- **Total Instantiations:** {total_inst:,}')
-if total_inst > 0:
-    report.append(f'- **Average Instantiation Time:** {total_template_time/total_inst:.3f}ms')
+# Prepare templates by time with calculated fields
+templates_by_time = []
+for name, stats in sorted(template_stats.items(), key=lambda x: x[1]['total_dur'], reverse=True):
+    templates_by_time.append((name, {
+        'count': stats['count'],
+        'total_dur': stats['total_dur'],
+        'avg': stats['total_dur'] / stats['count'] if stats['count'] > 0 else 0,
+        'pct': 100 * stats['total_dur'] / total_template_time if total_template_time > 0 else 0
+    }))
+
+# Prepare templates by count
+templates_by_count = []
+for name, stats in sorted(template_stats.items(), key=lambda x: x[1]['count'], reverse=True):
+    templates_by_count.append((name, {
+        'count': stats['count'],
+        'total_dur': stats['total_dur'],
+        'avg': stats['total_dur'] / stats['count'] if stats['count'] > 0 else 0
+    }))
+
+# Prepare top individual instantiations with friendly type names
+for inst in top_individual:
+    inst['inst_type'] = 'Func' if inst['type'] == 'InstantiateFunction' else 'Class'
+
+# Calculate additional metrics
+median_count = 0
 if len(template_stats) > 0:
     median_count = sorted([s["count"] for s in template_stats.values()])[len(template_stats)//2]
-    report.append(f'- **Median Template Family Count:** {median_count}')
-report.append('')
-report.append('---')
-report.append('')
-report.append(f'*Report generated using Clang -ftime-trace with {granularity}µs granularity*')
-report.append(f'*Analysis tool: ck-build-analysis*')
+
+top10_pct = 0
+if len(templates_by_time) >= 10 and total_template_time > 0:  # avoid dividing by a zero total
+    top10_pct = 100 * sum(s[1]["total_dur"] for s in templates_by_time[:10]) / total_template_time
+
+print('Rendering report with Jinja2...')
+# Set up Jinja2 environment with custom filters
+env = Environment(loader=FileSystemLoader(template_dir))
+
+def format_number(value):
+    """Format number with thousand separators"""
+    return f'{value:,}'
+
+def truncate(value, length):
+    """Truncate string to length with ellipsis"""
+    if len(value) > length:
+        return value[:length-3] + '...'
+    return value
+
+def pad(value, length):
+    """Pad string to specified length"""
+    return f'{value:<{length}}'
+
+env.filters['format_number'] = format_number
+env.filters['truncate'] = truncate
+env.filters['pad'] = pad
+
+# Load and render template
+template = env.get_template('build_analysis_report.md.jinja')
+report_content = template.render(
+    timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+    target=target,
+    granularity=granularity,
+    build_time=build_time,
+    trace_time_sec=f'{total_trace_time/1000:.1f}',
+    template_time_sec=f'{total_template_time/1000:.1f}',
+    template_pct=f'{100*total_template_time/total_trace_time:.1f}' if total_trace_time > 0 else '0.0',  # no phase time recorded: nothing to divide by
+    total_events=total_events,
+    total_instantiations=total_inst,
+    unique_families=len(template_stats),
+    total_trace_time=total_trace_time,
+    total_template_time=total_template_time,
+    phases=sorted_phases,
+    top_individual=top_individual,
+    templates_by_time=templates_by_time,
+    templates_by_count=templates_by_count,
+    median_count=median_count,
+    top10_pct=f'{top10_pct:.1f}'
+)
 
 with open(output_file, 'w') as f:
-    f.write('\n'.join(report))
+    f.write(report_content)
 
 print(f'Report generated: {output_file}')
-print(f'Total lines: {len(report)}')
+print(f'Report size: {len(report_content)} characters')  # len() of a str counts characters, not encoded bytes
 PYSCRIPT
 
-# Copy analysis script to container and run it
+# Copy analysis script and templates to container ("/." copies the directory contents, so an existing target dir is refreshed instead of nested into)
 docker cp "${ANALYSIS_SCRIPT}" "${CONTAINER_NAME}:/tmp/analyze.py"
+docker cp "${SCRIPT_DIR}/templates/." "${CONTAINER_NAME}:/tmp/ck_build_analysis_templates"
 
 docker exec "${CONTAINER_NAME}" python3 /tmp/analyze.py \
     "${TRACE_FILE}" \
     "/workspace/${OUTPUT_FILE}" \
     "${TARGET}" \
     "${GRANULARITY}" \
-    "${BUILD_TIME}"
+    "${BUILD_TIME}" \
+    "/tmp/ck_build_analysis_templates"
 
 # Copy report back to host
 docker cp "${CONTAINER_NAME}:/workspace/${OUTPUT_FILE}" "${PROJECT_ROOT}/${OUTPUT_FILE}"
@@ -360,6 +336,7 @@ docker cp "${CONTAINER_NAME}:/workspace/${OUTPUT_FILE}" "${PROJECT_ROOT}/${OUTPU
 # Cleanup
 rm -f "${ANALYSIS_SCRIPT}"
 docker exec "${CONTAINER_NAME}" rm -f /tmp/analyze.py
+docker exec "${CONTAINER_NAME}" rm -rf /tmp/ck_build_analysis_templates
 
 echo ""
 echo "═══════════════════════════════════════════════════════════════"
diff --git a/.claude/skills/templates/build_analysis_report.md.jinja b/.claude/skills/templates/build_analysis_report.md.jinja
new file mode 100644
index 0000000000..b6c4b2bbf5
--- /dev/null
+++ b/.claude/skills/templates/build_analysis_report.md.jinja
@@ -0,0 +1,95 @@
+# Composable Kernel Build Time Analysis Report
+
+**Generated:** {{ timestamp }}
+**Target:** {{ target }}
+**Granularity:** {{ granularity }}µs
+
+## Executive Summary
+
+- **Wall Clock Time:** {{ build_time }} seconds
+- **Trace Time:** {{ trace_time_sec }} seconds
+- **Template Instantiation Time:** {{ template_time_sec }} seconds ({{ template_pct }}% of trace)
+- **Total Events Captured:** {{ total_events|format_number }}
+- **Total Template Instantiations:** {{ total_instantiations|format_number }}
+- **Unique Template Families:** {{ unique_families }}
+
+## Compilation Phase Breakdown
+
+| Phase | Time (ms) | Time (s) | % of Total |
+|-------|-----------|----------|------------|
+{% for phase, dur in phases[:20] -%}
+| {{ phase|pad(40) }} | {{ "%9.2f"|format(dur) }} | {{ "%8.2f"|format(dur/1000) }} | {{ "%9.1f"|format(100 * dur / total_trace_time) }}% |
+{% endfor %}
+
+## Top 30 Most Expensive Individual Instantiations
+
+| Rank | Template | Type | Time (ms) |
+|------|----------|------|-----------|
+{% for inst in top_individual[:30] -%}
+| {{ "%4d"|format(loop.index) }} | {{ inst.detail|truncate(70) }} | {{ inst.inst_type|pad(5) }} | {{ "%9.2f"|format(inst.dur) }} |
+{% endfor %}
+
+## Template Families by Total Time (Top 50)
+
+| Rank | Template Family | Count | Total (ms) | Avg (ms) | % of Total |
+|------|-----------------|-------|------------|----------|------------|
+{% for name, stats in templates_by_time[:50] -%}
+| {{ "%4d"|format(loop.index) }} | {{ name|truncate(43)|pad(43) }} | {{ "%5d"|format(stats.count) }} | {{ "%10.2f"|format(stats.total_dur) }} | {{ "%8.2f"|format(stats.avg) }} | {{ "%9.1f"|format(stats.pct) }}% |
+{% endfor %}
+
+## Template Families by Instantiation Count (Top 50)
+
+| Rank | Template Family | Count | Total (ms) | Avg (ms) |
+|------|-----------------|-------|------------|----------|
+{% for name, stats in templates_by_count[:50] -%}
+| {{ "%4d"|format(loop.index) }} | {{ name|truncate(43)|pad(43) }} | {{ "%5d"|format(stats.count) }} | {{ "%10.2f"|format(stats.total_dur) }} | {{ "%8.2f"|format(stats.avg) }} |
+{% endfor %}
+
+## Key Insights
+
+### 1. Template Instantiation Impact
+- Template instantiation accounts for {{ template_pct }}% of total trace time
+{% if unique_families >= 10 -%}
+- Top 10 template families account for {{ top10_pct }}% of instantiation time
+{% endif %}
+
+### 2. Most Expensive Templates
+{% if templates_by_time|length > 0 -%}
+- **{{ templates_by_time[0][0] }}**: {{ templates_by_time[0][1].count|format_number }} instantiations, {{ "%.2f"|format(templates_by_time[0][1].total_dur/1000) }}s total
+{% endif -%}
+{% if templates_by_time|length > 1 -%}
+- **{{ templates_by_time[1][0] }}**: {{ templates_by_time[1][1].count|format_number }} instantiations, {{ "%.2f"|format(templates_by_time[1][1].avg) }}ms average
+{% endif %}
+
+## Optimization Recommendations
+
+### Short Term
+1. **Focus on High-Impact Templates**: Address top 10 families first
+2. **Explicit Template Instantiation**: Pre-instantiate common configurations
+3. **Extern Templates**: Mark frequently-used templates as extern in headers
+
+### Medium Term
+1. **Precompiled Headers**: Include heavy templates in PCH
+2. **Template Specialization**: Replace general templates with specialized versions
+3. **Template Depth Reduction**: Simplify template hierarchies
+
+### Long Term
+1. **Architectural Review**: Evaluate necessity of deep template metaprogramming
+2. **C++20 Concepts**: Earlier constraint checking, fewer instantiations
+3. **Build Caching**: Distributed build cache for template instantiations
+
+## Detailed Statistics
+
+- **Total Unique Templates:** {{ unique_families }}
+- **Total Instantiations:** {{ total_instantiations|format_number }}
+{% if total_instantiations > 0 -%}
+- **Average Instantiation Time:** {{ "%.3f"|format(total_template_time/total_instantiations) }}ms
+{% endif -%}
+{% if unique_families > 0 -%}
+- **Median Template Family Count:** {{ median_count }}
+{% endif %}
+
+---
+
+*Report generated using Clang -ftime-trace with {{ granularity }}µs granularity*
+*Analysis tool: ck-build-analysis*