diff --git a/script/analyze_build/docs/CHROME_TRACE_EXPORT.md b/script/analyze_build/docs/CHROME_TRACE_EXPORT.md
new file mode 100644
index 0000000000..de079f6b3a
--- /dev/null
+++ b/script/analyze_build/docs/CHROME_TRACE_EXPORT.md
@@ -0,0 +1,960 @@
+# Chrome Trace Export for Cross-Validation
+
+- **Status**: Design Document
+- **Author**: Build Analysis Team
+- **Date**: January 2026
+- **Version**: 1.0
+
+## Executive Summary
+
+This document proposes adding Chrome Trace Event Format export capabilities to the `analyze_build` library to enable cross-validation with the existing `ninja_json_converter.py` tool. The two tools serve complementary purposes, and this enhancement will make it possible to verify that they report consistent data.
+
+## Background
+
+### Current State: Two Complementary Tools
+
+The project currently has two distinct build analysis tools:
+
+#### 1. `ninja_json_converter.py` - Build System Monitoring
+- **Purpose**: Monitor build-level parallelism and efficiency
+- **Primary Users**: Build engineers, CI/CD optimization teams
+- **Key Metrics**: Worker utilization, critical path, slow compilation units
+- **Output Format**: Chrome Trace Event Format (JSON)
+- **Granularity**: File-level (compilation units)
+- **Visualization**: Perfetto / Chrome Tracing UI
+- **Use Cases**:
+ - Is our build sharding efficient?
+ - Which files are compilation bottlenecks?
+ - How well are we utilizing available CPU cores?
+ - What's the critical path in our build?
+
+#### 2. `analyze_build` Library - Compiler Performance Analysis
+- **Purpose**: Deep analysis of C++ template metaprogramming costs
+- **Primary Users**: C++ developers, library maintainers, performance engineers
+- **Key Metrics**: Template instantiation times, template relationships, compiler event breakdown
+- **Output Format**: Pandas DataFrames for statistical analysis
+- **Granularity**: Template-level and compiler event-level (within compilation)
+- **Visualization**: Jupyter notebooks with statistical analysis
+- **Use Cases**:
+ - Which templates are most expensive to instantiate?
+ - What are the template dependency relationships?
+ - How can we optimize our metaprogramming patterns?
+ - How can we measure build-time improvements that result from better metaprogramming?
+ - What percentage of build time is template instantiation?
+
+### The Problem: Need for Cross-Validation
+
+Currently, these tools operate independently with no mechanism to verify consistency. This creates several challenges:
+
+1. **Data Accuracy**: No way to verify both tools are parsing the same underlying data correctly
+2. **Discrepancy Detection**: When numbers differ, unclear which tool is correct
+3. **Cross-Referencing**: Difficult to correlate findings (e.g., "slow file in ninja" vs "high template time in analyzer")
+4. **Debugging**: Hard to diagnose when tools report different build times
+5. **Trust**: Users may question which tool's numbers to believe
+
+## Goals and Non-Goals
+
+### Primary Goals
+
+1. **Enable Cross-Validation**: Export analyze_build data to Chrome Trace format for comparison with ninja_json_converter
+2. **Verify Consistency**: Provide utilities to compare outputs and identify discrepancies
+3. **Sanity Checking**: Quick visual verification in Perfetto that data looks correct
+4. **Cross-Reference Findings**: Correlate slow files with expensive templates
+
+### Secondary Goals
+
+1. **Template Event Visualization**: Optionally export template instantiation events as additional trace layer
+2. **Debugging Support**: Help diagnose when tools report different results
+3. **Documentation**: Clear workflow for validation process
+
+### Explicit Non-Goals
+
+1. **Not Replacing ninja_json_converter**: The tools serve different purposes and both should continue to exist
+2. **Not Full-Featured Visualization**: analyze_build focuses on statistical analysis, not interactive timelines
+3. **Not Advanced Timeline Features**: Keep it simple - just export for validation
+4. **Not Multi-Build Comparison**: ninja_json_converter already handles this well
+
+## Technical Design
+
+### Architecture Overview
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ analyze_build Library │
+├─────────────────────────────────────────────────────────────┤
+│ │
+│ ┌──────────────┐ ┌──────────────┐ │
+│ │ NinjaParser │─────▶│ builds_df │ │
+│ └──────────────┘ └──────┬───────┘ │
+│ │ │
+│ ┌──────────────┐ ┌──────▼───────┐ │
+│ │ TraceParser │─────▶│ events_df │ │
+│ └──────────────┘ └──────┬───────┘ │
+│ │ │
+│ ┌──────▼───────────┐ │
+│ │ ChromeTraceExporter│ │
+│ └──────┬───────────┘ │
+│ │ │
+│ ┌──────▼───────────┐ │
+│ │ trace_events │ │
+│ │ (Chrome Format) │ │
+│ └──────┬───────────┘ │
+│ │ │
+└───────────────────────────────┼──────────────────────────────┘
+ │
+ ┌───────────▼────────────┐
+ │ Validation Utilities │
+ └───────────┬────────────┘
+ │
+ ┌───────────▼────────────┐
+ │ ninja_json_converter │
+ │ output │
+ └────────────────────────┘
+```
+
+### New Module: `trace_analysis/chrome_trace.py`
+
+```python
+"""
+Chrome Trace Event Format export for cross-validation.
+
+Exports trace analysis data to Chrome Trace Event Format compatible
+with ninja_json_converter.py output for validation purposes.
+"""
+
+from typing import Dict, List, Optional, Any
+import pandas as pd
+
+
+class ChromeTraceExporter:
+ """Export trace analysis data to Chrome Trace Event Format."""
+
+ @staticmethod
+ def export_ninja_timeline(
+ builds_df: pd.DataFrame,
+ process_id: int = 1,
+ include_metadata: bool = True
+ ) -> Dict[str, Any]:
+ """
+ Export ninja build timeline to Chrome Trace format.
+
+ Creates trace events compatible with ninja_json_converter.py output
+ for cross-validation purposes.
+
+ Args:
+ builds_df: DataFrame with columns: target, start_ms, end_ms,
+ duration_ms, worker_id, (optional) category
+ process_id: Process ID for trace events (default: 1)
+ include_metadata: Include trace metadata (default: True)
+
+ Returns:
+ Dictionary in Chrome Trace Event Format:
+ {
+ 'traceEvents': [...],
+ 'displayTimeUnit': 'ms',
+ 'otherData': {...}
+ }
+
+ Example:
+ >>> trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df)
+ >>> with open('trace.json', 'w') as f:
+ ... json.dump(trace_data, f)
+ """
+
+ @staticmethod
+ def export_template_events(
+ instantiations_df: pd.DataFrame,
+ templates_df: pd.DataFrame,
+ builds_df: pd.DataFrame,
+ process_id: int = 1,
+ granularity_us: int = 50000
+ ) -> Dict[str, Any]:
+ """
+ Export template instantiation events as Chrome Trace layer.
+
+ Creates template-level trace events that can be overlaid on the
+ ninja build timeline for detailed compiler analysis.
+
+ Args:
+ instantiations_df: Template instantiation events
+ templates_df: Template definitions
+ builds_df: Ninja builds (for timing alignment)
+ process_id: Process ID for trace events
+ granularity_us: Minimum duration threshold in microseconds
+
+ Returns:
+ Chrome Trace Event Format dictionary with template events
+
+ Note:
+ Template events are aligned with ninja build timing and
+ filtered by granularity threshold to reduce trace size.
+ """
+
+ @staticmethod
+ def merge_traces(
+ ninja_trace: Dict[str, Any],
+ template_trace: Dict[str, Any]
+ ) -> Dict[str, Any]:
+ """
+ Merge ninja and template traces into single trace file.
+
+ Combines build-level and template-level events for unified
+ visualization in Perfetto.
+
+ Args:
+ ninja_trace: Ninja build timeline trace
+ template_trace: Template instantiation trace
+
+ Returns:
+ Merged trace with both event types
+ """
+```
+
+### New Module: `trace_analysis/validation.py`
+
+```python
+"""
+Validation utilities for cross-checking trace analysis tools.
+
+Compares outputs from analyze_build and ninja_json_converter to
+verify data consistency and identify discrepancies.
+"""
+
+from typing import Dict, List, Any, Optional
+import pandas as pd
+
+
+class TraceValidator:
+ """Validate consistency between trace analysis tools."""
+
+ @staticmethod
+ def compare_traces(
+ analyzer_trace: Dict[str, Any],
+ ninja_converter_trace: Dict[str, Any],
+ tolerance_ms: float = 1.0
+ ) -> Dict[str, Any]:
+ """
+ Compare Chrome Trace outputs from both tools.
+
+ Validates that analyze_build and ninja_json_converter produce
+ consistent results from the same underlying data.
+
+ Args:
+ analyzer_trace: Trace from ChromeTraceExporter
+ ninja_converter_trace: Trace from ninja_json_converter.py
+ tolerance_ms: Acceptable time difference in milliseconds
+
+ Returns:
+ Validation report:
+ {
+ 'total_time_match': bool,
+ 'total_time_diff_ms': float,
+ 'event_count_match': bool,
+ 'event_count_diff': int,
+ 'file_discrepancies': [
+ {
+ 'file': str,
+ 'analyzer_ms': float,
+ 'ninja_ms': float,
+ 'diff_ms': float,
+ 'diff_pct': float
+ }
+ ],
+ 'summary': str
+ }
+ """
+
+ @staticmethod
+ def validate_ninja_log_parsing(
+ builds_df: pd.DataFrame,
+ ninja_log_path: str
+ ) -> Dict[str, Any]:
+ """
+ Validate that NinjaLogParser correctly parsed .ninja_log.
+
+ Cross-checks parsed DataFrame against raw .ninja_log file
+ to ensure no data loss or corruption.
+
+ Args:
+ builds_df: Parsed builds DataFrame
+ ninja_log_path: Path to original .ninja_log file
+
+ Returns:
+ Validation report with any parsing issues
+ """
+
+ @staticmethod
+ def generate_validation_report(
+ validation_results: Dict[str, Any],
+ output_path: Optional[str] = None
+ ) -> str:
+ """
+ Generate human-readable validation report.
+
+ Creates formatted report of validation results for review.
+
+ Args:
+ validation_results: Results from compare_traces()
+ output_path: Optional path to save report
+
+ Returns:
+ Formatted report string
+ """
+```
+
+### Data Flow
+
+```
+1. Parse .ninja_log
+ └─> NinjaLogParser.parse() -> builds_df
+
+2. Export to Chrome Trace
+ └─> ChromeTraceExporter.export_ninja_timeline(builds_df) -> trace_data
+
+3. Save trace file
+ └─> json.dump(trace_data, 'analyzer_trace.json')
+
+4. Generate ninja_json_converter trace (separately)
+ └─> python ninja_json_converter.py .ninja_log -o ninja_trace.json
+
+5. Validate consistency
+ └─> TraceValidator.compare_traces(analyzer_trace, ninja_trace) -> report
+
+6. Review discrepancies
+ └─> TraceValidator.generate_validation_report(report)
+```
+
+## Chrome Trace Event Format Specification
+
+### Event Structure
+
+Each trace event follows the Chrome Trace Event Format:
+
+```json
+{
+ "name": "target_name.o",
+ "cat": "compile",
+ "ph": "X",
+ "ts": 1234567890,
+ "dur": 5000000,
+ "pid": 1,
+ "tid": 3,
+ "args": {
+ "output": "target_name.o",
+ "duration_ms": 5000,
+ "cmd_hash": "abc123"
+ }
+}
+```
+
+**Field Descriptions:**
+- `name`: Target name (file being built)
+- `cat`: Category (compile, link_shared, link_executable, archive, test, other)
+- `ph`: Phase ("X" for complete events)
+- `ts`: Timestamp in microseconds
+- `dur`: Duration in microseconds
+- `pid`: Process ID (1 for ninja builds)
+- `tid`: Thread ID (worker ID)
+- `args`: Additional metadata
+
+### Trace File Structure
+
+```json
+{
+ "traceEvents": [
+ { /* event 1 */ },
+ { /* event 2 */ },
+ ...
+ ],
+ "displayTimeUnit": "ms",
+ "otherData": {
+ "version": "1.0",
+ "generator": "trace_analysis",
+ "source": "analyze_build"
+ }
+}
+```
+
+### Compatibility with ninja_json_converter
+
+The export format must be **semantically equivalent** to ninja_json_converter output for the same input data. Byte-for-byte identity is not required; only the differences listed below are permitted:
+
+**Acceptable Differences:**
+- `otherData.generator`: Different tool name
+- Event ordering: May differ if timestamps are identical
+- Floating point precision: ±0.001ms acceptable
+
+**Must Match (exactly, unless a tolerance is noted):**
+- Total build time (within tolerance)
+- Per-file durations (within tolerance)
+- Worker assignments
+- Event counts
+- Category assignments
+
+## Validation Strategy
+
+### Validation Checks
+
+1. **Total Build Time**
+ - Sum of all event durations (cumulative worker time; this exceeds wall-clock build time when targets build in parallel)
+ - Should match within ±1ms (rounding tolerance)
+
+2. **Event Count**
+ - Number of trace events
+ - Should match exactly
+
+3. **Per-File Duration**
+ - Duration for each compilation unit
+ - Should match within ±1ms per file
+
+4. **Worker Assignment**
+ - Thread ID (worker) for each event
+ - Should match exactly (deterministic algorithm)
+
+5. **Category Assignment**
+ - Event category based on file extension
+ - Should match exactly
+
+### Expected Discrepancies
+
+Some differences are expected and acceptable:
+
+1. **Timestamp Precision**: Microsecond rounding differences
+2. **Event Ordering**: When timestamps are identical
+3. **Metadata Fields**: Different tool names, versions
+4. **Floating Point**: Minor precision differences (< 0.001ms)
+
+### Validation Workflow
+
+```python
+# 1. Generate trace from analyze_build
+from trace_analysis import NinjaLogParser, ChromeTraceExporter
+import json
+
+builds = NinjaLogParser.parse(Path('.ninja_log'))
+builds_df = NinjaLogParser.to_dataframe(builds)
+analyzer_trace = ChromeTraceExporter.export_ninja_timeline(builds_df)
+
+with open('analyzer_trace.json', 'w') as f:
+ json.dump(analyzer_trace, f)
+
+# 2. Generate trace from ninja_json_converter (shell)
+# $ python script/ninja_json_converter.py .ninja_log -o ninja_trace.json
+
+# 3. Load both traces
+with open('ninja_trace.json') as f:
+ ninja_trace = json.load(f)
+
+# 4. Validate
+from trace_analysis import TraceValidator
+
+report = TraceValidator.compare_traces(analyzer_trace, ninja_trace)
+
+# 5. Review results
+print(TraceValidator.generate_validation_report(report))
+```
+
+### Validation Report Format
+
+```
+=== Trace Validation Report ===
+
+Overall Status: PASS / FAIL
+
+Build Statistics:
+ Total Events: 1,234 (analyzer) vs 1,234 (ninja) ✓
+ Total Time: 123.456s (analyzer) vs 123.457s (ninja) ✓ (diff: 0.001s)
+
+Worker Assignment:
+ Match Rate: 100% (1,234/1,234 events) ✓
+
+Per-File Duration:
+ Files Checked: 1,234
+ Exact Matches: 1,230 (99.7%)
+ Within Tolerance: 4 (0.3%)
+ Outside Tolerance: 0 (0.0%) ✓
+
+Discrepancies:
+ file1.o: 1234ms (analyzer) vs 1235ms (ninja) - diff: 1ms (0.08%)
+ file2.o: 5678ms (analyzer) vs 5677ms (ninja) - diff: 1ms (0.02%)
+
+Conclusion: Tools are consistent within acceptable tolerance.
+```
+
+## Implementation Plan
+
+### Phase 1: Basic Export (Week 1)
+
+**Deliverables:**
+- `trace_analysis/chrome_trace.py` with `export_ninja_timeline()`
+- Unit tests for Chrome Trace format
+- Integration test comparing with ninja_json_converter
+
+**Tasks:**
+- [ ] Implement ChromeTraceExporter class
+- [ ] Add event categorization logic
+- [ ] Write unit tests for event generation
+- [ ] Test with sample .ninja_log files
+- [ ] Verify format matches ninja_json_converter exactly
+
+**Success Criteria:**
+- Exports valid Chrome Trace JSON
+- Loads correctly in Perfetto
+- Matches ninja_json_converter output for same input
+
+### Phase 2: Validation Utilities (Week 1-2)
+
+**Deliverables:**
+- `trace_analysis/validation.py` with comparison utilities
+- Validation report generator
+- Documentation of validation workflow
+
+**Tasks:**
+- [ ] Implement TraceValidator class
+- [ ] Add comparison algorithms
+- [ ] Create validation report formatter
+- [ ] Write tests for validation logic
+- [ ] Document expected discrepancies
+
+**Success Criteria:**
+- Accurately identifies discrepancies
+- Generates clear validation reports
+- Handles edge cases gracefully
+
+### Phase 3: Template Event Export (Week 2)
+
+**Deliverables:**
+- Template event export in `chrome_trace.py`
+- Merged trace generation
+- Examples in notebook
+
+**Tasks:**
+- [ ] Implement `export_template_events()`
+- [ ] Add timing alignment logic
+- [ ] Implement granularity filtering
+- [ ] Add merge functionality
+- [ ] Test with real -ftime-trace data
+
+**Success Criteria:**
+- Template events align with ninja timeline
+- Granularity filtering works correctly
+- Merged traces load in Perfetto
+
+### Phase 4: Documentation & Examples (Week 2-3)
+
+**Deliverables:**
+- Updated README with validation workflow
+- Notebook section demonstrating export
+- API documentation
+- Validation guide
+
+**Tasks:**
+- [ ] Add notebook section for Chrome Trace export
+- [ ] Document validation workflow
+- [ ] Create troubleshooting guide
+- [ ] Add API documentation
+- [ ] Write migration guide for ninja_json_converter users
+
+**Success Criteria:**
+- Clear documentation of validation process
+- Working examples in notebook
+- Users can successfully validate traces
+
+## Testing Strategy
+
+### Unit Tests
+
+```python
+# test_chrome_trace.py
+
+def test_export_ninja_timeline_format():
+ """Verify Chrome Trace format is valid."""
+
+def test_export_ninja_timeline_compatibility():
+ """Verify compatibility with ninja_json_converter."""
+
+def test_event_categorization():
+ """Verify file extension -> category mapping."""
+
+def test_worker_assignment():
+ """Verify worker IDs match ninja_json_converter."""
+```
+
+### Integration Tests
+
+```python
+# test_validation.py
+
+def test_compare_identical_traces():
+ """Validation passes for identical traces."""
+
+def test_detect_discrepancies():
+ """Validation detects timing differences."""
+
+def test_tolerance_handling():
+ """Small differences within tolerance pass."""
+```
+
+### Validation Tests
+
+```python
+# test_cross_validation.py
+
+def test_real_ninja_log():
+ """Compare with actual ninja_json_converter output."""
+
+def test_large_build():
+ """Handle large builds (1000+ files)."""
+
+def test_incremental_build():
+ """Handle incremental build scenarios."""
+```
+
+## Usage Examples
+
+### Basic Export
+
+```python
+from pathlib import Path
+from trace_analysis import NinjaLogParser, ChromeTraceExporter
+import json
+
+# Parse ninja log
+builds = NinjaLogParser.parse(Path('build/.ninja_log'))
+builds_df = NinjaLogParser.to_dataframe(builds)
+
+# Export to Chrome Trace
+trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df)
+
+# Save for Perfetto
+with open('build_trace.json', 'w') as f:
+ json.dump(trace_data, f)
+
+print("Open build_trace.json in chrome://tracing or https://ui.perfetto.dev")
+```
+
+### Cross-Validation
+
+```python
+from trace_analysis import ChromeTraceExporter, TraceValidator
+import json
+import subprocess
+
+# Generate trace from analyze_build
+analyzer_trace = ChromeTraceExporter.export_ninja_timeline(builds_df)
+
+# Generate trace from ninja_json_converter
+subprocess.run([
+ 'python', 'script/ninja_json_converter.py',
+ 'build/.ninja_log',
+ '-o', 'ninja_trace.json'
+])
+
+# Load ninja_json_converter output
+with open('ninja_trace.json') as f:
+ ninja_trace = json.load(f)
+
+# Validate
+report = TraceValidator.compare_traces(analyzer_trace, ninja_trace)
+
+# Print report
+print(TraceValidator.generate_validation_report(report))
+
+# Check if validation passed
+if report['total_time_match'] and report['event_count_match']:
+ print("✓ Validation PASSED - Tools are consistent")
+else:
+ print("✗ Validation FAILED - Discrepancies found")
+ for disc in report['file_discrepancies']:
+ print(f" {disc['file']}: {disc['diff_ms']}ms difference")
+```
+
+### Template Event Export
+
+```python
+from trace_analysis import (
+ TraceParser, TraceTransformer,
+ ChromeTraceExporter, find_trace_files
+)
+
+# Parse -ftime-trace files
+trace_files = find_trace_files(Path('build'))
+all_events = []
+all_instantiations = []
+
+for trace_file in trace_files:
+ events = TraceParser.parse(trace_file)
+ schema = TraceTransformer.to_enhanced_schema(events, file_id=0)
+ all_instantiations.append(schema['instantiations'])
+
+instantiations_df = pd.concat(all_instantiations, ignore_index=True)
+
+# Export template events
+template_trace = ChromeTraceExporter.export_template_events(
+ instantiations_df,
+ templates_df,
+ builds_df,
+ granularity_us=50000 # Only events > 50ms
+)
+
+# Merge with ninja timeline
+merged_trace = ChromeTraceExporter.merge_traces(
+ ninja_trace,
+ template_trace
+)
+
+# Save merged trace
+with open('merged_trace.json', 'w') as f:
+ json.dump(merged_trace, f)
+```
+
+### Notebook Integration
+
+```python
+# In comprehensive_example.ipynb
+
+## Chrome Trace Export for Validation
+
+# Export ninja timeline
+from trace_analysis import ChromeTraceExporter
+import json
+
+trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df)
+
+# Save trace
+with open('../data/analyzer_trace.json', 'w') as f:
+ json.dump(trace_data, f, indent=2)
+
+print(f"Exported {len(trace_data['traceEvents'])} events")
+print(f"Total build time: {sum(e['dur'] for e in trace_data['traceEvents']) / 1e6:.2f}s")
+
+# Validate against ninja_json_converter
+# (Assuming ninja_trace.json was generated separately)
+with open('../data/ninja_trace.json') as f:
+ ninja_trace = json.load(f)
+
+from trace_analysis import TraceValidator
+
+report = TraceValidator.compare_traces(trace_data, ninja_trace)
+print(TraceValidator.generate_validation_report(report))
+```
+
+## Open Questions
+
+### Critical Questions
+
+1. **Data Consistency**
+ - Q: Do you currently see discrepancies between the tools?
+ - Q: What tolerance is acceptable? (±1ms suggested)
+ - Q: Are there known sources of differences?
+
+2. **Validation Workflow**
+ - Q: How often do you need to cross-validate?
+ - Q: Should this be automated in CI?
+ - Q: What triggers a validation run?
+
+3. **Template Event Export**
+ - Q: Should template events be in same file as ninja events?
+ - Q: Or separate files for different analysis?
+ - Q: Priority: High, Medium, or Low?
+
+### Technical Questions
+
+4. **Output Format**
+ - Q: Must we match ninja_json_converter format exactly?
+ - Q: Or can we use enhanced format with metadata?
+ - Q: Is backward compatibility required?
+
+5. **Performance**
+ - Q: What's the largest build to support?
+ - Q: Number of targets? (hundreds, thousands, tens of thousands?)
+ - Q: Should we implement sampling for huge builds?
+
+## Success Metrics
+
+### Functional Metrics
+
+- ✅ Exports valid Chrome Trace JSON
+- ✅ Loads correctly in Perfetto
+- ✅ Matches ninja_json_converter output (within tolerance)
+- ✅ Validation detects discrepancies accurately
+- ✅ Clear validation reports
+
+### Quality Metrics
+
+- ✅ 100% unit test coverage for new modules
+- ✅ Integration tests with real data pass
+- ✅ Documentation complete and clear
+- ✅ Examples work in notebook
+
+### Performance Metrics
+
+- ✅ Export completes in < 1s for 1000 files
+- ✅ Validation completes in < 5s for 1000 files
+- ✅ Memory usage < 100MB for typical builds
+
+## Future Enhancements
+
+### Potential Follow-Up Features
+
+1. **Automated Validation in CI**
+ - Run validation on every build
+ - Fail CI if discrepancies exceed threshold
+ - Track validation metrics over time
+
+2. **Differential Analysis**
+ - Compare traces from different builds
+ - Identify performance regressions
+ - Track optimization progress
+
+3. **Enhanced Visualization**
+ - Plotly timeline charts in notebooks
+ - Interactive exploration of discrepancies
+ - Side-by-side comparison views
+
+4. **Template Optimization Recommendations**
+ - Correlate slow files with expensive templates
+ - Suggest optimization targets
+ - Estimate potential improvements
+
+## References
+
+- [Chrome Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview)
+- [Perfetto UI](https://ui.perfetto.dev)
+- [Clang -ftime-trace Documentation](https://releases.llvm.org/11.0.0/tools/clang/docs/ClangCommandLineReference.html#cmdoption-clang-ftime-trace)
+- [Ninja Build System](https://ninja-build.org/)
+
+## Appendix A: Chrome Trace Event Format Details
+
+### Complete Event Structure
+
+```json
+{
+ "name": "event_name",
+ "cat": "category",
+ "ph": "X",
+ "ts": 1234567890,
+ "dur": 5000000,
+ "pid": 1,
+ "tid": 3,
+ "args": {
+ "custom_field": "value"
+ }
+}
+```
+
+### Phase Types
+
+- `X`: Complete event (has duration)
+- `B`: Begin event
+- `E`: End event
+- `i`: Instant event
+- `M`: Metadata event
+
+For build traces, we use `X` (complete events) exclusively.
+
+### Category Conventions
+
+Standard categories for build events:
+
+- `compile`: Compilation of source files into object files (.o, .obj)
+- `link_shared`: Shared library linking (.so, .dll, .dylib)
+- `link_executable`: Executable linking (.exe, .out)
+- `archive`: Static library creation (.a, .lib)
+- `test`: Test execution
+- `other`: Other build steps
+
+## Appendix B: Validation Algorithm
+
+### Comparison Algorithm
+
+```python
+def compare_events(event1, event2, tolerance_ms=1.0):
+ """Compare two trace events for equivalence."""
+
+ # Must match exactly
+ if event1['name'] != event2['name']:
+ return False, "Name mismatch"
+ if event1['tid'] != event2['tid']:
+ return False, "Worker ID mismatch"
+ if event1['cat'] != event2['cat']:
+ return False, "Category mismatch"
+
+ # Must match within tolerance
+ dur1_ms = event1['dur'] / 1000
+ dur2_ms = event2['dur'] / 1000
+ diff_ms = abs(dur1_ms - dur2_ms)
+
+ if diff_ms > tolerance_ms:
+ return False, f"Duration mismatch: {diff_ms}ms"
+
+ return True, "Match"
+```
+
+### Discrepancy Categorization
+
+**Critical**: Must be fixed
+- Total time difference > 1%
+- Event count mismatch
+- Worker assignment errors
+
+**Warning**: Should investigate
+- Per-file duration > 1ms difference
+- Category mismatches
+- Timestamp ordering issues
+
+**Info**: Acceptable
+- Floating point precision differences
+- Metadata differences
+- Event ordering when timestamps identical
+
+## Appendix C: Migration Guide
+
+### For ninja_json_converter Users
+
+If you currently use `ninja_json_converter.py`, you can continue to do so. The new Chrome Trace export in `analyze_build` is complementary, not a replacement.
+
+**When to use ninja_json_converter:**
+- Quick build timeline visualization
+- Build system optimization
+- CI/CD monitoring
+- Multi-build comparison
+
+**When to use analyze_build Chrome Trace export:**
+- Cross-validation with template analysis
+- Verifying data consistency
+- Debugging discrepancies
+- Correlating build and template metrics
+
+**Using both together:**
+```bash
+# Generate trace from ninja_json_converter
+python script/ninja_json_converter.py build/.ninja_log -o ninja_trace.json
+
+# Generate trace from analyze_build
+python -c "
+from pathlib import Path
+from trace_analysis import NinjaLogParser, ChromeTraceExporter
+import json
+
+builds = NinjaLogParser.parse(Path('build/.ninja_log'))
+builds_df = NinjaLogParser.to_dataframe(builds)
+trace = ChromeTraceExporter.export_ninja_timeline(builds_df)
+
+with open('analyzer_trace.json', 'w') as f:
+ json.dump(trace, f)
+"
+
+# Compare
+python -c "
+from trace_analysis import TraceValidator
+import json
+
+with open('ninja_trace.json') as f:
+ ninja = json.load(f)
+with open('analyzer_trace.json') as f:
+ analyzer = json.load(f)
+
+report = TraceValidator.compare_traces(analyzer, ninja)
+print(TraceValidator.generate_validation_report(report))
+"
diff --git a/script/analyze_build/docs/PERFETTO_VISUALIZATION.md b/script/analyze_build/docs/PERFETTO_VISUALIZATION.md
new file mode 100644
index 0000000000..fb5eacd7e2
--- /dev/null
+++ b/script/analyze_build/docs/PERFETTO_VISUALIZATION.md
@@ -0,0 +1,231 @@
+# Perfetto Visualization Guide
+
+This guide shows how to visualize ninja build timelines in Perfetto UI using the `trace_analysis` library.
+
+## Quick Start
+
+### Command Line Usage
+
+```bash
+# Run the example script
+python examples/perfetto_visualization_example.py path/to/.ninja_log
+
+# This will:
+# 1. Parse the ninja log
+# 2. Assign workers for parallelism visualization
+# 3. Export to Chrome Trace format
+# 4. Save to build_trace.json
+```
+
+### Jupyter Notebook Usage
+
+```python
+from pathlib import Path
+from trace_analysis import NinjaLogParser, ChromeTraceExporter
+from trace_analysis.perfetto_display import display_perfetto, print_trace_summary
+
+# Parse ninja log
+builds = NinjaLogParser.parse(Path('build/.ninja_log'))
+builds_df = NinjaLogParser.to_dataframe(builds)
+builds_df = NinjaLogParser.assign_workers(builds_df)
+
+# Export to Chrome Trace format
+trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df)
+
+# Print summary
+print_trace_summary(trace_data)
+
+# Display in Perfetto UI (embedded in notebook)
+display_perfetto(trace_data)
+
+# Or save to file for large traces
+from trace_analysis.perfetto_display import save_and_link
+save_and_link(trace_data, '../data/build_trace.json')
+```
+
+## What You Get
+
+The Chrome Trace export provides:
+
+- **Build Timeline**: Visual representation of when each target was built
+- **Parallelism Analysis**: See how many workers were active at any time
+- **Category Breakdown**: Targets categorized by type (compile, link, archive, etc.)
+- **Duration Analysis**: Identify slow compilation units
+- **Critical Path**: Understand build dependencies and bottlenecks
+
+## Viewing in Perfetto UI
+
+### Option 1: Embedded in Jupyter (Small Traces)
+
+For traces < 10MB, use `display_perfetto()` to embed directly in the notebook:
+
+```python
+display_perfetto(trace_data, height=600)
+```
+
+### Option 2: Manual Upload (Large Traces)
+
+For larger traces, save to file and upload manually:
+
+```python
+ChromeTraceExporter.export_to_file(trace_data, 'build_trace.json')
+```
+
+Then:
+1. Go to https://ui.perfetto.dev
+2. Click "Open trace file"
+3. Select your `build_trace.json`
+
+Or drag and drop the file directly into Perfetto UI.
+
+## DataFrame Schema
+
+The `builds_df` DataFrame has the following columns:
+
+| Column | Type | Description |
+|--------|------|-------------|
+| `target` | str | Build target name (e.g., "obj/foo.o") |
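+| `start_ms` | int64 | Start time in milliseconds as recorded in `.ninja_log` (relative to the start of the ninja invocation, not the Unix epoch) |
+| `end_ms` | int64 | End time in milliseconds, on the same clock as `start_ms` |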
+| `duration_ms` | int32 | Build duration in milliseconds |
+| `cmd_hash` | str | Command hash from ninja |
+| `worker_id` | int16 | Assigned worker ID (0-based); `-1` until `NinjaLogParser.assign_workers()` has been called |
+
+### Automatic Category Assignment
+
+The Chrome Trace exporter automatically categorizes targets based on file extension and target name:
+
+- `.o`, `.obj` → `compile`
+- `.a`, `.lib` → `archive`
+- `.so`, `.dll`, `.dylib` → `link_shared`
+- `.exe`, `.out` → `link_executable`
+- Contains "test" → `test`
+- Everything else → `other`
+
+## Chrome Trace Event Format
+
+Each build target is exported as a Chrome Trace event:
+
+```json
+{
+ "name": "obj/foo.o",
+ "cat": "compile",
+ "ph": "X",
+ "ts": 1234567890000,
+ "dur": 5000000,
+ "pid": 1,
+ "tid": 3,
+ "args": {
+ "output": "obj/foo.o",
+ "duration_ms": 5000,
+ "cmd_hash": "abc123"
+ }
+}
+```
+
+## Comparison with ninja_json_converter.py
+
+The `trace_analysis` library provides similar functionality to `ninja_json_converter.py` but with additional features:
+
+### Similarities
+- Both parse `.ninja_log` files
+- Both export to Chrome Trace Event Format
+- Both can be viewed in Perfetto UI
+
+### Differences
+
+| Feature | ninja_json_converter.py | trace_analysis |
+|---------|------------------------|----------------|
+| **Primary Use** | Quick build visualization | Integrated analysis workflow |
+| **Output** | Chrome Trace JSON only | DataFrames + Chrome Trace |
+| **Analysis** | External (Perfetto UI) | In-notebook with pandas |
+| **Template Data** | No | Yes (with -ftime-trace) |
+| **Worker Assignment** | Built-in algorithm | Same algorithm, exposed as DataFrame |
+| **Customization** | Command-line flags | Programmatic API |
+
+### When to Use Each
+
+**Use `ninja_json_converter.py` when:**
+- You just want a quick visualization
+- You're working from the command line
+- You don't need further analysis
+
+**Use `trace_analysis` when:**
+- You want to analyze build data with pandas
+- You're working in Jupyter notebooks
+- You want to correlate build times with template analysis
+- You need programmatic access to build data
+
+## Examples
+
+### Example 1: Find Slowest Builds
+
+```python
+# Get top 10 slowest builds
+slowest = builds_df.nlargest(10, 'duration_ms')
+print(slowest[['target', 'duration_ms', 'worker_id']])
+```
+
+### Example 2: Analyze Worker Utilization
+
+```python
+worker_stats = NinjaLogParser.compute_worker_stats(builds_df)
+print(worker_stats)
+```
+
+### Example 3: Category Breakdown
+
+```python
+from trace_analysis.perfetto_display import get_trace_summary
+
+summary = get_trace_summary(trace_data)
+print(f"Total events: {summary['event_count']}")
+print(f"Total duration: {summary['total_duration_s']:.2f}s")
+print(f"Workers: {summary['worker_count']}")
+print("\nBy category:")
+for cat, count in summary['categories'].items():
+ print(f" {cat}: {count} events")
+```
+
+### Example 4: Export with Custom Process ID
+
+```python
+# Useful when combining multiple build logs
+trace_data = ChromeTraceExporter.export_ninja_timeline(
+ builds_df,
+ process_id=2, # Use different PID for each log
+ include_metadata=True
+)
+```
+
+## Troubleshooting
+
+### Issue: Trace file too large for embedded display
+
+**Solution**: Use `save_and_link()` instead of `display_perfetto()`:
+
+```python
+save_and_link(trace_data, 'build_trace.json')
+```
+
+### Issue: Worker IDs all show as -1
+
+**Solution**: Make sure to call `assign_workers()`:
+
+```python
+builds_df = NinjaLogParser.assign_workers(builds_df)
+```
+
+### Issue: Import error for perfetto_display
+
+**Solution**: The perfetto display functions are in a separate module:
+
+```python
+from trace_analysis.perfetto_display import display_perfetto
+```
+
+## See Also
+
+- [CHROME_TRACE_EXPORT.md](CHROME_TRACE_EXPORT.md) - Full design document
+- [comprehensive_example.ipynb](../notebooks/comprehensive_example.ipynb) - Complete analysis workflow
+- [ninja_json_converter.py](../../ninja_json_converter.py) - Command-line alternative
diff --git a/script/analyze_build/examples/perfetto_visualization_example.py b/script/analyze_build/examples/perfetto_visualization_example.py
new file mode 100644
index 0000000000..b238c4d7a1
--- /dev/null
+++ b/script/analyze_build/examples/perfetto_visualization_example.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+"""
+Example: Visualizing Build Timeline in Perfetto UI
+
+This example demonstrates how to:
+1. Parse a ninja .ninja_log file
+2. Export to Chrome Trace format
+3. Display in Perfetto UI (for Jupyter notebooks)
+4. Save to file for manual upload
+
+Usage:
+ python perfetto_visualization_example.py path/to/.ninja_log
+"""
+
+import sys
+from pathlib import Path
+
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from trace_analysis import NinjaLogParser, ChromeTraceExporter
+from trace_analysis.perfetto_display import (
+ print_trace_summary,
+)
+
+
+def main():
+    """Turn the ninja log named on the command line into a Chrome Trace file.
+
+    Prints a message and exits with status 1 if the argument is missing or
+    the path does not exist; otherwise parses the log, prints a summary, and
+    exports the trace to ``build_trace.json`` in the log's directory.
+    """
+    cli_args = sys.argv[1:]
+    if not cli_args:
+        print("Usage: python perfetto_visualization_example.py path/to/.ninja_log")
+        sys.exit(1)
+
+    log_file = Path(cli_args[0])
+    if not log_file.exists():
+        print(f"Error: {log_file} not found")
+        sys.exit(1)
+
+    print(f"Parsing {log_file}...")
+
+    # Parse the log and convert the parsed entries with to_dataframe().
+    frame = NinjaLogParser.to_dataframe(NinjaLogParser.parse(log_file))
+    print(f"Found {len(frame):,} build targets")
+
+    # Assign each build to a worker (used for the parallelism visualization).
+    frame = NinjaLogParser.assign_workers(frame)
+    worker_total = frame['worker_id'].max() + 1
+    print(f"Assigned {worker_total} workers")
+
+    # Convert to Chrome Trace format and report what was produced.
+    trace = ChromeTraceExporter.export_ninja_timeline(frame)
+    event_total = len(trace['traceEvents'])
+    print(f"\nGenerated {event_total:,} trace events")
+    print_trace_summary(trace)
+
+    # Save the trace alongside the input log.
+    destination = log_file.parent / "build_trace.json"
+    ChromeTraceExporter.export_to_file(trace, str(destination))
+
+    # Explain how to open the saved file in the Perfetto UI.
+    print(f"\n✓ Trace saved to: {destination}")
+    print("\nTo view in Perfetto UI:")
+    print(" 1. Go to https://ui.perfetto.dev")
+    print(" 2. Click 'Open trace file'")
+    print(f" 3. Select: {destination}")
+    print("\nOr drag and drop the file directly into Perfetto UI")
+
+    # In a Jupyter notebook you would instead call display_perfetto(trace),
+    # or save_and_link(trace, str(destination)) for large traces.
+
+
+if __name__ == "__main__":  # run the example only when executed as a script, not on import
+    main()
diff --git a/script/analyze_build/notebooks/comprehensive_example.ipynb b/script/analyze_build/notebooks/comprehensive_example.ipynb
index 491e373f07..5db3ebe0b3 100644
--- a/script/analyze_build/notebooks/comprehensive_example.ipynb
+++ b/script/analyze_build/notebooks/comprehensive_example.ipynb
@@ -105,10 +105,10 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Found 1,253 trace files\n",
+ "Found 1,279 trace files\n",
"\n",
"Using sample file: device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp.json\n",
- "File size: 11652.5 KB\n"
+ "File size: 10967.8 KB\n"
]
}
],
@@ -155,14 +155,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Parsed 15,110 events in 0.043s\n",
- "Transformed to Pandas tables in 1.115s\n",
+ "Parsed 14,271 events in 0.058s\n",
+ "Transformed to Pandas tables in 0.999s\n",
"\n",
"Pandas DataFrames:\n",
- " templates : 8,703 rows, 8.10 MB | template_id, template_name, full_signature, depth, arg_count\n",
- " instantiations : 9,838 rows, 0.31 MB | instantiation_id, template_id, file_id, dur_us, ts_us, event_type\n",
- " template_args : 51,474 rows, 8.73 MB | parent_template_id, arg_position, arg_template_id, arg_type, arg_text\n",
- " events : 15,110 rows, 0.53 MB | name, dur, ts, pid, tid, ph, ts_absolute_us\n"
+ " templates : 8,156 rows, 7.56 MB | template_id, template_name, full_signature, depth, arg_count\n",
+ " instantiations : 9,267 rows, 0.29 MB | instantiation_id, template_id, file_id, dur_us, ts_us, event_type\n",
+ " template_args : 49,097 rows, 8.23 MB | parent_template_id, arg_position, arg_template_id, arg_type, arg_text\n",
+ " events : 14,271 rows, 0.50 MB | name, dur, ts, pid, tid, ph, ts_absolute_us\n"
]
}
],
@@ -217,10 +217,10 @@
"text": [
"Compilation Unit Summary:\n",
" Trace file: device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp.json\n",
- " Trace file size: 11652.5 KB\n",
- " Start time: 2026-01-03 22:51:51.980489\n",
- " Total compilation time: 178.01s\n",
- " Total events: 15,110\n"
+ " Trace file size: 10967.8 KB\n",
+ " Start time: 2026-01-04 16:14:25.048921\n",
+ " Total compilation time: 162.17s\n",
+ " Total events: 14,271\n"
]
}
],
@@ -257,16 +257,16 @@
"-------------------------------------------------------------------\n",
"template_id int32 0.03 0.4%\n",
"template_name category 0.03 0.4%\n",
- "full_signature object 8.02 99.0%\n",
+ "full_signature object 7.49 99.0%\n",
"depth int8 0.01 0.1%\n",
"arg_count int8 0.01 0.1%\n",
"Index RangeIndex 0.00 0.0%\n",
"-------------------------------------------------------------------\n",
- "TOTAL 8.10 100.0%\n",
+ "TOTAL 7.56 100.0%\n",
"\n",
- "Total templates: 8,703\n",
- "CK templates: 6,331 (72.7%)\n",
- "Other templates: 2,372\n",
+ "Total templates: 8,156\n",
+ "CK templates: 6,063 (74.3%)\n",
+ "Other templates: 2,093\n",
"\n",
"Sample CK templates:\n"
]
@@ -301,26 +301,18 @@
" \n",
"
\n",
" \n",
- " | 45 | \n",
- " 45 | \n",
- " ck::Tuple | \n",
- " ck::Tuple<ck::integral_constant<int, 2>, _BitInt(4)> | \n",
- " 2 | \n",
- " 2 | \n",
- "
\n",
- " \n",
" | 46 | \n",
" 46 | \n",
- " ck::Tuple | \n",
- " ck::Tuple<ck::integral_constant<int, 16>, _BitInt(6)> | \n",
- " 2 | \n",
+ " ck::vector_type | \n",
+ " ck::vector_type<float, 2>::(unnamed union at /home/AMD/jshumway/composable_k... | \n",
+ " 1 | \n",
" 2 | \n",
"
\n",
" \n",
" | 47 | \n",
" 47 | \n",
- " ck::Tuple | \n",
- " ck::Tuple<float, float> | \n",
+ " ck::vector_type | \n",
+ " ck::vector_type<float, 2> | \n",
" 1 | \n",
" 2 | \n",
"
\n",
@@ -328,7 +320,7 @@
" 48 | \n",
" 48 | \n",
" ck::vector_type | \n",
- " ck::vector_type<float, 2>::(unnamed union at /home/AMD/jshumway/composable_k... | \n",
+ " ck::vector_type<float, 4>::(unnamed union at /home/AMD/jshumway/composable_k... | \n",
" 1 | \n",
" 2 | \n",
" \n",
@@ -336,90 +328,98 @@
" 49 | \n",
" 49 | \n",
" ck::vector_type | \n",
- " ck::vector_type<float, 2> | \n",
+ " ck::vector_type<float, 4> | \n",
" 1 | \n",
" 2 | \n",
" \n",
" \n",
" | 50 | \n",
" 50 | \n",
- " ck::Tuple | \n",
- " ck::Tuple<float, float, float, float> | \n",
- " 1 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 51 | \n",
- " 51 | \n",
- " ck::vector_type | \n",
- " ck::vector_type<float, 4>::(unnamed union at /home/AMD/jshumway/composable_k... | \n",
- " 1 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 52 | \n",
- " 52 | \n",
- " ck::vector_type | \n",
- " ck::vector_type<float, 4> | \n",
- " 1 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 53 | \n",
- " 53 | \n",
- " ck::Tuple | \n",
- " ck::Tuple<float, float, float, float, float, float, float, float> | \n",
- " 1 | \n",
- " 8 | \n",
- "
\n",
- " \n",
- " | 54 | \n",
- " 54 | \n",
" ck::detail::TupleElementKeyData | \n",
" ck::detail::TupleElementKeyData<ck::detail::TupleElementKey<3>, float __attr... | \n",
" 2 | \n",
" 2 | \n",
"
\n",
+ " \n",
+ " | 51 | \n",
+ " 51 | \n",
+ " ck::detail::TupleImpl | \n",
+ " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3>, float __attribute__((ext_vec... | \n",
+ " 2 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 52 | \n",
+ " 52 | \n",
+ " ck::Tuple | \n",
+ " ck::Tuple<float __attribute__((ext_vector_type(2))), float __attribute__((ex... | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 53 | \n",
+ " 53 | \n",
+ " ck::vector_type | \n",
+ " ck::vector_type<float, 8>::(unnamed union at /home/AMD/jshumway/composable_k... | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 54 | \n",
+ " 54 | \n",
+ " ck::vector_type | \n",
+ " ck::vector_type<float, 8> | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 55 | \n",
+ " 55 | \n",
+ " ck::Tuple | \n",
+ " ck::Tuple<float, float, float, float, float, float, float, float, float, flo... | \n",
+ " 1 | \n",
+ " 16 | \n",
+ "
\n",
" \n",
"\n",
""
],
"text/plain": [
" template_id template_name \\\n",
- "45 45 ck::Tuple \n",
- "46 46 ck::Tuple \n",
- "47 47 ck::Tuple \n",
+ "46 46 ck::vector_type \n",
+ "47 47 ck::vector_type \n",
"48 48 ck::vector_type \n",
"49 49 ck::vector_type \n",
- "50 50 ck::Tuple \n",
- "51 51 ck::vector_type \n",
- "52 52 ck::vector_type \n",
- "53 53 ck::Tuple \n",
- "54 54 ck::detail::TupleElementKeyData \n",
+ "50 50 ck::detail::TupleElementKeyData \n",
+ "51 51 ck::detail::TupleImpl \n",
+ "52 52 ck::Tuple \n",
+ "53 53 ck::vector_type \n",
+ "54 54 ck::vector_type \n",
+ "55 55 ck::Tuple \n",
"\n",
" full_signature \\\n",
- "45 ck::Tuple, _BitInt(4)> \n",
- "46 ck::Tuple, _BitInt(6)> \n",
- "47 ck::Tuple \n",
- "48 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n",
- "49 ck::vector_type \n",
- "50 ck::Tuple \n",
- "51 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n",
- "52 ck::vector_type \n",
- "53 ck::Tuple \n",
- "54 ck::detail::TupleElementKeyData, float __attr... \n",
+ "46 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n",
+ "47 ck::vector_type \n",
+ "48 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n",
+ "49 ck::vector_type \n",
+ "50 ck::detail::TupleElementKeyData, float __attr... \n",
+ "51 ck::detail::TupleImpl, float __attribute__((ext_vec... \n",
+ "52 ck::Tuple::(unnamed union at /home/AMD/jshumway/composable_k... \n",
+ "54 ck::vector_type \n",
+ "55 ck::Tuple\n",
" \n",
" \n",
- " | 670 | \n",
+ " 620 | \n",
" ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
" 12 | \n",
- " 444.005 | \n",
- " 37.000417 | \n",
+ " 403.014 | \n",
+ " 33.584500 | \n",
" 1 | \n",
"
\n",
" \n",
- " | 669 | \n",
+ " 619 | \n",
" ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... | \n",
" 12 | \n",
- " 442.665 | \n",
- " 36.888750 | \n",
+ " 401.766 | \n",
+ " 33.480500 | \n",
" 1 | \n",
"
\n",
" \n",
- " | 1411 | \n",
+ " 1305 | \n",
" ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10>::operator()<(lamb... | \n",
" 34 | \n",
- " 365.628 | \n",
- " 10.753765 | \n",
+ " 330.576 | \n",
+ " 9.722824 | \n",
" 1 | \n",
"
\n",
" \n",
- " | 6424 | \n",
+ " 6125 | \n",
" ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12>::operator... | \n",
" 2 | \n",
- " 346.742 | \n",
- " 173.371000 | \n",
+ " 318.451 | \n",
+ " 159.225500 | \n",
" 1 | \n",
"
\n",
" \n",
- " | 4079 | \n",
+ " 3881 | \n",
" ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
" 4 | \n",
- " 330.928 | \n",
- " 82.732000 | \n",
+ " 303.042 | \n",
+ " 75.760500 | \n",
" 1 | \n",
"
\n",
" \n",
- " | 4078 | \n",
+ " 3880 | \n",
" ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... | \n",
" 4 | \n",
- " 330.380 | \n",
- " 82.595000 | \n",
+ " 302.517 | \n",
+ " 75.629250 | \n",
" 1 | \n",
"
\n",
" \n",
- " | 4859 | \n",
+ " 4631 | \n",
" ck::transform_tensor_descriptor<ck::TensorDescriptor<ck::Tuple<ck::UnMerge<c... | \n",
" 1 | \n",
- " 327.756 | \n",
- " 327.756000 | \n",
+ " 301.674 | \n",
+ " 301.674000 | \n",
" 6 | \n",
"
\n",
" \n",
- " | 1470 | \n",
- " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
- " 6 | \n",
- " 311.490 | \n",
- " 51.915000 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 1469 | \n",
- " ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... | \n",
- " 6 | \n",
- " 310.729 | \n",
- " 51.788167 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 2362 | \n",
+ " 2217 | \n",
" ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
" 5 | \n",
- " 295.890 | \n",
- " 59.178000 | \n",
+ " 290.939 | \n",
+ " 58.187800 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2216 | \n",
+ " ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... | \n",
+ " 5 | \n",
+ " 290.336 | \n",
+ " 58.067200 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1364 | \n",
+ " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
+ " 6 | \n",
+ " 284.086 | \n",
+ " 47.347667 | \n",
" 1 | \n",
"
\n",
" \n",
@@ -609,28 +609,28 @@
],
"text/plain": [
" signature \\\n",
- "670 ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... \n",
- "669 ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... \n",
- "1411 ck::detail::applier::operator()<(lamb... \n",
- "6424 ck::detail::applier::operator... \n",
- "4079 ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... \n",
- "4078 ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... \n",
- "4859 ck::transform_tensor_descriptor::operator()<(lamb... \n",
+ "6125 ck::detail::applier::operator... \n",
+ "3881 ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... \n",
+ "3880 ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... \n",
+ "4631 ck::transform_tensor_descriptor\n",
" 0 | \n",
" 1 | \n",
- " 1628 | \n",
- " 8510.435 | \n",
- " 5.227540 | \n",
+ " 1564 | \n",
+ " 7735.584 | \n",
+ " 4.946026 | \n",
" \n",
" \n",
" | 1 | \n",
" 2 | \n",
- " 1962 | \n",
- " 5433.074 | \n",
- " 2.769151 | \n",
+ " 1873 | \n",
+ " 4943.003 | \n",
+ " 2.639083 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
- " 1311 | \n",
- " 5042.420 | \n",
- " 3.846240 | \n",
+ " 1256 | \n",
+ " 4686.911 | \n",
+ " 3.731617 | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
- " 2548 | \n",
- " 7730.989 | \n",
- " 3.034140 | \n",
+ " 2336 | \n",
+ " 6986.861 | \n",
+ " 2.990951 | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
- " 2112 | \n",
- " 10514.270 | \n",
- " 4.978348 | \n",
+ " 1969 | \n",
+ " 9640.982 | \n",
+ " 4.896385 | \n",
"
\n",
" \n",
" | 5 | \n",
" 6 | \n",
- " 257 | \n",
- " 2398.921 | \n",
- " 9.334323 | \n",
+ " 252 | \n",
+ " 2251.077 | \n",
+ " 8.932845 | \n",
"
\n",
" \n",
" | 6 | \n",
" 7 | \n",
- " 20 | \n",
- " 21.944 | \n",
- " 1.097200 | \n",
+ " 17 | \n",
+ " 18.260 | \n",
+ " 1.074118 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " depth count total_ms mean_ms\n",
- "0 1 1628 8510.435 5.227540\n",
- "1 2 1962 5433.074 2.769151\n",
- "2 3 1311 5042.420 3.846240\n",
- "3 4 2548 7730.989 3.034140\n",
- "4 5 2112 10514.270 4.978348\n",
- "5 6 257 2398.921 9.334323\n",
- "6 7 20 21.944 1.097200"
+ " depth count total_ms mean_ms\n",
+ "0 1 1564 7735.584 4.946026\n",
+ "1 2 1873 4943.003 2.639083\n",
+ "2 3 1256 4686.911 3.731617\n",
+ "3 4 2336 6986.861 2.990951\n",
+ "4 5 1969 9640.982 4.896385\n",
+ "5 6 252 2251.077 8.932845\n",
+ "6 7 17 18.260 1.074118"
]
},
"metadata": {},
@@ -882,8 +882,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Found 1,253 trace files\n",
- "Total size: 18.79 GB\n"
+ "Found 1,279 trace files\n",
+ "Total size: 17.20 GB\n"
]
}
],
@@ -919,7 +919,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing 1,253 files with 384 workers...\n",
+ "Processing 1,279 files with 384 workers...\n",
"\n"
]
},
@@ -929,7 +929,7 @@
"text": [
"/home/AMD/jshumway/composable_kernel/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
- "Processing: 100%|██████████| 1253/1253 [00:46<00:00, 27.13files/s] \n"
+ "Processing: 100%|██████████| 1279/1279 [00:45<00:00, 28.08files/s] \n"
]
},
{
@@ -937,19 +937,19 @@
"output_type": "stream",
"text": [
"\n",
- "Parsing complete in 48.54s (25.8 files/sec)\n",
+ "Parsing complete in 54.47s (23.5 files/sec)\n",
"\n",
"Combining results...\n",
- "Combined in 4.73s\n",
+ "Combined in 3.85s\n",
"\n",
- "Total analysis time: 53.27s\n",
+ "Total analysis time: 58.32s\n",
"\n",
"Combined Tables:\n",
- " Templates: 11,751,350 rows\n",
- " Instantiations: 17,437,000 rows\n",
- " Template Args: 155,281,743 rows\n",
- " Events: 24,229,252 rows\n",
- " Total memory: 34.06 GB\n"
+ " Templates: 10,655,045 rows\n",
+ " Instantiations: 15,806,599 rows\n",
+ " Template Args: 144,341,805 rows\n",
+ " Events: 22,196,001 rows\n",
+ " Total memory: 31.69 GB\n"
]
}
],
@@ -1067,12 +1067,12 @@
"================================================================================\n",
"BUILD-WIDE STATISTICS\n",
"================================================================================\n",
- "Files processed: 1,253\n",
- "Total events: 24,229,252\n",
- "Total build time: 5915.00 minutes\n",
- "Unique templates: 11,751,350\n",
- "Template instantiations: 17,437,000\n",
- "Template time: 2495.47 minutes (42.2%)\n",
+ "Files processed: 1,279\n",
+ "Total events: 22,196,001\n",
+ "Total build time: 6935.67 minutes\n",
+ "Unique templates: 10,655,045\n",
+ "Template instantiations: 15,806,599\n",
+ "Template time: 2007.82 minutes (28.9%)\n",
"================================================================================\n"
]
}
@@ -1117,7 +1117,7 @@
"output_type": "stream",
"text": [
"Aggregating template statistics...\n",
- "Completed in 3.42s\n",
+ "Completed in 3.10s\n",
"\n",
"Top 20 Templates by Total Time:\n"
]
@@ -1154,275 +1154,253 @@
" \n",
" \n",
" \n",
- " | 11751348 | \n",
- " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
- " 2 | \n",
- " 135.700625 | \n",
- " 67850.312500 | \n",
- " 67850.3125 | \n",
- " 2 | \n",
- " 0.090631 | \n",
- "
\n",
- " \n",
- " | 9135552 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 6 | \n",
- " 0.088275 | \n",
- "
\n",
- " \n",
- " | 9135553 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 7 | \n",
- " 0.088275 | \n",
- "
\n",
- " \n",
- " | 9135554 | \n",
- " std::_TupleConstraints<true, ck::tensor_operation::device::DeviceConvNdBwdDa... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 4 | \n",
- " 0.088275 | \n",
- "
\n",
- " \n",
- " | 9135555 | \n",
- " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 3 | \n",
- " 0.088275 | \n",
- "
\n",
- " \n",
- " | 9135556 | \n",
- " ck::detail::static_ford_impl<ck::Sequence<16>, ck::Sequence<0, 1>>::operator... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 2 | \n",
- " 0.088275 | \n",
- "
\n",
- " \n",
- " | 9135557 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<1, 16>, ck::tensor... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 6 | \n",
- " 0.088275 | \n",
- "
\n",
- " \n",
- " | 9135558 | \n",
- " std::get<1UL, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 4 | \n",
- " 0.088275 | \n",
- "
\n",
- " \n",
- " | 9135559 | \n",
- " std::_Tuple_impl<0, ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Inp... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 4 | \n",
- " 0.088275 | \n",
- "
\n",
- " \n",
- " | 9135560 | \n",
+ " 7340718 | \n",
" std::__uniq_ptr_impl<ck::tensor_operation::device::DeviceConv2dBwdDataXdl_In... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 4 | \n",
- " 0.088275 | \n",
+ " 0.08737 | \n",
"
\n",
" \n",
- " | 9135561 | \n",
- " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 7340719 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 6 | \n",
- " 0.088275 | \n",
+ " 0.08737 | \n",
"
\n",
" \n",
- " | 9135562 | \n",
- " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 6 | \n",
- " 0.088275 | \n",
- "
\n",
- " \n",
- " | 9135563 | \n",
- " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 7340720 | \n",
+ " std::_TupleConstraints<true, ck::tensor_operation::device::DeviceConv2dBwdDa... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 4 | \n",
- " 0.088275 | \n",
+ " 0.08737 | \n",
"
\n",
" \n",
- " | 9135564 | \n",
- " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 5 | \n",
- " 0.088275 | \n",
+ " 7340721 | \n",
+ " ck::kernel_batched_elementwise<ck::GridwiseElementwise<ck::Tuple<ck::TensorD... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 8 | \n",
+ " 0.08737 | \n",
"
\n",
" \n",
- " | 9135565 | \n",
+ " 7340722 | \n",
" std::tuple<ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Input_N_Hi_W... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 4 | \n",
- " 0.088275 | \n",
+ " 0.08737 | \n",
"
\n",
" \n",
- " | 9135566 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 4>, ck::tensor... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 7 | \n",
- " 0.088275 | \n",
+ " 7340723 | \n",
+ " std::__uniq_ptr_data<ck::tensor_operation::device::DeviceConv2dBwdDataXdl_In... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 4 | \n",
+ " 0.08737 | \n",
"
\n",
" \n",
- " | 9135567 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 4>, ck::tensor_... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 7 | \n",
- " 0.088275 | \n",
+ " 7340724 | \n",
+ " std::__and_<std::__and_<std::is_constructible<ck::tensor_operation::device::... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 6 | \n",
+ " 0.08737 | \n",
"
\n",
" \n",
- " | 9135568 | \n",
- " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 64, 1, t... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 7340725 | \n",
+ " ck::GridwiseGemm_xdl_cshuffle_conv_v3<ck::tensor_layout::gemm::RowMajor, ck:... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 6 | \n",
+ " 0.08737 | \n",
+ "
\n",
+ " \n",
+ " | 7340726 | \n",
+ " ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 1 | \n",
- " 0.088275 | \n",
+ " 0.08737 | \n",
"
\n",
" \n",
- " | 9135569 | \n",
- " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 7340727 | \n",
+ " ck::make_tensor_coordinate(const ck::TensorDescriptor<ck::Tuple<ck::Embed<ck... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 5 | \n",
- " 0.088275 | \n",
+ " 0.08737 | \n",
"
\n",
" \n",
- " | 9135570 | \n",
- " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 3 | \n",
- " 0.088275 | \n",
+ " 7340728 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.08737 | \n",
+ "
\n",
+ " \n",
+ " | 7340729 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::UnMerge<ck::Tuple<ck::integral_constant<i... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.08737 | \n",
+ "
\n",
+ " \n",
+ " | 7340730 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.08737 | \n",
+ "
\n",
+ " \n",
+ " | 7340731 | \n",
+ " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, _Float16, 8,... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 6 | \n",
+ " 0.08737 | \n",
+ "
\n",
+ " \n",
+ " | 7340732 | \n",
+ " ck::make_tensor_coordinate_step(const ck::TensorDescriptor<ck::Tuple<ck::Emb... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.08737 | \n",
+ "
\n",
+ " \n",
+ " | 7340733 | \n",
+ " ck::make_tensor_coordinate<ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tupl... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 6 | \n",
+ " 0.08737 | \n",
+ "
\n",
+ " \n",
+ " | 7340734 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.08737 | \n",
+ "
\n",
+ " \n",
+ " | 7340735 | \n",
+ " std::__and_<std::is_assignable<ck::tensor_operation::device::DeviceGroupedCo... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 4 | \n",
+ " 0.08737 | \n",
+ "
\n",
+ " \n",
+ " | 7340736 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::UnMerge<ck::Tuple<ck::integral_constant<i... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.08737 | \n",
+ "
\n",
+ " \n",
+ " | 7340737 | \n",
+ " std::__format::_Formatting_scanner<std::__format::_Sink_iter<char>, char>::_... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 2 | \n",
+ " 0.08737 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " full_signature \\\n",
- "11751348 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n",
- "9135552 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n",
- "9135553 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
- "9135554 std::_TupleConstraints, ck::Sequence<0, 1>>::operator... \n",
- "9135557 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
- "9135558 std::get<1UL, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_... \n",
- "9135559 std::_Tuple_impl<0, ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Inp... \n",
- "9135560 std::__uniq_ptr_impl,... \n",
- "9135565 std::tuple, ck::tensor... \n",
- "9135567 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n",
- "9135568 ck::StaticBufferTupleOfVector, ck::tensor_... \n",
+ "7340720 std::_TupleConstraints,... \n",
+ "7340729 ck::TensorDescriptor,... \n",
+ "7340731 ck::StaticTensorTupleOfVectorBuffer,... \n",
+ "7340735 std::__and_, char>::_... \n",
"\n",
- " count total_s mean_ms median_ms depth \\\n",
- "11751348 2 135.700625 67850.312500 67850.3125 2 \n",
- "9135552 846 132.173182 156.233076 1.0550 6 \n",
- "9135553 846 132.173182 156.233076 1.0550 7 \n",
- "9135554 846 132.173182 156.233076 1.0550 4 \n",
- "9135555 846 132.173182 156.233076 1.0550 3 \n",
- "9135556 846 132.173182 156.233076 1.0550 2 \n",
- "9135557 846 132.173182 156.233076 1.0550 6 \n",
- "9135558 846 132.173182 156.233076 1.0550 4 \n",
- "9135559 846 132.173182 156.233076 1.0550 4 \n",
- "9135560 846 132.173182 156.233076 1.0550 4 \n",
- "9135561 846 132.173182 156.233076 1.0550 6 \n",
- "9135562 846 132.173182 156.233076 1.0550 6 \n",
- "9135563 846 132.173182 156.233076 1.0550 4 \n",
- "9135564 846 132.173182 156.233076 1.0550 5 \n",
- "9135565 846 132.173182 156.233076 1.0550 4 \n",
- "9135566 846 132.173182 156.233076 1.0550 7 \n",
- "9135567 846 132.173182 156.233076 1.0550 7 \n",
- "9135568 846 132.173182 156.233076 1.0550 1 \n",
- "9135569 846 132.173182 156.233076 1.0550 5 \n",
- "9135570 846 132.173182 156.233076 1.0550 3 \n",
- "\n",
- " pct_template_time \n",
- "11751348 0.090631 \n",
- "9135552 0.088275 \n",
- "9135553 0.088275 \n",
- "9135554 0.088275 \n",
- "9135555 0.088275 \n",
- "9135556 0.088275 \n",
- "9135557 0.088275 \n",
- "9135558 0.088275 \n",
- "9135559 0.088275 \n",
- "9135560 0.088275 \n",
- "9135561 0.088275 \n",
- "9135562 0.088275 \n",
- "9135563 0.088275 \n",
- "9135564 0.088275 \n",
- "9135565 0.088275 \n",
- "9135566 0.088275 \n",
- "9135567 0.088275 \n",
- "9135568 0.088275 \n",
- "9135569 0.088275 \n",
- "9135570 0.088275 "
+ " count total_s mean_ms median_ms depth pct_template_time \n",
+ "7340718 984 105.25365 106.965091 2.2285 4 0.08737 \n",
+ "7340719 984 105.25365 106.965091 2.2285 6 0.08737 \n",
+ "7340720 984 105.25365 106.965091 2.2285 4 0.08737 \n",
+ "7340721 984 105.25365 106.965091 2.2285 8 0.08737 \n",
+ "7340722 984 105.25365 106.965091 2.2285 4 0.08737 \n",
+ "7340723 984 105.25365 106.965091 2.2285 4 0.08737 \n",
+ "7340724 984 105.25365 106.965091 2.2285 6 0.08737 \n",
+ "7340725 984 105.25365 106.965091 2.2285 6 0.08737 \n",
+ "7340726 984 105.25365 106.965091 2.2285 1 0.08737 \n",
+ "7340727 984 105.25365 106.965091 2.2285 5 0.08737 \n",
+ "7340728 984 105.25365 106.965091 2.2285 5 0.08737 \n",
+ "7340729 984 105.25365 106.965091 2.2285 5 0.08737 \n",
+ "7340730 984 105.25365 106.965091 2.2285 5 0.08737 \n",
+ "7340731 984 105.25365 106.965091 2.2285 6 0.08737 \n",
+ "7340732 984 105.25365 106.965091 2.2285 5 0.08737 \n",
+ "7340733 984 105.25365 106.965091 2.2285 6 0.08737 \n",
+ "7340734 984 105.25365 106.965091 2.2285 5 0.08737 \n",
+ "7340735 984 105.25365 106.965091 2.2285 4 0.08737 \n",
+ "7340736 984 105.25365 106.965091 2.2285 5 0.08737 \n",
+ "7340737 984 105.25365 106.965091 2.2285 2 0.08737 "
]
},
"metadata": {},
@@ -1504,9 +1482,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Filtered to 9,433,574 CK templates (from 11,751,350 total)\n",
- "CK template time: 66204864.83s\n",
- "Percentage of total template time: 44216.7%\n",
+ "Filtered to 8,644,580 CK templates (from 10,655,045 total)\n",
+ "CK template time: 54849416.17s\n",
+ "Percentage of total template time: 45529.9%\n",
"\n",
"Top 20 CK Templates by Total Time:\n"
]
@@ -1543,253 +1521,253 @@
" \n",
" \n",
" \n",
- " | 11751348 | \n",
- " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
- " 2 | \n",
- " 135.700625 | \n",
- " 67850.312500 | \n",
- " 67850.3125 | \n",
- " 2 | \n",
- " 0.000205 | \n",
- "
\n",
- " \n",
- " | 9135552 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 7340719 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 6 | \n",
- " 0.000200 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
- " | 9135553 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 7 | \n",
- " 0.000200 | \n",
+ " 7340721 | \n",
+ " ck::kernel_batched_elementwise<ck::GridwiseElementwise<ck::Tuple<ck::TensorD... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 8 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
- " | 9135555 | \n",
- " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 3 | \n",
- " 0.000200 | \n",
- "
\n",
- " \n",
- " | 9135556 | \n",
- " ck::detail::static_ford_impl<ck::Sequence<16>, ck::Sequence<0, 1>>::operator... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 2 | \n",
- " 0.000200 | \n",
- "
\n",
- " \n",
- " | 9135557 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<1, 16>, ck::tensor... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 7340725 | \n",
+ " ck::GridwiseGemm_xdl_cshuffle_conv_v3<ck::tensor_layout::gemm::RowMajor, ck:... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 6 | \n",
- " 0.000200 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
- " | 9135561 | \n",
- " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 6 | \n",
- " 0.000200 | \n",
+ " 7340726 | \n",
+ " ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 1 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
- " | 9135562 | \n",
- " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 6 | \n",
- " 0.000200 | \n",
+ " 7340727 | \n",
+ " ck::make_tensor_coordinate(const ck::TensorDescriptor<ck::Tuple<ck::Embed<ck... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
- " | 9135564 | \n",
+ " 7340728 | \n",
" ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 5 | \n",
- " 0.000200 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
- " | 9135566 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 4>, ck::tensor... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 7 | \n",
- " 0.000200 | \n",
+ " 7340729 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::UnMerge<ck::Tuple<ck::integral_constant<i... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
- " | 9135567 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 4>, ck::tensor_... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 7 | \n",
- " 0.000200 | \n",
+ " 7340730 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
- " | 9135568 | \n",
- " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 64, 1, t... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 1 | \n",
- " 0.000200 | \n",
+ " 7340731 | \n",
+ " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, _Float16, 8,... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 6 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
- " | 9135569 | \n",
+ " 7340732 | \n",
+ " ck::make_tensor_coordinate_step(const ck::TensorDescriptor<ck::Tuple<ck::Emb... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.000192 | \n",
+ "
\n",
+ " \n",
+ " | 7340733 | \n",
+ " ck::make_tensor_coordinate<ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tupl... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 6 | \n",
+ " 0.000192 | \n",
+ "
\n",
+ " \n",
+ " | 7340734 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.000192 | \n",
+ "
\n",
+ " \n",
+ " | 7340736 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::UnMerge<ck::Tuple<ck::integral_constant<i... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 5 | \n",
+ " 0.000192 | \n",
+ "
\n",
+ " \n",
+ " | 7340739 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 6 | \n",
+ " 0.000192 | \n",
+ "
\n",
+ " \n",
+ " | 7340740 | \n",
+ " ck::BlockwiseGemmXdlops_pipeline_base<64, unsigned short, unsigned short, un... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 6 | \n",
+ " 0.000192 | \n",
+ "
\n",
+ " \n",
+ " | 7340741 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 6 | \n",
+ " 0.000192 | \n",
+ "
\n",
+ " \n",
+ " | 7340744 | \n",
+ " ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<2, ck:... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
+ " 2 | \n",
+ " 0.000192 | \n",
+ "
\n",
+ " \n",
+ " | 7340745 | \n",
" ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 5 | \n",
- " 0.000200 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
- " | 9135571 | \n",
- " ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 2 | \n",
- " 0.000200 | \n",
- "
\n",
- " \n",
- " | 9135577 | \n",
- " ck::make_tensor_coordinate(const ck::TensorDescriptor<ck::Tuple<ck::UnMerge<... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 7340746 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 5 | \n",
- " 0.000200 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
- " | 9135579 | \n",
- " ck::Tuple<ck::vector_type<float, 1>, ck::vector_type<float, 1>, ck::vector_t... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 2 | \n",
- " 0.000200 | \n",
- "
\n",
- " \n",
- " | 9135582 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 16>, ck::tenso... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
+ " 7340747 | \n",
+ " ck::sequence_gen<2, ck::lambda_get_up_dim_num<ck::Tuple<ck::UnMerge<ck::Tupl... | \n",
+ " 984 | \n",
+ " 105.25365 | \n",
+ " 106.965091 | \n",
+ " 2.2285 | \n",
" 6 | \n",
- " 0.000200 | \n",
- "
\n",
- " \n",
- " | 9135583 | \n",
- " ck::Sequence<5, 6, 7, 8, 9, 10>::ReorderGivenNew2Old<0, 1, 2, 3, 4, 5> | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 1 | \n",
- " 0.000200 | \n",
- "
\n",
- " \n",
- " | 9135584 | \n",
- " ck::to_multi_index<ck::Sequence<4, 12>> | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 2 | \n",
- " 0.000200 | \n",
- "
\n",
- " \n",
- " | 9135585 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 16>, ck::tenso... | \n",
- " 846 | \n",
- " 132.173182 | \n",
- " 156.233076 | \n",
- " 1.0550 | \n",
- " 6 | \n",
- " 0.000200 | \n",
+ " 0.000192 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " full_signature \\\n",
- "11751348 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n",
- "9135552 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n",
- "9135553 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
- "9135555 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n",
- "9135556 ck::detail::static_ford_impl, ck::Sequence<0, 1>>::operator... \n",
- "9135557 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
- "9135561 ck::StaticTensorTupleOfVectorBuffer,... \n",
- "9135566 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
- "9135567 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n",
- "9135568 ck::StaticBufferTupleOfVector, ck::vector_type, ck::vector_t... \n",
- "9135582 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tenso... \n",
- "9135583 ck::Sequence<5, 6, 7, 8, 9, 10>::ReorderGivenNew2Old<0, 1, 2, 3, 4, 5> \n",
- "9135584 ck::to_multi_index> \n",
- "9135585 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tenso... \n",
+ " full_signature \\\n",
+ "7340719 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n",
+ "7340721 ck::kernel_batched_elementwise,... \n",
+ "7340729 ck::TensorDescriptor,... \n",
+ "7340731 ck::StaticTensorTupleOfVectorBuffer,... \n",
+ "7340736 ck::TensorDescriptor, ck::tensor_... \n",
+ "7340740 ck::BlockwiseGemmXdlops_pipeline_base<64, unsigned short, unsigned short, un... \n",
+ "7340741 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n",
+ "7340744 ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<2, ck:... \n",
+ "7340745 ck::TensorDescriptor,... \n",
+ "7340747 ck::sequence_gen<2, ck::lambda_get_up_dim_num\n",
" \n",
" \n",
- " | 7243825 | \n",
- " ck::Tuple<ck::PassThrough<int>, ck::UnMerge<ck::Tuple<int, int>, false>, ck:... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 7243826 | \n",
- " std::__is_implicitly_default_constructible<ck::tensor_operation::device::Dev... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 7243827 | \n",
- " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 4, 1, tr... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
+ " 6304619 | \n",
+ " ck_tile::impl::run_cast_from_f8<_BitInt(8), float, true> | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
" 1 | \n",
"
\n",
" \n",
- " | 7243828 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
+ " 6304620 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
" 6 | \n",
"
\n",
" \n",
- " | 7243829 | \n",
- " ck::utility::launch_and_time_kernel_with_preprocess<false, ck::GridwiseGemmM... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 5 | \n",
+ " 6304621 | \n",
+ " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7>, ck::Sequence<19>... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 2 | \n",
"
\n",
" \n",
- " | 7243830 | \n",
- " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 4 | \n",
+ " 6304622 | \n",
+ " ck::operator<<(std::ostream &, const TensorDescriptor<Tuple<Embed<Tuple<int,... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 7 | \n",
"
\n",
" \n",
- " | 7243831 | \n",
- " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 7243832 | \n",
+ " 6304623 | \n",
" ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
" 5 | \n",
"
\n",
" \n",
- " | 7243833 | \n",
- " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 7243834 | \n",
- " std::__uniq_ptr_data<ck::tensor_operation::device::DeviceGroupedConvFwdMulti... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
+ " 6304624 | \n",
+ " ck::sequence_gen<8, ck::lambda_get_up_dim_num<ck::Tuple<ck::PassThrough<int>... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
" 4 | \n",
"
\n",
" \n",
- " | 7243835 | \n",
- " std::allocator_traits<std::allocator<std::_Rb_tree_node<std::pair<const ck::... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " | 7243836 | \n",
+ " 6304625 | \n",
" std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
" 3 | \n",
"
\n",
" \n",
- " | 7243837 | \n",
- " std::__and_<std::__is_implicitly_default_constructible<ck::tensor_operation:... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " | 7243838 | \n",
- " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 7243839 | \n",
- " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " | 7243840 | \n",
- " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 7243841 | \n",
- " std::pair<const ck::BlockGemmPipelineScheduler, std::basic_string<char>>::pa... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
+ " 6304626 | \n",
+ " ck::container_concat<ck::Sequence<0>, ck::Sequence<2, 1>, ck::Sequence<3>, c... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
" 2 | \n",
"
\n",
" \n",
- " | 7243842 | \n",
- " ck::make_tuple<ck::integral_constant<int, 1>, ck::integral_constant<int, 4>,... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 7243843 | \n",
- " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, unsigned short, 64... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
+ " 6304627 | \n",
+ " ck::Sequence<19, 20, 21, 22, 23>::ReorderGivenNew2Old<0, 1, 2, 3, 4> | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
" 1 | \n",
"
\n",
" \n",
- " | 7243844 | \n",
- " std::is_nothrow_constructible<std::unique_ptr<ck::tensor_operation::device::... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 4 | \n",
+ " 6304628 | \n",
+ " ck::make_tuple<ck::Sequence<0>, ck::Sequence<>, ck::Sequence<1>, ck::Sequenc... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 6304629 | \n",
+ " std::__and_<std::is_convertible<ck::tensor_operation::device::DeviceGroupedC... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6304630 | \n",
+ " ck::detail::unpack_impl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 6304631 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6304632 | \n",
+ " ck::sequence_gen<3, ck::lambda_get_up_dim_num<ck::Tuple<ck::PassThrough<ck::... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 6304633 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6304634 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6304635 | \n",
+ " ck::sequence_gen<2, ck::lambda_get_up_dim_num<ck::Tuple<ck::Xor<ck::Tuple<ck... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 6304636 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6304637 | \n",
+ " ck::make_tensor_coordinate_step<ck::TensorDescriptor<ck::Tuple<ck::Embed<ck:... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6304638 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 5 | \n",
"
\n",
" \n",
"\n",
@@ -2048,48 +2026,48 @@
],
"text/plain": [
" full_signature \\\n",
- "7243825 ck::Tuple, ck::UnMerge, false>, ck:... \n",
- "7243826 std::__is_implicitly_default_constructible, ck::tensor_... \n",
- "7243829 ck::utility::launch_and_time_kernel_with_preprocess>::pa... \n",
- "7243842 ck::make_tuple, ck::integral_constant,... \n",
- "7243843 ck::StaticBufferTupleOfVector \n",
+ "6304620 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
+ "6304621 ck::detail::TupleImpl, ck::Sequence<19>... \n",
+ "6304622 ck::operator<<(std::ostream &, const TensorDescriptor... \n",
+ "6304625 std::unique_ptr, ck::Sequence<2, 1>, ck::Sequence<3>, c... \n",
+ "6304627 ck::Sequence<19, 20, 21, 22, 23>::ReorderGivenNew2Old<0, 1, 2, 3, 4> \n",
+ "6304628 ck::make_tuple, ck::Sequence<>, ck::Sequence<1>, ck::Sequenc... \n",
+ "6304629 std::__and_\n",
" \n",
" \n",
- " | 7243825 | \n",
- " ck::Tuple<ck::PassThrough<int>, ck::UnMerge<ck::Tuple<int, int>, false>, ck:... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
+ " 6304619 | \n",
+ " ck_tile::impl::run_cast_from_f8<_BitInt(8), float, true> | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
"
\n",
" \n",
- " | 7243826 | \n",
- " std::__is_implicitly_default_constructible<ck::tensor_operation::device::Dev... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
+ " 6304620 | \n",
+ " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
"
\n",
" \n",
- " | 7243827 | \n",
- " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 4, 1, tr... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
+ " 6304621 | \n",
+ " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7>, ck::Sequence<19>... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
"
\n",
" \n",
- " | 7243828 | \n",
- " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
+ " 6304622 | \n",
+ " ck::operator<<(std::ostream &, const TensorDescriptor<Tuple<Embed<Tuple<int,... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
"
\n",
" \n",
- " | 7243829 | \n",
- " ck::utility::launch_and_time_kernel_with_preprocess<false, ck::GridwiseGemmM... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
- "
\n",
- " \n",
- " | 7243830 | \n",
- " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
- "
\n",
- " \n",
- " | 7243831 | \n",
- " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
- "
\n",
- " \n",
- " | 7243832 | \n",
+ " 6304623 | \n",
" ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
"
\n",
" \n",
- " | 7243833 | \n",
- " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
+ " 6304624 | \n",
+ " ck::sequence_gen<8, ck::lambda_get_up_dim_num<ck::Tuple<ck::PassThrough<int>... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
"
\n",
" \n",
- " | 7243834 | \n",
- " std::__uniq_ptr_data<ck::tensor_operation::device::DeviceGroupedConvFwdMulti... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
- "
\n",
- " \n",
- " | 7243835 | \n",
- " std::allocator_traits<std::allocator<std::_Rb_tree_node<std::pair<const ck::... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
- "
\n",
- " \n",
- " | 7243836 | \n",
+ " 6304625 | \n",
" std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
"
\n",
" \n",
- " | 7243837 | \n",
- " std::__and_<std::__is_implicitly_default_constructible<ck::tensor_operation:... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
+ " 6304626 | \n",
+ " ck::container_concat<ck::Sequence<0>, ck::Sequence<2, 1>, ck::Sequence<3>, c... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
"
\n",
" \n",
- " | 7243838 | \n",
- " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
+ " 6304627 | \n",
+ " ck::Sequence<19, 20, 21, 22, 23>::ReorderGivenNew2Old<0, 1, 2, 3, 4> | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
"
\n",
" \n",
- " | 7243839 | \n",
- " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... | \n",
- " 9406 | \n",
- " 27.985313 | \n",
- " 2.975262 | \n",
- " 0.500016 | \n",
+ " 6304628 | \n",
+ " ck::make_tuple<ck::Sequence<0>, ck::Sequence<>, ck::Sequence<1>, ck::Sequenc... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
+ "
\n",
+ " \n",
+ " | 6304629 | \n",
+ " std::__and_<std::is_convertible<ck::tensor_operation::device::DeviceGroupedC... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
+ "
\n",
+ " \n",
+ " | 6304630 | \n",
+ " ck::detail::unpack_impl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
+ "
\n",
+ " \n",
+ " | 6304631 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
+ "
\n",
+ " \n",
+ " | 6304632 | \n",
+ " ck::sequence_gen<3, ck::lambda_get_up_dim_num<ck::Tuple<ck::PassThrough<ck::... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
+ "
\n",
+ " \n",
+ " | 6304633 | \n",
+ " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int... | \n",
+ " 9773 | \n",
+ " 23.119388 | \n",
+ " 2.365639 | \n",
+ " 0.500023 | \n",
"
\n",
" \n",
"\n",
@@ -2286,38 +2264,38 @@
],
"text/plain": [
" full_signature \\\n",
- "7243825 ck::Tuple, ck::UnMerge, false>, ck:... \n",
- "7243826 std::__is_implicitly_default_constructible, ck::tensor_... \n",
- "7243829 ck::utility::launch_and_time_kernel_with_preprocess \n",
+ "6304620 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n",
+ "6304621 ck::detail::TupleImpl, ck::Sequence<19>... \n",
+ "6304622 ck::operator<<(std::ostream &, const TensorDescriptor... \n",
+ "6304625 std::unique_ptr, ck::Sequence<2, 1>, ck::Sequence<3>, c... \n",
+ "6304627 ck::Sequence<19, 20, 21, 22, 23>::ReorderGivenNew2Old<0, 1, 2, 3, 4> \n",
+ "6304628 ck::make_tuple, ck::Sequence<>, ck::Sequence<1>, ck::Sequenc... \n",
+ "6304629 std::__and_\n",
" 0 | \n",
" 0 | \n",
- " 348050 | \n",
- " 2.759851e+03 | \n",
- " 8.315829 | \n",
- " 1.22725 | \n",
- " 1.843240 | \n",
+ " 67779 | \n",
+ " 3.204520e+02 | \n",
+ " 5.207827 | \n",
+ " 1.19350 | \n",
+ " 0.266004 | \n",
" \n",
" \n",
" | 1 | \n",
" 1 | \n",
- " 2007036528 | \n",
- " 9.720831e+06 | \n",
- " 6.516100 | \n",
- " 1.22500 | \n",
- " 6492.316278 | \n",
+ " 1749956922 | \n",
+ " 7.353030e+06 | \n",
+ " 5.587812 | \n",
+ " 1.16350 | \n",
+ " 6103.673137 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
- " 4212093106 | \n",
- " 2.378498e+07 | \n",
- " 7.811773 | \n",
- " 1.28000 | \n",
- " 15885.434415 | \n",
+ " 3669923724 | \n",
+ " 1.799595e+07 | \n",
+ " 6.909845 | \n",
+ " 1.26400 | \n",
+ " 14938.245952 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
- " 2137634038 | \n",
- " 1.261997e+07 | \n",
- " 7.344963 | \n",
- " 1.27700 | \n",
- " 8428.586130 | \n",
+ " 2019427209 | \n",
+ " 1.064961e+07 | \n",
+ " 6.790369 | \n",
+ " 1.26300 | \n",
+ " 8840.127967 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
- " 2777069926 | \n",
- " 1.801647e+07 | \n",
- " 8.056019 | \n",
- " 1.29000 | \n",
- " 12032.783621 | \n",
+ " 2442047025 | \n",
+ " 1.470272e+07 | \n",
+ " 7.563676 | \n",
+ " 1.27750 | \n",
+ " 12204.571997 | \n",
"
\n",
" \n",
" | 5 | \n",
" 5 | \n",
- " 1651653946 | \n",
- " 1.169701e+07 | \n",
- " 8.960966 | \n",
- " 1.31100 | \n",
- " 7812.159446 | \n",
+ " 1644211492 | \n",
+ " 1.041237e+07 | \n",
+ " 8.437405 | \n",
+ " 1.27700 | \n",
+ " 8643.197486 | \n",
"
\n",
" \n",
" | 6 | \n",
" 6 | \n",
- " 781689120 | \n",
- " 8.147449e+06 | \n",
- " 16.840407 | \n",
- " 1.76800 | \n",
- " 5441.491328 | \n",
+ " 756309412 | \n",
+ " 6.665863e+06 | \n",
+ " 15.362248 | \n",
+ " 1.70650 | \n",
+ " 5533.262772 | \n",
"
\n",
" \n",
" | 7 | \n",
" 7 | \n",
- " 69349065 | \n",
- " 7.653282e+05 | \n",
- " 16.927265 | \n",
- " 1.79050 | \n",
- " 511.144843 | \n",
+ " 59804841 | \n",
+ " 5.608413e+05 | \n",
+ " 15.135226 | \n",
+ " 1.73000 | \n",
+ " 465.548483 | \n",
"
\n",
" \n",
" | 8 | \n",
" 8 | \n",
- " 1516615 | \n",
- " 1.622227e+04 | \n",
- " 16.672371 | \n",
- " 1.79625 | \n",
- " 10.834476 | \n",
- "
\n",
- " \n",
- " | 9 | \n",
- " 9 | \n",
- " 2150 | \n",
- " 1.411392e+01 | \n",
- " 7.276202 | \n",
- " 2.63050 | \n",
- " 0.009426 | \n",
+ " 687425 | \n",
+ " 6.183116e+03 | \n",
+ " 14.874348 | \n",
+ " 1.76475 | \n",
+ " 5.132540 | \n",
"
\n",
" \n",
"\n",
@@ -2494,16 +2463,15 @@
],
"text/plain": [
" depth count total_s mean_ms median_ms pct_total\n",
- "0 0 348050 2.759851e+03 8.315829 1.22725 1.843240\n",
- "1 1 2007036528 9.720831e+06 6.516100 1.22500 6492.316278\n",
- "2 2 4212093106 2.378498e+07 7.811773 1.28000 15885.434415\n",
- "3 3 2137634038 1.261997e+07 7.344963 1.27700 8428.586130\n",
- "4 4 2777069926 1.801647e+07 8.056019 1.29000 12032.783621\n",
- "5 5 1651653946 1.169701e+07 8.960966 1.31100 7812.159446\n",
- "6 6 781689120 8.147449e+06 16.840407 1.76800 5441.491328\n",
- "7 7 69349065 7.653282e+05 16.927265 1.79050 511.144843\n",
- "8 8 1516615 1.622227e+04 16.672371 1.79625 10.834476\n",
- "9 9 2150 1.411392e+01 7.276202 2.63050 0.009426"
+ "0 0 67779 3.204520e+02 5.207827 1.19350 0.266004\n",
+ "1 1 1749956922 7.353030e+06 5.587812 1.16350 6103.673137\n",
+ "2 2 3669923724 1.799595e+07 6.909845 1.26400 14938.245952\n",
+ "3 3 2019427209 1.064961e+07 6.790369 1.26300 8840.127967\n",
+ "4 4 2442047025 1.470272e+07 7.563676 1.27750 12204.571997\n",
+ "5 5 1644211492 1.041237e+07 8.437405 1.27700 8643.197486\n",
+ "6 6 756309412 6.665863e+06 15.362248 1.70650 5533.262772\n",
+ "7 7 59804841 5.608413e+05 15.135226 1.73000 465.548483\n",
+ "8 8 687425 6.183116e+03 14.874348 1.76475 5.132540"
]
},
"metadata": {},
@@ -2571,9 +2539,9 @@
"Template Argument Type Distribution:\n",
"Type Count Percentage\n",
"----------------------------------------\n",
- "template 125,537,546 80.8%\n",
- "primitive 19,045,151 12.3%\n",
- "unknown 10,699,046 6.9%\n",
+ "template 116,601,482 80.8%\n",
+ "primitive 17,641,607 12.2%\n",
+ "unknown 10,098,716 7.0%\n",
"\n",
"Templates with Most Arguments:\n"
]
@@ -2605,54 +2573,54 @@
" \n",
" \n",
" \n",
- " | 231420 | \n",
+ " 223300 | \n",
+ " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,... | \n",
+ " 33367 | \n",
+ "
\n",
+ " \n",
+ " | 223301 | \n",
+ " ck::vector_type<ck::bf8_fnuz_t, 8>::(unnamed union at /home/AMD/jshumway/com... | \n",
+ " 33367 | \n",
+ "
\n",
+ " \n",
+ " | 223302 | \n",
+ " ck::non_native_vector_base<ck::f8_fnuz_t, 64>::(unnamed union at /home/AMD/j... | \n",
+ " 33367 | \n",
+ "
\n",
+ " \n",
+ " | 223303 | \n",
" ck::Tuple<ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8... | \n",
- " 23716 | \n",
+ " 33367 | \n",
"
\n",
" \n",
- " | 231421 | \n",
- " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,... | \n",
- " 23716 | \n",
+ " 223304 | \n",
+ " ck::Tuple<ck::non_native_vector_base<ck::f8_fnuz_t, 4>, ck::non_native_vecto... | \n",
+ " 33367 | \n",
"
\n",
" \n",
- " | 231422 | \n",
- " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,... | \n",
- " 23716 | \n",
+ " 223305 | \n",
+ " ck::Tuple<ck::non_native_vector_base<ck::f8_fnuz_t, 2>, ck::non_native_vecto... | \n",
+ " 33367 | \n",
"
\n",
" \n",
- " | 231423 | \n",
- " ck::Tuple<_Float16 __attribute__((ext_vector_type(2))), _Float16 __attribute... | \n",
- " 23716 | \n",
+ " 223306 | \n",
+ " ck::non_native_vector_base<ck::f8_fnuz_t, 64> | \n",
+ " 33367 | \n",
"
\n",
" \n",
- " | 231424 | \n",
- " ck::non_native_vector_base<ck::f8_fnuz_t, 64>::(unnamed union at /home/AMD/j... | \n",
- " 23716 | \n",
+ " 223307 | \n",
+ " ck::Tuple<ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8... | \n",
+ " 33367 | \n",
"
\n",
" \n",
- " | 231425 | \n",
- " ck::Tuple<_Float16, _Float16, _Float16, _Float16, _Float16, _Float16, _Float... | \n",
- " 23716 | \n",
+ " 223308 | \n",
+ " ck::Tuple<ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8... | \n",
+ " 33367 | \n",
"
\n",
" \n",
- " | 231426 | \n",
- " ck::non_native_vector_base<ck::f8_fnuz_t, 64>::(unnamed union at /home/AMD/j... | \n",
- " 23716 | \n",
- "
\n",
- " \n",
- " | 231427 | \n",
- " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,... | \n",
- " 23716 | \n",
- "
\n",
- " \n",
- " | 231428 | \n",
- " ck::vector_type<_Float16, 32>::(unnamed union at /home/AMD/jshumway/composab... | \n",
- " 23716 | \n",
- "
\n",
- " \n",
- " | 231429 | \n",
- " ck::vector_type<signed char, 4>::(unnamed union at /home/AMD/jshumway/compos... | \n",
- " 23716 | \n",
+ " 223309 | \n",
+ " ck::Tuple<ck::non_native_vector_base<ck::f8_fnuz_t, 2>, ck::non_native_vecto... | \n",
+ " 33367 | \n",
"
\n",
" \n",
"\n",
@@ -2660,28 +2628,28 @@
],
"text/plain": [
" full_signature \\\n",
- "231420 ck::Tuple::(unnamed union at /home/AMD/j... \n",
- "231425 ck::Tuple<_Float16, _Float16, _Float16, _Float16, _Float16, _Float16, _Float... \n",
- "231426 ck::non_native_vector_base::(unnamed union at /home/AMD/j... \n",
- "231427 ck::detail::TupleImpl::(unnamed union at /home/AMD/jshumway/composab... \n",
- "231429 ck::vector_type::(unnamed union at /home/AMD/jshumway/compos... \n",
+ "223300 ck::detail::TupleImpl::(unnamed union at /home/AMD/jshumway/com... \n",
+ "223302 ck::non_native_vector_base::(unnamed union at /home/AMD/j... \n",
+ "223303 ck::Tuple, ck::non_native_vecto... \n",
+ "223305 ck::Tuple, ck::non_native_vecto... \n",
+ "223306 ck::non_native_vector_base \n",
+ "223307 ck::Tuple, ck::non_native_vecto... \n",
"\n",
" arg_count \n",
- "231420 23716 \n",
- "231421 23716 \n",
- "231422 23716 \n",
- "231423 23716 \n",
- "231424 23716 \n",
- "231425 23716 \n",
- "231426 23716 \n",
- "231427 23716 \n",
- "231428 23716 \n",
- "231429 23716 "
+ "223300 33367 \n",
+ "223301 33367 \n",
+ "223302 33367 \n",
+ "223303 33367 \n",
+ "223304 33367 \n",
+ "223305 33367 \n",
+ "223306 33367 \n",
+ "223307 33367 \n",
+ "223308 33367 \n",
+ "223309 33367 "
]
},
"metadata": {},
@@ -2764,164 +2732,164 @@
" \n",
" \n",
" \n",
- " | 47 | \n",
+ " 48 | \n",
" InstantiateFunction | \n",
- " 15006405 | \n",
- " 2069.145884 | \n",
- " 8.273051 | \n",
- " 34.981333 | \n",
+ " 13837259 | \n",
+ " 1669.984159 | \n",
+ " 7.241250 | \n",
+ " 24.078188 | \n",
"
\n",
" \n",
- " | 30 | \n",
+ " 31 | \n",
" ExecuteCompiler | \n",
- " 1251 | \n",
- " 445.214199 | \n",
- " 21353.198979 | \n",
- " 7.526867 | \n",
+ " 1276 | \n",
+ " 405.041858 | \n",
+ " 19045.855415 | \n",
+ " 5.839980 | \n",
"
\n",
" \n",
- " | 143 | \n",
+ " 145 | \n",
" Total ExecuteCompiler | \n",
- " 1251 | \n",
- " 445.214188 | \n",
- " 21353.198487 | \n",
- " 7.526867 | \n",
+ " 1276 | \n",
+ " 405.041848 | \n",
+ " 19045.854932 | \n",
+ " 5.839980 | \n",
"
\n",
" \n",
- " | 46 | \n",
- " InstantiateClass | \n",
- " 2430595 | \n",
- " 426.324732 | \n",
- " 10.523960 | \n",
- " 7.207519 | \n",
- "
\n",
- " \n",
- " | 147 | \n",
+ " 149 | \n",
" Total Frontend | \n",
- " 1251 | \n",
- " 384.832270 | \n",
- " 18457.183201 | \n",
- " 6.506040 | \n",
+ " 1276 | \n",
+ " 348.010786 | \n",
+ " 16364.143560 | \n",
+ " 5.017694 | \n",
"
\n",
" \n",
- " | 32 | \n",
+ " 33 | \n",
" Frontend | \n",
- " 2419 | \n",
- " 384.832095 | \n",
- " 9545.235926 | \n",
- " 6.506037 | \n",
+ " 2462 | \n",
+ " 348.010603 | \n",
+ " 8481.168229 | \n",
+ " 5.017691 | \n",
"
\n",
" \n",
- " | 241 | \n",
+ " 47 | \n",
+ " InstantiateClass | \n",
+ " 1969340 | \n",
+ " 337.831372 | \n",
+ " 10.292729 | \n",
+ " 4.870925 | \n",
+ "
\n",
+ " \n",
+ " | 243 | \n",
" Total Source | \n",
- " 1251 | \n",
- " 232.585137 | \n",
- " 11155.162444 | \n",
- " 3.932124 | \n",
+ " 1276 | \n",
+ " 220.549562 | \n",
+ " 10370.669049 | \n",
+ " 3.179931 | \n",
"
\n",
" \n",
- " | 168 | \n",
+ " 170 | \n",
" Total InstantiateFunction | \n",
- " 1251 | \n",
- " 187.199754 | \n",
- " 8978.405449 | \n",
- " 3.164831 | \n",
+ " 1276 | \n",
+ " 154.873240 | \n",
+ " 7282.440762 | \n",
+ " 2.232995 | \n",
"
\n",
" \n",
- " | 75 | \n",
+ " 78 | \n",
" PerformPendingInstantiations | \n",
- " 1251 | \n",
- " 109.696772 | \n",
- " 5261.236084 | \n",
- " 1.854552 | \n",
+ " 1276 | \n",
+ " 94.353463 | \n",
+ " 4436.683223 | \n",
+ " 1.360408 | \n",
"
\n",
" \n",
- " | 217 | \n",
+ " 219 | \n",
" Total PerformPendingInstantiations | \n",
- " 1251 | \n",
- " 109.696762 | \n",
- " 5261.235595 | \n",
- " 1.854552 | \n",
+ " 1276 | \n",
+ " 94.353453 | \n",
+ " 4436.682719 | \n",
+ " 1.360408 | \n",
"
\n",
" \n",
- " | 167 | \n",
+ " 169 | \n",
" Total InstantiateClass | \n",
- " 1251 | \n",
- " 101.970700 | \n",
- " 4890.681049 | \n",
- " 1.723934 | \n",
- "
\n",
- " \n",
- " | 211 | \n",
- " Total ParseClass | \n",
- " 1251 | \n",
- " 65.367502 | \n",
- " 3135.131968 | \n",
- " 1.105114 | \n",
- "
\n",
- " \n",
- " | 69 | \n",
- " ParseClass | \n",
- " 615160 | \n",
- " 60.689238 | \n",
- " 5.919361 | \n",
- " 1.026023 | \n",
+ " 1276 | \n",
+ " 84.004119 | \n",
+ " 3950.036931 | \n",
+ " 1.211189 | \n",
"
\n",
" \n",
" | 4 | \n",
" Backend | \n",
- " 1251 | \n",
- " 57.471167 | \n",
- " 2756.410890 | \n",
- " 0.971617 | \n",
+ " 1276 | \n",
+ " 55.376770 | \n",
+ " 2603.923346 | \n",
+ " 0.798434 | \n",
"
\n",
" \n",
- " | 104 | \n",
+ " 106 | \n",
" Total Backend | \n",
- " 1251 | \n",
- " 57.471157 | \n",
- " 2756.410412 | \n",
- " 0.971617 | \n",
- "
\n",
- " \n",
- " | 70 | \n",
- " ParseDeclarationOrFunctionDefinition | \n",
- " 377890 | \n",
- " 50.231873 | \n",
- " 7.975634 | \n",
- " 0.849229 | \n",
- "
\n",
- " \n",
- " | 212 | \n",
- " Total ParseDeclarationOrFunctionDefinition | \n",
- " 1251 | \n",
- " 44.632077 | \n",
- " 2140.627196 | \n",
- " 0.754557 | \n",
- "
\n",
- " \n",
- " | 71 | \n",
- " ParseFunctionDefinition | \n",
- " 410465 | \n",
- " 38.754508 | \n",
- " 5.664967 | \n",
- " 0.655190 | \n",
+ " 1276 | \n",
+ " 55.376759 | \n",
+ " 2603.922835 | \n",
+ " 0.798434 | \n",
"
\n",
" \n",
" | 213 | \n",
- " Total ParseFunctionDefinition | \n",
- " 1251 | \n",
- " 38.036852 | \n",
- " 1824.309444 | \n",
- " 0.643057 | \n",
+ " Total ParseClass | \n",
+ " 1276 | \n",
+ " 52.732797 | \n",
+ " 2479.598586 | \n",
+ " 0.760313 | \n",
"
\n",
" \n",
- " | 68 | \n",
+ " 72 | \n",
+ " ParseClass | \n",
+ " 503019 | \n",
+ " 47.920679 | \n",
+ " 5.715969 | \n",
+ " 0.690931 | \n",
+ "
\n",
+ " \n",
+ " | 73 | \n",
+ " ParseDeclarationOrFunctionDefinition | \n",
+ " 320116 | \n",
+ " 44.703333 | \n",
+ " 8.378838 | \n",
+ " 0.644542 | \n",
+ "
\n",
+ " \n",
+ " | 71 | \n",
" Optimizer | \n",
- " 1251 | \n",
- " 37.729725 | \n",
- " 1809.579141 | \n",
- " 0.637865 | \n",
+ " 1276 | \n",
+ " 36.732533 | \n",
+ " 1727.235099 | \n",
+ " 0.529617 | \n",
+ "
\n",
+ " \n",
+ " | 209 | \n",
+ " Total Optimizer | \n",
+ " 1276 | \n",
+ " 36.732523 | \n",
+ " 1727.234601 | \n",
+ " 0.529617 | \n",
+ "
\n",
+ " \n",
+ " | 214 | \n",
+ " Total ParseDeclarationOrFunctionDefinition | \n",
+ " 1276 | \n",
+ " 35.048983 | \n",
+ " 1648.071277 | \n",
+ " 0.505344 | \n",
+ "
\n",
+ " \n",
+ " | 215 | \n",
+ " Total ParseFunctionDefinition | \n",
+ " 1276 | \n",
+ " 29.408540 | \n",
+ " 1382.846726 | \n",
+ " 0.424019 | \n",
"
\n",
" \n",
"\n",
@@ -2929,48 +2897,48 @@
],
"text/plain": [
" event_type count total_min \\\n",
- "47 InstantiateFunction 15006405 2069.145884 \n",
- "30 ExecuteCompiler 1251 445.214199 \n",
- "143 Total ExecuteCompiler 1251 445.214188 \n",
- "46 InstantiateClass 2430595 426.324732 \n",
- "147 Total Frontend 1251 384.832270 \n",
- "32 Frontend 2419 384.832095 \n",
- "241 Total Source 1251 232.585137 \n",
- "168 Total InstantiateFunction 1251 187.199754 \n",
- "75 PerformPendingInstantiations 1251 109.696772 \n",
- "217 Total PerformPendingInstantiations 1251 109.696762 \n",
- "167 Total InstantiateClass 1251 101.970700 \n",
- "211 Total ParseClass 1251 65.367502 \n",
- "69 ParseClass 615160 60.689238 \n",
- "4 Backend 1251 57.471167 \n",
- "104 Total Backend 1251 57.471157 \n",
- "70 ParseDeclarationOrFunctionDefinition 377890 50.231873 \n",
- "212 Total ParseDeclarationOrFunctionDefinition 1251 44.632077 \n",
- "71 ParseFunctionDefinition 410465 38.754508 \n",
- "213 Total ParseFunctionDefinition 1251 38.036852 \n",
- "68 Optimizer 1251 37.729725 \n",
+ "48 InstantiateFunction 13837259 1669.984159 \n",
+ "31 ExecuteCompiler 1276 405.041858 \n",
+ "145 Total ExecuteCompiler 1276 405.041848 \n",
+ "149 Total Frontend 1276 348.010786 \n",
+ "33 Frontend 2462 348.010603 \n",
+ "47 InstantiateClass 1969340 337.831372 \n",
+ "243 Total Source 1276 220.549562 \n",
+ "170 Total InstantiateFunction 1276 154.873240 \n",
+ "78 PerformPendingInstantiations 1276 94.353463 \n",
+ "219 Total PerformPendingInstantiations 1276 94.353453 \n",
+ "169 Total InstantiateClass 1276 84.004119 \n",
+ "4 Backend 1276 55.376770 \n",
+ "106 Total Backend 1276 55.376759 \n",
+ "213 Total ParseClass 1276 52.732797 \n",
+ "72 ParseClass 503019 47.920679 \n",
+ "73 ParseDeclarationOrFunctionDefinition 320116 44.703333 \n",
+ "71 Optimizer 1276 36.732533 \n",
+ "209 Total Optimizer 1276 36.732523 \n",
+ "214 Total ParseDeclarationOrFunctionDefinition 1276 35.048983 \n",
+ "215 Total ParseFunctionDefinition 1276 29.408540 \n",
"\n",
" mean_ms pct_total \n",
- "47 8.273051 34.981333 \n",
- "30 21353.198979 7.526867 \n",
- "143 21353.198487 7.526867 \n",
- "46 10.523960 7.207519 \n",
- "147 18457.183201 6.506040 \n",
- "32 9545.235926 6.506037 \n",
- "241 11155.162444 3.932124 \n",
- "168 8978.405449 3.164831 \n",
- "75 5261.236084 1.854552 \n",
- "217 5261.235595 1.854552 \n",
- "167 4890.681049 1.723934 \n",
- "211 3135.131968 1.105114 \n",
- "69 5.919361 1.026023 \n",
- "4 2756.410890 0.971617 \n",
- "104 2756.410412 0.971617 \n",
- "70 7.975634 0.849229 \n",
- "212 2140.627196 0.754557 \n",
- "71 5.664967 0.655190 \n",
- "213 1824.309444 0.643057 \n",
- "68 1809.579141 0.637865 "
+ "48 7.241250 24.078188 \n",
+ "31 19045.855415 5.839980 \n",
+ "145 19045.854932 5.839980 \n",
+ "149 16364.143560 5.017694 \n",
+ "33 8481.168229 5.017691 \n",
+ "47 10.292729 4.870925 \n",
+ "243 10370.669049 3.179931 \n",
+ "170 7282.440762 2.232995 \n",
+ "78 4436.683223 1.360408 \n",
+ "219 4436.682719 1.360408 \n",
+ "169 3950.036931 1.211189 \n",
+ "4 2603.923346 0.798434 \n",
+ "106 2603.922835 0.798434 \n",
+ "213 2479.598586 0.760313 \n",
+ "72 5.715969 0.690931 \n",
+ "73 8.378838 0.644542 \n",
+ "71 1727.235099 0.529617 \n",
+ "209 1727.234601 0.529617 \n",
+ "214 1648.071277 0.505344 \n",
+ "215 1382.846726 0.424019 "
]
},
"metadata": {},
@@ -3026,9 +2994,9 @@
"output_type": "stream",
"text": [
"Found ninja log: ../../../build-trace/.ninja_log\n",
- "Parsed 2,579 build events in 0.004s\n",
+ "Parsed 2,536 build events in 0.003s\n",
"\n",
- "Builds DataFrame: 2,579 rows\n"
+ "Builds DataFrame: 2,536 rows\n"
]
},
{
@@ -3082,29 +3050,29 @@
" \n",
" | 2 | \n",
" library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi... | \n",
- " 228 | \n",
- " 9471 | \n",
+ " 354 | \n",
+ " 9957 | \n",
" b37d14fd75e2d29c | \n",
" -1 | \n",
- " 9243 | \n",
+ " 9603 | \n",
"
\n",
" \n",
" | 3 | \n",
" _deps/gtest-build/googletest/CMakeFiles/gtest.dir/src/gtest-all.cc.o | \n",
- " 103 | \n",
- " 9527 | \n",
+ " 225 | \n",
+ " 9931 | \n",
" 330bec9abe978cde | \n",
" -1 | \n",
- " 9424 | \n",
+ " 9706 | \n",
"
\n",
" \n",
" | 4 | \n",
" library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi... | \n",
- " 243 | \n",
- " 9687 | \n",
+ " 365 | \n",
+ " 229598 | \n",
" 6a5b0a35a99473e8 | \n",
" -1 | \n",
- " 9444 | \n",
+ " 229233 | \n",
"
\n",
" \n",
"\n",
@@ -3121,9 +3089,9 @@
" start_ms end_ms cmd_hash worker_id duration_ms \n",
"0 487 511 20b4a9dba5446149 -1 24 \n",
"1 512 159966 80f2b673d9d1b995 -1 159454 \n",
- "2 228 9471 b37d14fd75e2d29c -1 9243 \n",
- "3 103 9527 330bec9abe978cde -1 9424 \n",
- "4 243 9687 6a5b0a35a99473e8 -1 9444 "
+ "2 354 9957 b37d14fd75e2d29c -1 9603 \n",
+ "3 225 9931 330bec9abe978cde -1 9706 \n",
+ "4 365 229598 6a5b0a35a99473e8 -1 229233 "
]
},
"metadata": {},
diff --git a/script/analyze_build/trace_analysis/__init__.py b/script/analyze_build/trace_analysis/__init__.py
index 22fc3e0b5f..cb43359d71 100644
--- a/script/analyze_build/trace_analysis/__init__.py
+++ b/script/analyze_build/trace_analysis/__init__.py
@@ -71,6 +71,7 @@ from .parser import TraceParser
from .transformer import TraceTransformer
from .template_parser import TemplateParser
from .ninja_parser import NinjaLogParser
+from .chrome_trace import ChromeTraceExporter
from .utils import find_trace_files
__all__ = [
@@ -87,6 +88,8 @@ __all__ = [
"NinjaLogParser",
"NinjaBuild",
"CompilationTimeline",
+ # Chrome Trace export
+ "ChromeTraceExporter",
# Metadata and statistics
"FileMetadata",
"BuildStatistics",
diff --git a/script/analyze_build/trace_analysis/chrome_trace.py b/script/analyze_build/trace_analysis/chrome_trace.py
new file mode 100644
index 0000000000..5e6a797314
--- /dev/null
+++ b/script/analyze_build/trace_analysis/chrome_trace.py
@@ -0,0 +1,133 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+"""
+Chrome Trace Event Format export for Perfetto visualization.
+
+Exports ninja build timeline data to Chrome Trace Event Format
+for visualization in Perfetto UI within Jupyter notebooks.
+"""
+
+from pathlib import Path
+from typing import Dict, Any
+import pandas as pd
+
+
+class ChromeTraceExporter:
+    """Export trace analysis data to Chrome Trace Event Format.
+
+    Every method is a ``@staticmethod``; the class is used purely as a
+    namespace and holds no instance state.
+    """
+
+    # File suffix (lower-cased) -> Chrome Trace category.
+    _CATEGORY_BY_SUFFIX = {
+        ".o": "compile",
+        ".obj": "compile",
+        ".a": "archive",
+        ".lib": "archive",
+        ".so": "link_shared",
+        ".dll": "link_shared",
+        ".dylib": "link_shared",
+        ".exe": "link_executable",
+        ".out": "link_executable",
+    }
+
+    @staticmethod
+    def _trace_metadata() -> Dict[str, Any]:
+        """Return a fresh copy of the top-level metadata fields."""
+        return {
+            "displayTimeUnit": "ms",
+            "otherData": {"version": "1.0", "generator": "analyze_build"},
+        }
+
+    @staticmethod
+    def categorize_target(target: str) -> str:
+        """
+        Categorize a build target based on file extension.
+
+        The suffix is compared case-insensitively. Targets whose suffix is
+        not recognised fall back to a substring check for "test" anywhere
+        in the (lower-cased) target name.
+
+        Args:
+            target: Build target name (e.g., "obj/foo.o")
+
+        Returns:
+            Category string for Chrome Trace format: one of "compile",
+            "archive", "link_shared", "link_executable", "test", "other".
+        """
+        ext = Path(target).suffix.lower()
+        category = ChromeTraceExporter._CATEGORY_BY_SUFFIX.get(ext)
+        if category is not None:
+            return category
+        return "test" if "test" in target.lower() else "other"
+
+    @staticmethod
+    def export_ninja_timeline(
+        builds_df: pd.DataFrame, process_id: int = 1, include_metadata: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Export ninja build timeline to Chrome Trace format.
+
+        Each row becomes one complete ("X") event whose timestamp and
+        duration are the row's ``start_ms`` / ``duration_ms`` converted to
+        microseconds, and whose thread id is the row's ``worker_id``.
+
+        Args:
+            builds_df: DataFrame with columns: target, start_ms, end_ms,
+                      duration_ms, worker_id, cmd_hash
+            process_id: Process ID for trace events (default: 1)
+            include_metadata: Include trace metadata (default: True)
+
+        Returns:
+            Dictionary in Chrome Trace Event Format:
+            {
+                'traceEvents': [...],
+                'displayTimeUnit': 'ms',
+                'otherData': {...}
+            }
+            When ``include_metadata`` is False only ``'traceEvents'`` is
+            present, including for an empty ``builds_df``.
+
+        Example:
+            >>> from trace_analysis import NinjaLogParser, ChromeTraceExporter
+            >>> builds = NinjaLogParser.parse(Path('build/.ninja_log'))
+            >>> builds_df = NinjaLogParser.to_dataframe(builds)
+            >>> builds_df = NinjaLogParser.assign_workers(builds_df)
+            >>> trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df)
+            >>> # Display in notebook or save to file
+        """
+        events = []
+
+        # Guard on row count so an empty frame (even one without the
+        # expected columns) yields an empty event list instead of KeyError.
+        if len(builds_df) > 0:
+            # Column-wise iteration avoids building a Series per row.
+            rows = zip(
+                builds_df["target"],
+                builds_df["start_ms"],
+                builds_df["duration_ms"],
+                builds_df["worker_id"],
+                builds_df["cmd_hash"],
+            )
+            for target, start_ms, duration_ms, worker_id, cmd_hash in rows:
+                events.append(
+                    {
+                        "name": target,
+                        "cat": ChromeTraceExporter.categorize_target(target),
+                        "ph": "X",  # Complete event (has duration)
+                        "ts": int(start_ms * 1000),  # ms -> microseconds
+                        "dur": int(duration_ms * 1000),  # ms -> microseconds
+                        "pid": process_id,
+                        "tid": int(worker_id),
+                        "args": {
+                            "output": target,
+                            "duration_ms": int(duration_ms),
+                            "cmd_hash": cmd_hash,
+                        },
+                    }
+                )
+
+        trace: Dict[str, Any] = {"traceEvents": events}
+        if include_metadata:
+            trace.update(ChromeTraceExporter._trace_metadata())
+        return trace
+
+    @staticmethod
+    def export_to_file(trace_data: Dict[str, Any], output_path: str) -> None:
+        """
+        Export trace data to a JSON file.
+
+        The file is overwritten if it already exists and is written with
+        two-space indentation.
+
+        Args:
+            trace_data: Chrome Trace format dictionary
+            output_path: Path to output file
+
+        Example:
+            >>> ChromeTraceExporter.export_to_file(trace_data, 'trace.json')
+        """
+        import json
+
+        # Explicit encoding keeps the output independent of the locale.
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(trace_data, f, indent=2)
diff --git a/script/analyze_build/trace_analysis/perfetto_display.py b/script/analyze_build/trace_analysis/perfetto_display.py
new file mode 100644
index 0000000000..910c2a8064
--- /dev/null
+++ b/script/analyze_build/trace_analysis/perfetto_display.py
@@ -0,0 +1,190 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+"""
+Perfetto UI display utilities for Jupyter notebooks.
+
+Provides functions to display Chrome Trace data in Perfetto UI
+directly within Jupyter notebooks.
+"""
+
+import json
+import base64
+from typing import Dict, Any, Optional
+
+
+def display_perfetto(trace_data: Dict[str, Any], height: int = 600):
+    """
+    Display Perfetto UI in Jupyter notebook with embedded trace data.
+
+    The trace is serialised to JSON, base64-encoded and appended to the
+    Perfetto UI URL, which is then wrapped in an IFrame.
+
+    Args:
+        trace_data: Chrome Trace Event Format dictionary
+        height: Height of the IFrame in pixels (default: 600)
+
+    Returns:
+        IPython IFrame object for display in notebook
+
+    Raises:
+        ImportError: If IPython is not installed.
+
+    Example:
+        >>> from trace_analysis import NinjaLogParser, ChromeTraceExporter
+        >>> from trace_analysis.perfetto_display import display_perfetto
+        >>> builds_df = NinjaLogParser.to_dataframe(builds)
+        >>> trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df)
+        >>> display_perfetto(trace_data)
+
+    Note:
+        This function requires IPython to be installed (available in Jupyter).
+        The trace data is base64-encoded and embedded in the Perfetto UI URL.
+        For very large traces (>10MB), consider using save_and_link() instead.
+    """
+    try:
+        from IPython.display import IFrame
+    except ImportError as exc:
+        # Chain the original error so the real import failure stays visible.
+        raise ImportError(
+            "IPython is required for display_perfetto(). "
+            "Install it with: pip install ipython"
+        ) from exc
+
+    # Convert trace to JSON string
+    trace_json = json.dumps(trace_data)
+
+    # Base64 encode for URL
+    trace_b64 = base64.b64encode(trace_json.encode()).decode()
+
+    # Perfetto UI URL with embedded trace
+    # NOTE(review): the `s=` query parameter looks like Perfetto's permalink
+    # id rather than an inline trace payload -- confirm the UI actually loads
+    # the embedded data before relying on this in notebooks.
+    perfetto_url = f"https://ui.perfetto.dev/#!/?s={trace_b64}"
+
+    # Display in IFrame
+    return IFrame(perfetto_url, width="100%", height=height)
+
+
+def save_and_link(
+    trace_data: Dict[str, Any], output_path: str, link_text: Optional[str] = None
+):
+    """
+    Save trace to file and display a link to open in Perfetto UI.
+
+    This is useful for large traces that are too big to embed in a URL.
+    The IPython import is checked first, so nothing is written to disk
+    when IPython is missing.
+
+    Args:
+        trace_data: Chrome Trace Event Format dictionary
+        output_path: Path to save the trace file
+        link_text: Custom link text (default: "Open trace in Perfetto UI")
+
+    Returns:
+        IPython HTML object with download link and instructions
+
+    Raises:
+        ImportError: If IPython is not installed.
+
+    Example:
+        >>> save_and_link(trace_data, '../data/build_trace.json')
+
+    Note:
+        The user will need to manually upload the saved file to
+        https://ui.perfetto.dev
+    """
+    try:
+        from IPython.display import HTML
+    except ImportError as exc:
+        # Chain the original error so the real import failure stays visible.
+        raise ImportError(
+            "IPython is required for save_and_link(). "
+            "Install it with: pip install ipython"
+        ) from exc
+
+    from html import escape
+
+    # Save trace to file
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(trace_data, f, indent=2)
+
+    if link_text is None:
+        link_text = "Open trace in Perfetto UI"
+
+    # Escape caller-supplied text so characters such as '<' or '&' in a
+    # path or label cannot break (or inject into) the rendered markup.
+    shown_path = escape(str(output_path))
+    shown_link = escape(link_text)
+
+    # Create HTML with instructions
+    markup = f"""
+    <div>
+        <p><strong>Trace saved to:</strong> <code>{shown_path}</code></p>
+        <p>To view in Perfetto UI:</p>
+        <ol>
+            <li>Go to <a href="https://ui.perfetto.dev" target="_blank">{shown_link}</a></li>
+            <li>Click "Open trace file" and select: <code>{shown_path}</code></li>
+        </ol>
+        <p><em>Or drag and drop the file directly into the Perfetto UI.</em></p>
+    </div>
+    """
+
+    return HTML(markup)
+
+
+def get_trace_summary(trace_data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Summarise a Chrome Trace dictionary.
+
+    Reads the ``traceEvents`` list and reports how many events it holds,
+    the sum of their ``dur`` fields converted from microseconds to
+    seconds, a per-``cat`` event count (events without ``cat`` are counted
+    as "unknown"), and the number of distinct non-None ``tid`` values.
+
+    Args:
+        trace_data: Chrome Trace Event Format dictionary
+
+    Returns:
+        Dictionary with keys ``event_count``, ``total_duration_s``,
+        ``categories`` and ``worker_count``
+
+    Example:
+        >>> summary = get_trace_summary(trace_data)
+        >>> print(f"Total events: {summary['event_count']}")
+        >>> print(f"Total time: {summary['total_duration_s']:.2f}s")
+    """
+    trace_events = trace_data.get("traceEvents", [])
+
+    # Missing, empty or None event list: report an all-zero summary.
+    if not trace_events:
+        return {
+            "event_count": 0,
+            "total_duration_s": 0.0,
+            "categories": {},
+            "worker_count": 0,
+        }
+
+    per_category = {}
+    summed_us = 0
+    seen_workers = set()
+
+    for entry in trace_events:
+        label = entry.get("cat", "unknown")
+        per_category.setdefault(label, 0)
+        per_category[label] += 1
+
+        # Events without a duration contribute nothing to the total.
+        summed_us += entry.get("dur", 0)
+
+        seen_workers.add(entry.get("tid"))
+
+    # Events lacking a thread id must not count as a worker.
+    seen_workers.discard(None)
+
+    return {
+        "event_count": len(trace_events),
+        "total_duration_s": summed_us / 1e6,
+        "categories": per_category,
+        "worker_count": len(seen_workers),
+    }
+
+
+def print_trace_summary(trace_data: Dict[str, Any]) -> None:
+    """
+    Write a human-readable trace summary to standard output.
+
+    Prints the event count, total duration and worker count obtained from
+    ``get_trace_summary``, followed by one line per category ordered from
+    most to fewest events.
+
+    Args:
+        trace_data: Chrome Trace Event Format dictionary
+
+    Example:
+        >>> print_trace_summary(trace_data)
+        === Trace Summary ===
+        Total events: 1,234
+        Total duration: 123.45s
+        Workers: 8
+        ...
+    """
+    stats = get_trace_summary(trace_data)
+    per_category = stats["categories"]
+
+    print("=== Trace Summary ===")
+    print(f"Total events: {stats['event_count']:,}")
+    print(f"Total duration: {stats['total_duration_s']:.2f}s")
+    print(f"Workers: {stats['worker_count']}")
+
+    # Nothing more to report when no category was recorded.
+    if not per_category:
+        return
+
+    print("\nEvents by category:")
+    ranked = sorted(per_category.items(), key=lambda pair: pair[1], reverse=True)
+    for cat, count in ranked:
+        print(f" {cat:15} {count:6,} events")