diff --git a/script/analyze_build/docs/CHROME_TRACE_EXPORT.md b/script/analyze_build/docs/CHROME_TRACE_EXPORT.md new file mode 100644 index 0000000000..de079f6b3a --- /dev/null +++ b/script/analyze_build/docs/CHROME_TRACE_EXPORT.md @@ -0,0 +1,960 @@ +# Chrome Trace Export for Cross-Validation + +**Status**: Design Document +**Author**: Build Analysis Team +**Date**: January 2026 +**Version**: 1.0 + +## Executive Summary + +This document proposes adding Chrome Trace Event Format export capabilities to the `analyze_build` library to enable cross-validation with the existing `ninja_json_converter.py` tool. The two tools serve complementary purposes and this enhancement will allow verification of data consistency between them. + +## Background + +### Current State: Two Complementary Tools + +The project currently has two distinct build analysis tools: + +#### 1. `ninja_json_converter.py` - Build System Monitoring +- **Purpose**: Monitor build-level parallelism and efficiency +- **Primary Users**: Build engineers, CI/CD optimization teams +- **Key Metrics**: Worker utilization, critical path, slow compilation units +- **Output Format**: Chrome Trace Event Format (JSON) +- **Granularity**: File-level (compilation units) +- **Visualization**: Perfetto / Chrome Tracing UI +- **Use Cases**: + - Is our build sharding efficient? + - Which files are compilation bottlenecks? + - How well are we utilizing available CPU cores? + - What's the critical path in our build? + +#### 2. 
`analyze_build` Library - Compiler Performance Analysis +- **Purpose**: Deep analysis of C++ template metaprogramming costs +- **Primary Users**: C++ developers, library maintainers, performance engineers +- **Key Metrics**: Template instantiation times, template relationships, compiler event breakdown +- **Output Format**: Pandas DataFrames for statistical analysis +- **Granularity**: Template-level and compiler event-level (within compilation) +- **Visualization**: Jupyter notebooks with statistical analysis +- **Use Cases**: + - Which templates are most expensive to instantiate? + - What are the template dependency relationships? + - How can we optimize our metaprogramming patterns? + - How can we measure improved build times with better metaprogramming? + - What percentage of build time is template instantiation? + +### The Problem: Need for Cross-Validation + +Currently, these tools operate independently with no mechanism to verify consistency. This creates several challenges: + +1. **Data Accuracy**: No way to verify both tools are parsing the same underlying data correctly +2. **Discrepancy Detection**: When numbers differ, unclear which tool is correct +3. **Cross-Referencing**: Difficult to correlate findings (e.g., "slow file in ninja" vs "high template time in analyzer") +4. **Debugging**: Hard to diagnose when tools report different build times +5. **Trust**: Users may question which tool's numbers to believe + +## Goals and Non-Goals + +### Primary Goals + +1. **Enable Cross-Validation**: Export analyze_build data to Chrome Trace format for comparison with ninja_json_converter +2. **Verify Consistency**: Provide utilities to compare outputs and identify discrepancies +3. **Sanity Checking**: Quick visual verification in Perfetto that data looks correct +4. **Cross-Reference Findings**: Correlate slow files with expensive templates + +### Secondary Goals + +1. 
**Template Event Visualization**: Optionally export template instantiation events as additional trace layer +2. **Debugging Support**: Help diagnose when tools report different results +3. **Documentation**: Clear workflow for validation process + +### Explicit Non-Goals + +1. **Not Replacing ninja_json_converter**: The tools serve different purposes and both should continue to exist +2. **Not Full-Featured Visualization**: analyze_build focuses on statistical analysis, not interactive timelines +3. **Not Advanced Timeline Features**: Keep it simple - just export for validation +4. **Not Multi-Build Comparison**: ninja_json_converter already handles this well + +## Technical Design + +### Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────┐ +│ analyze_build Library │ +├─────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ NinjaParser │─────▶│ builds_df │ │ +│ └──────────────┘ └──────┬───────┘ │ +│ │ │ +│ ┌──────────────┐ ┌──────▼───────┐ │ +│ │ TraceParser │─────▶│ events_df │ │ +│ └──────────────┘ └──────┬───────┘ │ +│ │ │ +│ ┌──────▼───────────┐ │ +│ │ ChromeTraceExporter│ │ +│ └──────┬───────────┘ │ +│ │ │ +│ ┌──────▼───────────┐ │ +│ │ trace_events │ │ +│ │ (Chrome Format) │ │ +│ └──────┬───────────┘ │ +│ │ │ +└───────────────────────────────┼──────────────────────────────┘ + │ + ┌───────────▼────────────┐ + │ Validation Utilities │ + └───────────┬────────────┘ + │ + ┌───────────▼────────────┐ + │ ninja_json_converter │ + │ output │ + └────────────────────────┘ +``` + +### New Module: `trace_analysis/chrome_trace.py` + +```python +""" +Chrome Trace Event Format export for cross-validation. + +Exports trace analysis data to Chrome Trace Event Format compatible +with ninja_json_converter.py output for validation purposes. 
+""" + +from typing import Dict, List, Optional, Any +import pandas as pd + + +class ChromeTraceExporter: + """Export trace analysis data to Chrome Trace Event Format.""" + + @staticmethod + def export_ninja_timeline( + builds_df: pd.DataFrame, + process_id: int = 1, + include_metadata: bool = True + ) -> Dict[str, Any]: + """ + Export ninja build timeline to Chrome Trace format. + + Creates trace events compatible with ninja_json_converter.py output + for cross-validation purposes. + + Args: + builds_df: DataFrame with columns: target, start_ms, end_ms, + duration_ms, worker_id, (optional) category + process_id: Process ID for trace events (default: 1) + include_metadata: Include trace metadata (default: True) + + Returns: + Dictionary in Chrome Trace Event Format: + { + 'traceEvents': [...], + 'displayTimeUnit': 'ms', + 'otherData': {...} + } + + Example: + >>> trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df) + >>> with open('trace.json', 'w') as f: + ... json.dump(trace_data, f) + """ + + @staticmethod + def export_template_events( + instantiations_df: pd.DataFrame, + templates_df: pd.DataFrame, + builds_df: pd.DataFrame, + process_id: int = 1, + granularity_us: int = 50000 + ) -> Dict[str, Any]: + """ + Export template instantiation events as Chrome Trace layer. + + Creates template-level trace events that can be overlaid on the + ninja build timeline for detailed compiler analysis. + + Args: + instantiations_df: Template instantiation events + templates_df: Template definitions + builds_df: Ninja builds (for timing alignment) + process_id: Process ID for trace events + granularity_us: Minimum duration threshold in microseconds + + Returns: + Chrome Trace Event Format dictionary with template events + + Note: + Template events are aligned with ninja build timing and + filtered by granularity threshold to reduce trace size. 
+ """ + + @staticmethod + def merge_traces( + ninja_trace: Dict[str, Any], + template_trace: Dict[str, Any] + ) -> Dict[str, Any]: + """ + Merge ninja and template traces into single trace file. + + Combines build-level and template-level events for unified + visualization in Perfetto. + + Args: + ninja_trace: Ninja build timeline trace + template_trace: Template instantiation trace + + Returns: + Merged trace with both event types + """ +``` + +### New Module: `trace_analysis/validation.py` + +```python +""" +Validation utilities for cross-checking trace analysis tools. + +Compares outputs from analyze_build and ninja_json_converter to +verify data consistency and identify discrepancies. +""" + +from typing import Dict, List, Any, Optional +import pandas as pd + + +class TraceValidator: + """Validate consistency between trace analysis tools.""" + + @staticmethod + def compare_traces( + analyzer_trace: Dict[str, Any], + ninja_converter_trace: Dict[str, Any], + tolerance_ms: float = 1.0 + ) -> Dict[str, Any]: + """ + Compare Chrome Trace outputs from both tools. + + Validates that analyze_build and ninja_json_converter produce + consistent results from the same underlying data. + + Args: + analyzer_trace: Trace from ChromeTraceExporter + ninja_converter_trace: Trace from ninja_json_converter.py + tolerance_ms: Acceptable time difference in milliseconds + + Returns: + Validation report: + { + 'total_time_match': bool, + 'total_time_diff_ms': float, + 'event_count_match': bool, + 'event_count_diff': int, + 'file_discrepancies': [ + { + 'file': str, + 'analyzer_ms': float, + 'ninja_ms': float, + 'diff_ms': float, + 'diff_pct': float + } + ], + 'summary': str + } + """ + + @staticmethod + def validate_ninja_log_parsing( + builds_df: pd.DataFrame, + ninja_log_path: str + ) -> Dict[str, Any]: + """ + Validate that NinjaLogParser correctly parsed .ninja_log. + + Cross-checks parsed DataFrame against raw .ninja_log file + to ensure no data loss or corruption. 
+ + Args: + builds_df: Parsed builds DataFrame + ninja_log_path: Path to original .ninja_log file + + Returns: + Validation report with any parsing issues + """ + + @staticmethod + def generate_validation_report( + validation_results: Dict[str, Any], + output_path: Optional[str] = None + ) -> str: + """ + Generate human-readable validation report. + + Creates formatted report of validation results for review. + + Args: + validation_results: Results from compare_traces() + output_path: Optional path to save report + + Returns: + Formatted report string + """ +``` + +### Data Flow + +``` +1. Parse .ninja_log + └─> NinjaLogParser.parse() -> builds_df + +2. Export to Chrome Trace + └─> ChromeTraceExporter.export_ninja_timeline(builds_df) -> trace_data + +3. Save trace file + └─> json.dump(trace_data, 'analyzer_trace.json') + +4. Generate ninja_json_converter trace (separately) + └─> python ninja_json_converter.py .ninja_log -o ninja_trace.json + +5. Validate consistency + └─> TraceValidator.compare_traces(analyzer_trace, ninja_trace) -> report + +6. Review discrepancies + └─> TraceValidator.generate_validation_report(report) +``` + +## Chrome Trace Event Format Specification + +### Event Structure + +Each trace event follows the Chrome Trace Event Format: + +```json +{ + "name": "target_name.o", + "cat": "compile", + "ph": "X", + "ts": 1234567890, + "dur": 5000000, + "pid": 1, + "tid": 3, + "args": { + "output": "target_name.o", + "duration_ms": 5000, + "cmd_hash": "abc123" + } +} +``` + +**Field Descriptions:** +- `name`: Target name (file being built) +- `cat`: Category (compile, link_shared, link_executable, archive, test, other) +- `ph`: Phase ("X" for complete events) +- `ts`: Timestamp in microseconds +- `dur`: Duration in microseconds +- `pid`: Process ID (1 for ninja builds) +- `tid`: Thread ID (worker ID) +- `args`: Additional metadata + +### Trace File Structure + +```json +{ + "traceEvents": [ + { /* event 1 */ }, + { /* event 2 */ }, + ... 
+ ], + "displayTimeUnit": "ms", + "otherData": { + "version": "1.0", + "generator": "trace_analysis", + "source": "analyze_build" + } +} +``` + +### Compatibility with ninja_json_converter + +The export format must be **semantically equivalent** to ninja_json_converter output for the same input data; only the following differences are acceptable: + +**Acceptable Differences:** +- `otherData.generator`: Different tool name +- Event ordering: May differ if timestamps are identical +- Floating point precision: ±0.001ms acceptable + +**Must Match (exactly, unless a tolerance is noted):** +- Total build time (within tolerance) +- Per-file durations (within tolerance) +- Worker assignments +- Event counts +- Category assignments + +## Validation Strategy + +### Validation Checks + +1. **Total Build Time** + - Sum of all event durations (cumulative across workers, not wall-clock time) + - Should match within ±1ms (rounding tolerance) + +2. **Event Count** + - Number of trace events + - Should match exactly + +3. **Per-File Duration** + - Duration for each compilation unit + - Should match within ±1ms per file + +4. **Worker Assignment** + - Thread ID (worker) for each event + - Should match exactly (deterministic algorithm) + +5. **Category Assignment** + - Event category based on file extension + - Should match exactly + +### Expected Discrepancies + +Some differences are expected and acceptable: + +1. **Timestamp Precision**: Microsecond rounding differences +2. **Event Ordering**: When timestamps are identical +3. **Metadata Fields**: Different tool names, versions +4. **Floating Point**: Minor precision differences (< 0.001ms) + +### Validation Workflow + +```python +# 1. Generate trace from analyze_build +from pathlib import Path +from trace_analysis import NinjaLogParser, ChromeTraceExporter +import json + +builds = NinjaLogParser.parse(Path('.ninja_log')) +builds_df = NinjaLogParser.to_dataframe(builds) +analyzer_trace = ChromeTraceExporter.export_ninja_timeline(builds_df) + +with open('analyzer_trace.json', 'w') as f: + json.dump(analyzer_trace, f) + +# 2. 
Generate trace from ninja_json_converter (shell) +# $ python script/ninja_json_converter.py .ninja_log -o ninja_trace.json + +# 3. Load both traces +with open('ninja_trace.json') as f: + ninja_trace = json.load(f) + +# 4. Validate +from trace_analysis import TraceValidator + +report = TraceValidator.compare_traces(analyzer_trace, ninja_trace) + +# 5. Review results +print(TraceValidator.generate_validation_report(report)) +``` + +### Validation Report Format + +``` +=== Trace Validation Report === + +Overall Status: PASS / FAIL + +Build Statistics: + Total Events: 1,234 (analyzer) vs 1,234 (ninja) ✓ + Total Time: 123.456s (analyzer) vs 123.457s (ninja) ✓ (diff: 0.001s) + +Worker Assignment: + Match Rate: 100% (1,234/1,234 events) ✓ + +Per-File Duration: + Files Checked: 1,234 + Exact Matches: 1,230 (99.7%) + Within Tolerance: 4 (0.3%) + Outside Tolerance: 0 (0.0%) ✓ + +Discrepancies: + file1.o: 1234ms (analyzer) vs 1235ms (ninja) - diff: 1ms (0.08%) + file2.o: 5678ms (analyzer) vs 5677ms (ninja) - diff: 1ms (0.02%) + +Conclusion: Tools are consistent within acceptable tolerance. 
+``` + +## Implementation Plan + +### Phase 1: Basic Export (Week 1) + +**Deliverables:** +- `trace_analysis/chrome_trace.py` with `export_ninja_timeline()` +- Unit tests for Chrome Trace format +- Integration test comparing with ninja_json_converter + +**Tasks:** +- [ ] Implement ChromeTraceExporter class +- [ ] Add event categorization logic +- [ ] Write unit tests for event generation +- [ ] Test with sample .ninja_log files +- [ ] Verify format matches ninja_json_converter exactly + +**Success Criteria:** +- Exports valid Chrome Trace JSON +- Loads correctly in Perfetto +- Matches ninja_json_converter output for same input + +### Phase 2: Validation Utilities (Week 1-2) + +**Deliverables:** +- `trace_analysis/validation.py` with comparison utilities +- Validation report generator +- Documentation of validation workflow + +**Tasks:** +- [ ] Implement TraceValidator class +- [ ] Add comparison algorithms +- [ ] Create validation report formatter +- [ ] Write tests for validation logic +- [ ] Document expected discrepancies + +**Success Criteria:** +- Accurately identifies discrepancies +- Generates clear validation reports +- Handles edge cases gracefully + +### Phase 3: Template Event Export (Week 2) + +**Deliverables:** +- Template event export in `chrome_trace.py` +- Merged trace generation +- Examples in notebook + +**Tasks:** +- [ ] Implement `export_template_events()` +- [ ] Add timing alignment logic +- [ ] Implement granularity filtering +- [ ] Add merge functionality +- [ ] Test with real -ftime-trace data + +**Success Criteria:** +- Template events align with ninja timeline +- Granularity filtering works correctly +- Merged traces load in Perfetto + +### Phase 4: Documentation & Examples (Week 2-3) + +**Deliverables:** +- Updated README with validation workflow +- Notebook section demonstrating export +- API documentation +- Validation guide + +**Tasks:** +- [ ] Add notebook section for Chrome Trace export +- [ ] Document validation workflow +- [ ] 
Create troubleshooting guide +- [ ] Add API documentation +- [ ] Write migration guide for ninja_json_converter users + +**Success Criteria:** +- Clear documentation of validation process +- Working examples in notebook +- Users can successfully validate traces + +## Testing Strategy + +### Unit Tests + +```python +# test_chrome_trace.py + +def test_export_ninja_timeline_format(): + """Verify Chrome Trace format is valid.""" + +def test_export_ninja_timeline_compatibility(): + """Verify compatibility with ninja_json_converter.""" + +def test_event_categorization(): + """Verify file extension -> category mapping.""" + +def test_worker_assignment(): + """Verify worker IDs match ninja_json_converter.""" +``` + +### Integration Tests + +```python +# test_validation.py + +def test_compare_identical_traces(): + """Validation passes for identical traces.""" + +def test_detect_discrepancies(): + """Validation detects timing differences.""" + +def test_tolerance_handling(): + """Small differences within tolerance pass.""" +``` + +### Validation Tests + +```python +# test_cross_validation.py + +def test_real_ninja_log(): + """Compare with actual ninja_json_converter output.""" + +def test_large_build(): + """Handle large builds (1000+ files).""" + +def test_incremental_build(): + """Handle incremental build scenarios.""" +``` + +## Usage Examples + +### Basic Export + +```python +from pathlib import Path +from trace_analysis import NinjaLogParser, ChromeTraceExporter +import json + +# Parse ninja log +builds = NinjaLogParser.parse(Path('build/.ninja_log')) +builds_df = NinjaLogParser.to_dataframe(builds) + +# Export to Chrome Trace +trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df) + +# Save for Perfetto +with open('build_trace.json', 'w') as f: + json.dump(trace_data, f) + +print("Open build_trace.json in chrome://tracing or https://ui.perfetto.dev") +``` + +### Cross-Validation + +```python +from trace_analysis import ChromeTraceExporter, TraceValidator 
+import json +import subprocess + +# Generate trace from analyze_build +analyzer_trace = ChromeTraceExporter.export_ninja_timeline(builds_df) + +# Generate trace from ninja_json_converter (check=True: stop if the converter fails) +subprocess.run([ + 'python', 'script/ninja_json_converter.py', + 'build/.ninja_log', + '-o', 'ninja_trace.json' +], check=True) + +# Load ninja_json_converter output +with open('ninja_trace.json') as f: + ninja_trace = json.load(f) + +# Validate +report = TraceValidator.compare_traces(analyzer_trace, ninja_trace) + +# Print report +print(TraceValidator.generate_validation_report(report)) + +# Check if validation passed +if report['total_time_match'] and report['event_count_match']: + print("✓ Validation PASSED - Tools are consistent") +else: + print("✗ Validation FAILED - Discrepancies found") + for disc in report['file_discrepancies']: + print(f" {disc['file']}: {disc['diff_ms']}ms difference") +``` + +### Template Event Export + +```python +import json +from pathlib import Path + +import pandas as pd + +from trace_analysis import ( + TraceParser, TraceTransformer, + ChromeTraceExporter, find_trace_files +) + +# Prerequisites: builds_df and ninja_trace come from the examples above; +# templates_df (template definitions) must be built separately. + +# Parse -ftime-trace files +trace_files = find_trace_files(Path('build')) +all_instantiations = [] + +for file_id, trace_file in enumerate(trace_files): + events = TraceParser.parse(trace_file) + schema = TraceTransformer.to_enhanced_schema(events, file_id=file_id) + all_instantiations.append(schema['instantiations']) + +instantiations_df = pd.concat(all_instantiations, ignore_index=True) + +# Export template events +template_trace = ChromeTraceExporter.export_template_events( + instantiations_df, + templates_df, + builds_df, + granularity_us=50000 # Only events > 50ms +) + +# Merge with ninja timeline +merged_trace = ChromeTraceExporter.merge_traces( + ninja_trace, + template_trace +) + +# Save merged trace +with open('merged_trace.json', 'w') as f: + json.dump(merged_trace, f) +``` + +### Notebook Integration + +```python +# In comprehensive_example.ipynb + +## Chrome Trace Export for Validation + +# Export ninja timeline +from trace_analysis import 
ChromeTraceExporter +import json + +trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df) + +# Save trace +with open('../data/analyzer_trace.json', 'w') as f: + json.dump(trace_data, f, indent=2) + +print(f"Exported {len(trace_data['traceEvents'])} events") +print(f"Cumulative task time (sum of event durations): {sum(e['dur'] for e in trace_data['traceEvents']) / 1e6:.2f}s") + +# Validate against ninja_json_converter +# (Assuming ninja_trace.json was generated separately) +with open('../data/ninja_trace.json') as f: + ninja_trace = json.load(f) + +from trace_analysis import TraceValidator + +report = TraceValidator.compare_traces(trace_data, ninja_trace) +print(TraceValidator.generate_validation_report(report)) +``` + +## Open Questions + +### Critical Questions + +1. **Data Consistency** + - Q: Do you currently see discrepancies between the tools? + - Q: What tolerance is acceptable? (±1ms suggested) + - Q: Are there known sources of differences? + +2. **Validation Workflow** + - Q: How often do you need to cross-validate? + - Q: Should this be automated in CI? + - Q: What triggers a validation run? + +3. **Template Event Export** + - Q: Should template events be in same file as ninja events? + - Q: Or separate files for different analysis? + - Q: Priority: High, Medium, or Low? + +### Technical Questions + +4. **Output Format** + - Q: Must we match ninja_json_converter format exactly? + - Q: Or can we use enhanced format with metadata? + - Q: Is backward compatibility required? + +5. **Performance** + - Q: What's the largest build to support? + - Q: Number of targets? (hundreds, thousands, tens of thousands?) + - Q: Should we implement sampling for huge builds? 
+ +## Success Metrics + +### Functional Metrics + +- ✅ Exports valid Chrome Trace JSON +- ✅ Loads correctly in Perfetto +- ✅ Matches ninja_json_converter output (within tolerance) +- ✅ Validation detects discrepancies accurately +- ✅ Clear validation reports + +### Quality Metrics + +- ✅ 100% unit test coverage for new modules +- ✅ Integration tests with real data pass +- ✅ Documentation complete and clear +- ✅ Examples work in notebook + +### Performance Metrics + +- ✅ Export completes in < 1s for 1000 files +- ✅ Validation completes in < 5s for 1000 files +- ✅ Memory usage < 100MB for typical builds + +## Future Enhancements + +### Potential Follow-Up Features (Beyond the Phase 1-4 Implementation Plan) + +1. **Automated Validation in CI** + - Run validation on every build + - Fail CI if discrepancies exceed threshold + - Track validation metrics over time + +2. **Differential Analysis** + - Compare traces from different builds + - Identify performance regressions + - Track optimization progress + +3. **Enhanced Visualization** + - Plotly timeline charts in notebooks + - Interactive exploration of discrepancies + - Side-by-side comparison views + +4. 
**Template Optimization Recommendations** + - Correlate slow files with expensive templates + - Suggest optimization targets + - Estimate potential improvements + +## References + +- [Chrome Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview) +- [Perfetto UI](https://ui.perfetto.dev) +- [Clang -ftime-trace Documentation](https://releases.llvm.org/11.0.0/tools/clang/docs/ClangCommandLineReference.html#cmdoption-clang-ftime-trace) +- [Ninja Build System](https://ninja-build.org/) + +## Appendix A: Chrome Trace Event Format Details + +### Complete Event Structure + +```json +{ + "name": "event_name", + "cat": "category", + "ph": "X", + "ts": 1234567890, + "dur": 5000000, + "pid": 1, + "tid": 3, + "args": { + "custom_field": "value" + } +} +``` + +### Phase Types + +- `X`: Complete event (has duration) +- `B`: Begin event +- `E`: End event +- `i`: Instant event +- `M`: Metadata event + +For build traces, we use `X` (complete events) exclusively. 
+ +### Category Conventions + +Standard categories for build events: + +- `compile`: Compilation of source files (.o, .obj) +- `link_shared`: Shared library linking (.so, .dll, .dylib) +- `link_executable`: Executable linking (.exe, .out) +- `archive`: Static library creation (.a, .lib) +- `test`: Test execution +- `other`: Other build steps + +## Appendix B: Validation Algorithm + +### Comparison Algorithm + +```python +def compare_events(event1, event2, tolerance_ms=1.0): + """Compare two trace events for equivalence.""" + + # Must match exactly + if event1['name'] != event2['name']: + return False, "Name mismatch" + if event1['tid'] != event2['tid']: + return False, "Worker ID mismatch" + if event1['cat'] != event2['cat']: + return False, "Category mismatch" + + # Must match within tolerance + dur1_ms = event1['dur'] / 1000 + dur2_ms = event2['dur'] / 1000 + diff_ms = abs(dur1_ms - dur2_ms) + + if diff_ms > tolerance_ms: + return False, f"Duration mismatch: {diff_ms}ms" + + return True, "Match" +``` + +### Discrepancy Categorization + +**Critical**: Must be fixed +- Total time difference > 1% +- Event count mismatch +- Worker assignment errors + +**Warning**: Should investigate +- Per-file duration > 1ms difference +- Category mismatches +- Timestamp ordering issues + +**Info**: Acceptable +- Floating point precision differences +- Metadata differences +- Event ordering when timestamps identical + +## Appendix C: Migration Guide + +### For ninja_json_converter Users + +If you currently use `ninja_json_converter.py`, you can continue to do so. The new Chrome Trace export in `analyze_build` is complementary, not a replacement. 
+ +**When to use ninja_json_converter:** +- Quick build timeline visualization +- Build system optimization +- CI/CD monitoring +- Multi-build comparison + +**When to use analyze_build Chrome Trace export:** +- Cross-validation with template analysis +- Verifying data consistency +- Debugging discrepancies +- Correlating build and template metrics + +**Using both together:** +```bash +# Generate trace from ninja_json_converter +python script/ninja_json_converter.py build/.ninja_log -o ninja_trace.json + +# Generate trace from analyze_build +python -c " +from pathlib import Path +from trace_analysis import NinjaLogParser, ChromeTraceExporter +import json + +builds = NinjaLogParser.parse(Path('build/.ninja_log')) +builds_df = NinjaLogParser.to_dataframe(builds) +trace = ChromeTraceExporter.export_ninja_timeline(builds_df) + +with open('analyzer_trace.json', 'w') as f: + json.dump(trace, f) +" + +# Compare +python -c " +from trace_analysis import TraceValidator +import json + +with open('ninja_trace.json') as f: + ninja = json.load(f) +with open('analyzer_trace.json') as f: + analyzer = json.load(f) + +report = TraceValidator.compare_traces(analyzer, ninja) +print(TraceValidator.generate_validation_report(report)) +" diff --git a/script/analyze_build/docs/PERFETTO_VISUALIZATION.md b/script/analyze_build/docs/PERFETTO_VISUALIZATION.md new file mode 100644 index 0000000000..fb5eacd7e2 --- /dev/null +++ b/script/analyze_build/docs/PERFETTO_VISUALIZATION.md @@ -0,0 +1,231 @@ +# Perfetto Visualization Guide + +This guide shows how to visualize ninja build timelines in Perfetto UI using the `trace_analysis` library. + +## Quick Start + +### Command Line Usage + +```bash +# Run the example script +python examples/perfetto_visualization_example.py path/to/.ninja_log + +# This will: +# 1. Parse the ninja log +# 2. Assign workers for parallelism visualization +# 3. Export to Chrome Trace format +# 4. 
Save to build_trace.json +``` + +### Jupyter Notebook Usage + +```python +from pathlib import Path +from trace_analysis import NinjaLogParser, ChromeTraceExporter +from trace_analysis.perfetto_display import display_perfetto, print_trace_summary + +# Parse ninja log +builds = NinjaLogParser.parse(Path('build/.ninja_log')) +builds_df = NinjaLogParser.to_dataframe(builds) +builds_df = NinjaLogParser.assign_workers(builds_df) + +# Export to Chrome Trace format +trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df) + +# Print summary +print_trace_summary(trace_data) + +# Display in Perfetto UI (embedded in notebook) +display_perfetto(trace_data) + +# Or save to file for large traces +from trace_analysis.perfetto_display import save_and_link +save_and_link(trace_data, '../data/build_trace.json') +``` + +## What You Get + +The Chrome Trace export provides: + +- **Build Timeline**: Visual representation of when each target was built +- **Parallelism Analysis**: See how many workers were active at any time +- **Category Breakdown**: Targets categorized by type (compile, link, archive, etc.) +- **Duration Analysis**: Identify slow compilation units +- **Critical Path**: Understand build dependencies and bottlenecks + +## Viewing in Perfetto UI + +### Option 1: Embedded in Jupyter (Small Traces) + +For traces < 10MB, use `display_perfetto()` to embed directly in the notebook: + +```python +display_perfetto(trace_data, height=600) +``` + +### Option 2: Manual Upload (Large Traces) + +For larger traces, save to file and upload manually: + +```python +ChromeTraceExporter.export_to_file(trace_data, 'build_trace.json') +``` + +Then: +1. Go to https://ui.perfetto.dev +2. Click "Open trace file" +3. Select your `build_trace.json` + +Or drag and drop the file directly into Perfetto UI. 
+ +## DataFrame Schema + +The `builds_df` DataFrame has the following columns: + +| Column | Type | Description | +|--------|------|-------------| +| `target` | str | Build target name (e.g., "obj/foo.o") | +| `start_ms` | int64 | Start time in milliseconds since epoch | +| `end_ms` | int64 | End time in milliseconds since epoch | +| `duration_ms` | int32 | Build duration in milliseconds | +| `cmd_hash` | str | Command hash from ninja | +| `worker_id` | int16 | Assigned worker ID (0-based) | + +### Adding Category Column + +The Chrome Trace exporter automatically categorizes targets based on file extension: + +- `.o`, `.obj` → `compile` +- `.a`, `.lib` → `archive` +- `.so`, `.dll`, `.dylib` → `link_shared` +- `.exe`, `.out` → `link_executable` +- Contains "test" → `test` +- Everything else → `other` + +## Chrome Trace Event Format + +Each build target is exported as a Chrome Trace event: + +```json +{ + "name": "obj/foo.o", + "cat": "compile", + "ph": "X", + "ts": 1234567890000, + "dur": 5000000, + "pid": 1, + "tid": 3, + "args": { + "output": "obj/foo.o", + "duration_ms": 5000, + "cmd_hash": "abc123" + } +} +``` + +## Comparison with ninja_json_converter.py + +The `trace_analysis` library provides similar functionality to `ninja_json_converter.py` but with additional features: + +### Similarities +- Both parse `.ninja_log` files +- Both export to Chrome Trace Event Format +- Both can be viewed in Perfetto UI + +### Differences + +| Feature | ninja_json_converter.py | trace_analysis | +|---------|------------------------|----------------| +| **Primary Use** | Quick build visualization | Integrated analysis workflow | +| **Output** | Chrome Trace JSON only | DataFrames + Chrome Trace | +| **Analysis** | External (Perfetto UI) | In-notebook with pandas | +| **Template Data** | No | Yes (with -ftime-trace) | +| **Worker Assignment** | Built-in algorithm | Same algorithm, exposed as DataFrame | +| **Customization** | Command-line flags | Programmatic API | + +### When 
to Use Each + +**Use `ninja_json_converter.py` when:** +- You just want a quick visualization +- You're working from the command line +- You don't need further analysis + +**Use `trace_analysis` when:** +- You want to analyze build data with pandas +- You're working in Jupyter notebooks +- You want to correlate build times with template analysis +- You need programmatic access to build data + +## Examples + +### Example 1: Find Slowest Builds + +```python +# Get top 10 slowest builds +slowest = builds_df.nlargest(10, 'duration_ms') +print(slowest[['target', 'duration_ms', 'worker_id']]) +``` + +### Example 2: Analyze Worker Utilization + +```python +worker_stats = NinjaLogParser.compute_worker_stats(builds_df) +print(worker_stats) +``` + +### Example 3: Category Breakdown + +```python +from trace_analysis.perfetto_display import get_trace_summary + +summary = get_trace_summary(trace_data) +print(f"Total events: {summary['event_count']}") +print(f"Total duration: {summary['total_duration_s']:.2f}s") +print(f"Workers: {summary['worker_count']}") +print("\nBy category:") +for cat, count in summary['categories'].items(): + print(f" {cat}: {count} events") +``` + +### Example 4: Export with Custom Process ID + +```python +# Useful when combining multiple build logs +trace_data = ChromeTraceExporter.export_ninja_timeline( + builds_df, + process_id=2, # Use different PID for each log + include_metadata=True +) +``` + +## Troubleshooting + +### Issue: Trace file too large for embedded display + +**Solution**: Use `save_and_link()` instead of `display_perfetto()`: + +```python +save_and_link(trace_data, 'build_trace.json') +``` + +### Issue: Worker IDs all show as -1 + +**Solution**: Make sure to call `assign_workers()`: + +```python +builds_df = NinjaLogParser.assign_workers(builds_df) +``` + +### Issue: Import error for perfetto_display + +**Solution**: The perfetto display functions are in a separate module: + +```python +from trace_analysis.perfetto_display import 
def main():
    """Parse a ninja log, export it as a Chrome Trace, and save it to disk.

    The log path is read from ``sys.argv[1]``. The parsed builds are turned
    into a DataFrame, worker IDs are assigned, the timeline is exported to
    Chrome Trace format, a summary is printed, and the trace is written to
    ``build_trace.json`` in the same directory as the input log.

    Raises:
        SystemExit: With status 1 (and a message on stderr) when the path
            argument is missing, the path does not exist, or the path is
            not a regular file.
    """
    # Diagnostics go to stderr so that stdout carries only the report.
    if len(sys.argv) < 2:
        print(
            "Usage: python perfetto_visualization_example.py path/to/.ninja_log",
            file=sys.stderr,
        )
        sys.exit(1)

    ninja_log_path = Path(sys.argv[1])

    if not ninja_log_path.exists():
        print(f"Error: {ninja_log_path} not found", file=sys.stderr)
        sys.exit(1)

    # A directory passes exists(); reject it here rather than letting the
    # parser fail with an unrelated error further down.
    if not ninja_log_path.is_file():
        print(f"Error: {ninja_log_path} is not a file", file=sys.stderr)
        sys.exit(1)

    print(f"Parsing {ninja_log_path}...")

    # Step 1: Parse ninja log
    builds = NinjaLogParser.parse(ninja_log_path)
    builds_df = NinjaLogParser.to_dataframe(builds)

    print(f"Found {len(builds_df):,} build targets")

    # Step 2: Assign workers (for parallelism visualization)
    builds_df = NinjaLogParser.assign_workers(builds_df)

    # max() of an empty column is NaN, which would print "Assigned nan
    # workers"; report zero workers for an empty log instead.
    worker_count = builds_df["worker_id"].max() + 1 if len(builds_df) else 0
    print(f"Assigned {worker_count} workers")

    # Step 3: Export to Chrome Trace format
    trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df)

    print(f"\nGenerated {len(trace_data['traceEvents']):,} trace events")

    # Step 4: Print summary
    print_trace_summary(trace_data)

    # Step 5: Save to file
    output_path = ninja_log_path.parent / "build_trace.json"
    ChromeTraceExporter.export_to_file(trace_data, str(output_path))

    print(f"\n✓ Trace saved to: {output_path}")
    print("\nTo view in Perfetto UI:")
    print("  1. Go to https://ui.perfetto.dev")
    print("  2. Click 'Open trace file'")
    print(f"  3. Select: {output_path}")
    print("\nOr drag and drop the file directly into Perfetto UI")

    # For Jupyter notebook usage, you would use:
    #   display_perfetto(trace_data)
    # or for large traces:
    #   save_and_link(trace_data, str(output_path))
template_args : 51,474 rows, 8.73 MB | parent_template_id, arg_position, arg_template_id, arg_type, arg_text\n", - " events : 15,110 rows, 0.53 MB | name, dur, ts, pid, tid, ph, ts_absolute_us\n" + " templates : 8,156 rows, 7.56 MB | template_id, template_name, full_signature, depth, arg_count\n", + " instantiations : 9,267 rows, 0.29 MB | instantiation_id, template_id, file_id, dur_us, ts_us, event_type\n", + " template_args : 49,097 rows, 8.23 MB | parent_template_id, arg_position, arg_template_id, arg_type, arg_text\n", + " events : 14,271 rows, 0.50 MB | name, dur, ts, pid, tid, ph, ts_absolute_us\n" ] } ], @@ -217,10 +217,10 @@ "text": [ "Compilation Unit Summary:\n", " Trace file: device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp.json\n", - " Trace file size: 11652.5 KB\n", - " Start time: 2026-01-03 22:51:51.980489\n", - " Total compilation time: 178.01s\n", - " Total events: 15,110\n" + " Trace file size: 10967.8 KB\n", + " Start time: 2026-01-04 16:14:25.048921\n", + " Total compilation time: 162.17s\n", + " Total events: 14,271\n" ] } ], @@ -257,16 +257,16 @@ "-------------------------------------------------------------------\n", "template_id int32 0.03 0.4%\n", "template_name category 0.03 0.4%\n", - "full_signature object 8.02 99.0%\n", + "full_signature object 7.49 99.0%\n", "depth int8 0.01 0.1%\n", "arg_count int8 0.01 0.1%\n", "Index RangeIndex 0.00 0.0%\n", "-------------------------------------------------------------------\n", - "TOTAL 8.10 100.0%\n", + "TOTAL 7.56 100.0%\n", "\n", - "Total templates: 8,703\n", - "CK templates: 6,331 (72.7%)\n", - "Other templates: 2,372\n", + "Total templates: 8,156\n", + "CK templates: 6,063 (74.3%)\n", + "Other templates: 2,093\n", "\n", "Sample CK templates:\n" ] @@ -301,26 +301,18 @@ " \n", " \n", " \n", - " 45\n", - " 45\n", - " ck::Tuple\n", - " ck::Tuple<ck::integral_constant<int, 2>, _BitInt(4)>\n", - " 2\n", - " 2\n", - " \n", - " \n", " 46\n", " 46\n", - " ck::Tuple\n", - " 
ck::Tuple<ck::integral_constant<int, 16>, _BitInt(6)>\n", - " 2\n", + " ck::vector_type\n", + " ck::vector_type<float, 2>::(unnamed union at /home/AMD/jshumway/composable_k...\n", + " 1\n", " 2\n", " \n", " \n", " 47\n", " 47\n", - " ck::Tuple\n", - " ck::Tuple<float, float>\n", + " ck::vector_type\n", + " ck::vector_type<float, 2>\n", " 1\n", " 2\n", " \n", @@ -328,7 +320,7 @@ " 48\n", " 48\n", " ck::vector_type\n", - " ck::vector_type<float, 2>::(unnamed union at /home/AMD/jshumway/composable_k...\n", + " ck::vector_type<float, 4>::(unnamed union at /home/AMD/jshumway/composable_k...\n", " 1\n", " 2\n", " \n", @@ -336,90 +328,98 @@ " 49\n", " 49\n", " ck::vector_type\n", - " ck::vector_type<float, 2>\n", + " ck::vector_type<float, 4>\n", " 1\n", " 2\n", " \n", " \n", " 50\n", " 50\n", - " ck::Tuple\n", - " ck::Tuple<float, float, float, float>\n", - " 1\n", - " 4\n", - " \n", - " \n", - " 51\n", - " 51\n", - " ck::vector_type\n", - " ck::vector_type<float, 4>::(unnamed union at /home/AMD/jshumway/composable_k...\n", - " 1\n", - " 2\n", - " \n", - " \n", - " 52\n", - " 52\n", - " ck::vector_type\n", - " ck::vector_type<float, 4>\n", - " 1\n", - " 2\n", - " \n", - " \n", - " 53\n", - " 53\n", - " ck::Tuple\n", - " ck::Tuple<float, float, float, float, float, float, float, float>\n", - " 1\n", - " 8\n", - " \n", - " \n", - " 54\n", - " 54\n", " ck::detail::TupleElementKeyData\n", " ck::detail::TupleElementKeyData<ck::detail::TupleElementKey<3>, float __attr...\n", " 2\n", " 2\n", " \n", + " \n", + " 51\n", + " 51\n", + " ck::detail::TupleImpl\n", + " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3>, float __attribute__((ext_vec...\n", + " 2\n", + " 5\n", + " \n", + " \n", + " 52\n", + " 52\n", + " ck::Tuple\n", + " ck::Tuple<float __attribute__((ext_vector_type(2))), float __attribute__((ex...\n", + " 1\n", + " 4\n", + " \n", + " \n", + " 53\n", + " 53\n", + " ck::vector_type\n", + " ck::vector_type<float, 8>::(unnamed union at /home/AMD/jshumway/composable_k...\n", + 
" 1\n", + " 2\n", + " \n", + " \n", + " 54\n", + " 54\n", + " ck::vector_type\n", + " ck::vector_type<float, 8>\n", + " 1\n", + " 2\n", + " \n", + " \n", + " 55\n", + " 55\n", + " ck::Tuple\n", + " ck::Tuple<float, float, float, float, float, float, float, float, float, flo...\n", + " 1\n", + " 16\n", + " \n", " \n", "\n", "" ], "text/plain": [ " template_id template_name \\\n", - "45 45 ck::Tuple \n", - "46 46 ck::Tuple \n", - "47 47 ck::Tuple \n", + "46 46 ck::vector_type \n", + "47 47 ck::vector_type \n", "48 48 ck::vector_type \n", "49 49 ck::vector_type \n", - "50 50 ck::Tuple \n", - "51 51 ck::vector_type \n", - "52 52 ck::vector_type \n", - "53 53 ck::Tuple \n", - "54 54 ck::detail::TupleElementKeyData \n", + "50 50 ck::detail::TupleElementKeyData \n", + "51 51 ck::detail::TupleImpl \n", + "52 52 ck::Tuple \n", + "53 53 ck::vector_type \n", + "54 54 ck::vector_type \n", + "55 55 ck::Tuple \n", "\n", " full_signature \\\n", - "45 ck::Tuple, _BitInt(4)> \n", - "46 ck::Tuple, _BitInt(6)> \n", - "47 ck::Tuple \n", - "48 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n", - "49 ck::vector_type \n", - "50 ck::Tuple \n", - "51 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n", - "52 ck::vector_type \n", - "53 ck::Tuple \n", - "54 ck::detail::TupleElementKeyData, float __attr... \n", + "46 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n", + "47 ck::vector_type \n", + "48 ck::vector_type::(unnamed union at /home/AMD/jshumway/composable_k... \n", + "49 ck::vector_type \n", + "50 ck::detail::TupleElementKeyData, float __attr... \n", + "51 ck::detail::TupleImpl, float __attribute__((ext_vec... \n", + "52 ck::Tuple::(unnamed union at /home/AMD/jshumway/composable_k... 
\n", + "54 ck::vector_type \n", + "55 ck::Tuple\n", " \n", " \n", - " 670\n", + " 620\n", " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...\n", " 12\n", - " 444.005\n", - " 37.000417\n", + " 403.014\n", + " 33.584500\n", " 1\n", " \n", " \n", - " 669\n", + " 619\n", " ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu...\n", " 12\n", - " 442.665\n", - " 36.888750\n", + " 401.766\n", + " 33.480500\n", " 1\n", " \n", " \n", - " 1411\n", + " 1305\n", " ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10>::operator()<(lamb...\n", " 34\n", - " 365.628\n", - " 10.753765\n", + " 330.576\n", + " 9.722824\n", " 1\n", " \n", " \n", - " 6424\n", + " 6125\n", " ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12>::operator...\n", " 2\n", - " 346.742\n", - " 173.371000\n", + " 318.451\n", + " 159.225500\n", " 1\n", " \n", " \n", - " 4079\n", + " 3881\n", " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...\n", " 4\n", - " 330.928\n", - " 82.732000\n", + " 303.042\n", + " 75.760500\n", " 1\n", " \n", " \n", - " 4078\n", + " 3880\n", " ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu...\n", " 4\n", - " 330.380\n", - " 82.595000\n", + " 302.517\n", + " 75.629250\n", " 1\n", " \n", " \n", - " 4859\n", + " 4631\n", " ck::transform_tensor_descriptor<ck::TensorDescriptor<ck::Tuple<ck::UnMerge<c...\n", " 1\n", - " 327.756\n", - " 327.756000\n", + " 301.674\n", + " 301.674000\n", " 6\n", " \n", " \n", - " 1470\n", - " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...\n", - " 6\n", - " 311.490\n", - " 51.915000\n", - " 1\n", - " \n", - " \n", - " 1469\n", - " ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu...\n", - " 6\n", - " 310.729\n", - " 51.788167\n", - " 1\n", - " \n", - " \n", - " 2362\n", + " 2217\n", " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...\n", " 
5\n", - " 295.890\n", - " 59.178000\n", + " 290.939\n", + " 58.187800\n", + " 1\n", + " \n", + " \n", + " 2216\n", + " ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu...\n", + " 5\n", + " 290.336\n", + " 58.067200\n", + " 1\n", + " \n", + " \n", + " 1364\n", + " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...\n", + " 6\n", + " 284.086\n", + " 47.347667\n", " 1\n", " \n", " \n", @@ -609,28 +609,28 @@ ], "text/plain": [ " signature \\\n", - "670 ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... \n", - "669 ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... \n", - "1411 ck::detail::applier::operator()<(lamb... \n", - "6424 ck::detail::applier::operator... \n", - "4079 ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... \n", - "4078 ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... \n", - "4859 ck::transform_tensor_descriptor::operator()<(lamb... \n", + "6125 ck::detail::applier::operator... \n", + "3881 ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c... \n", + "3880 ck::generate_tuple_for<(lambda at /home/AMD/jshumway/composable_kernel/inclu... 
\n", + "4631 ck::transform_tensor_descriptor\n", " 0\n", " 1\n", - " 1628\n", - " 8510.435\n", - " 5.227540\n", + " 1564\n", + " 7735.584\n", + " 4.946026\n", " \n", " \n", " 1\n", " 2\n", - " 1962\n", - " 5433.074\n", - " 2.769151\n", + " 1873\n", + " 4943.003\n", + " 2.639083\n", " \n", " \n", " 2\n", " 3\n", - " 1311\n", - " 5042.420\n", - " 3.846240\n", + " 1256\n", + " 4686.911\n", + " 3.731617\n", " \n", " \n", " 3\n", " 4\n", - " 2548\n", - " 7730.989\n", - " 3.034140\n", + " 2336\n", + " 6986.861\n", + " 2.990951\n", " \n", " \n", " 4\n", " 5\n", - " 2112\n", - " 10514.270\n", - " 4.978348\n", + " 1969\n", + " 9640.982\n", + " 4.896385\n", " \n", " \n", " 5\n", " 6\n", - " 257\n", - " 2398.921\n", - " 9.334323\n", + " 252\n", + " 2251.077\n", + " 8.932845\n", " \n", " \n", " 6\n", " 7\n", - " 20\n", - " 21.944\n", - " 1.097200\n", + " 17\n", + " 18.260\n", + " 1.074118\n", " \n", " \n", "\n", "" ], "text/plain": [ - " depth count total_ms mean_ms\n", - "0 1 1628 8510.435 5.227540\n", - "1 2 1962 5433.074 2.769151\n", - "2 3 1311 5042.420 3.846240\n", - "3 4 2548 7730.989 3.034140\n", - "4 5 2112 10514.270 4.978348\n", - "5 6 257 2398.921 9.334323\n", - "6 7 20 21.944 1.097200" + " depth count total_ms mean_ms\n", + "0 1 1564 7735.584 4.946026\n", + "1 2 1873 4943.003 2.639083\n", + "2 3 1256 4686.911 3.731617\n", + "3 4 2336 6986.861 2.990951\n", + "4 5 1969 9640.982 4.896385\n", + "5 6 252 2251.077 8.932845\n", + "6 7 17 18.260 1.074118" ] }, "metadata": {}, @@ -882,8 +882,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 1,253 trace files\n", - "Total size: 18.79 GB\n" + "Found 1,279 trace files\n", + "Total size: 17.20 GB\n" ] } ], @@ -919,7 +919,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing 1,253 files with 384 workers...\n", + "Processing 1,279 files with 384 workers...\n", "\n" ] }, @@ -929,7 +929,7 @@ "text": [ "/home/AMD/jshumway/composable_kernel/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: 
TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", - "Processing: 100%|██████████| 1253/1253 [00:46<00:00, 27.13files/s] \n" + "Processing: 100%|██████████| 1279/1279 [00:45<00:00, 28.08files/s] \n" ] }, { @@ -937,19 +937,19 @@ "output_type": "stream", "text": [ "\n", - "Parsing complete in 48.54s (25.8 files/sec)\n", + "Parsing complete in 54.47s (23.5 files/sec)\n", "\n", "Combining results...\n", - "Combined in 4.73s\n", + "Combined in 3.85s\n", "\n", - "Total analysis time: 53.27s\n", + "Total analysis time: 58.32s\n", "\n", "Combined Tables:\n", - " Templates: 11,751,350 rows\n", - " Instantiations: 17,437,000 rows\n", - " Template Args: 155,281,743 rows\n", - " Events: 24,229,252 rows\n", - " Total memory: 34.06 GB\n" + " Templates: 10,655,045 rows\n", + " Instantiations: 15,806,599 rows\n", + " Template Args: 144,341,805 rows\n", + " Events: 22,196,001 rows\n", + " Total memory: 31.69 GB\n" ] } ], @@ -1067,12 +1067,12 @@ "================================================================================\n", "BUILD-WIDE STATISTICS\n", "================================================================================\n", - "Files processed: 1,253\n", - "Total events: 24,229,252\n", - "Total build time: 5915.00 minutes\n", - "Unique templates: 11,751,350\n", - "Template instantiations: 17,437,000\n", - "Template time: 2495.47 minutes (42.2%)\n", + "Files processed: 1,279\n", + "Total events: 22,196,001\n", + "Total build time: 6935.67 minutes\n", + "Unique templates: 10,655,045\n", + "Template instantiations: 15,806,599\n", + "Template time: 2007.82 minutes (28.9%)\n", "================================================================================\n" ] } @@ -1117,7 +1117,7 @@ "output_type": "stream", "text": [ "Aggregating template statistics...\n", - "Completed in 3.42s\n", + "Completed in 3.10s\n", "\n", "Top 20 
Templates by Total Time:\n" ] @@ -1154,275 +1154,253 @@ " \n", " \n", " \n", - " 11751348\n", - " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...\n", - " 2\n", - " 135.700625\n", - " 67850.312500\n", - " 67850.3125\n", - " 2\n", - " 0.090631\n", - " \n", - " \n", - " 9135552\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 6\n", - " 0.088275\n", - " \n", - " \n", - " 9135553\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 7\n", - " 0.088275\n", - " \n", - " \n", - " 9135554\n", - " std::_TupleConstraints<true, ck::tensor_operation::device::DeviceConvNdBwdDa...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 4\n", - " 0.088275\n", - " \n", - " \n", - " 9135555\n", - " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 3\n", - " 0.088275\n", - " \n", - " \n", - " 9135556\n", - " ck::detail::static_ford_impl<ck::Sequence<16>, ck::Sequence<0, 1>>::operator...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 2\n", - " 0.088275\n", - " \n", - " \n", - " 9135557\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<1, 16>, ck::tensor...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 6\n", - " 0.088275\n", - " \n", - " \n", - " 9135558\n", - " std::get<1UL, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 4\n", - " 0.088275\n", - " \n", - " \n", - " 9135559\n", - " std::_Tuple_impl<0, ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Inp...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 4\n", - " 0.088275\n", - " \n", 
- " \n", - " 9135560\n", + " 7340718\n", " std::__uniq_ptr_impl<ck::tensor_operation::device::DeviceConv2dBwdDataXdl_In...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 4\n", - " 0.088275\n", + " 0.08737\n", " \n", " \n", - " 9135561\n", - " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 7340719\n", + " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 6\n", - " 0.088275\n", + " 0.08737\n", " \n", " \n", - " 9135562\n", - " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 6\n", - " 0.088275\n", - " \n", - " \n", - " 9135563\n", - " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 7340720\n", + " std::_TupleConstraints<true, ck::tensor_operation::device::DeviceConv2dBwdDa...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 4\n", - " 0.088275\n", + " 0.08737\n", " \n", " \n", - " 9135564\n", - " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 5\n", - " 0.088275\n", + " 7340721\n", + " ck::kernel_batched_elementwise<ck::GridwiseElementwise<ck::Tuple<ck::TensorD...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 8\n", + " 0.08737\n", " \n", " \n", - " 9135565\n", + " 7340722\n", " std::tuple<ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Input_N_Hi_W...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 4\n", - " 0.088275\n", + " 
0.08737\n", " \n", " \n", - " 9135566\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 4>, ck::tensor...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 7\n", - " 0.088275\n", + " 7340723\n", + " std::__uniq_ptr_data<ck::tensor_operation::device::DeviceConv2dBwdDataXdl_In...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 4\n", + " 0.08737\n", " \n", " \n", - " 9135567\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 4>, ck::tensor_...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 7\n", - " 0.088275\n", + " 7340724\n", + " std::__and_<std::__and_<std::is_constructible<ck::tensor_operation::device::...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 6\n", + " 0.08737\n", " \n", " \n", - " 9135568\n", - " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 64, 1, t...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 7340725\n", + " ck::GridwiseGemm_xdl_cshuffle_conv_v3<ck::tensor_layout::gemm::RowMajor, ck:...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 6\n", + " 0.08737\n", + " \n", + " \n", + " 7340726\n", + " ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 1\n", - " 0.088275\n", + " 0.08737\n", " \n", " \n", - " 9135569\n", - " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 7340727\n", + " ck::make_tensor_coordinate(const ck::TensorDescriptor<ck::Tuple<ck::Embed<ck...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 5\n", - " 0.088275\n", + " 0.08737\n", " \n", " \n", - " 9135570\n", - " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", 
- " 3\n", - " 0.088275\n", + " 7340728\n", + " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 0.08737\n", + " \n", + " \n", + " 7340729\n", + " ck::TensorDescriptor<ck::Tuple<ck::UnMerge<ck::Tuple<ck::integral_constant<i...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 0.08737\n", + " \n", + " \n", + " 7340730\n", + " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 0.08737\n", + " \n", + " \n", + " 7340731\n", + " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, _Float16, 8,...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 6\n", + " 0.08737\n", + " \n", + " \n", + " 7340732\n", + " ck::make_tensor_coordinate_step(const ck::TensorDescriptor<ck::Tuple<ck::Emb...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 0.08737\n", + " \n", + " \n", + " 7340733\n", + " ck::make_tensor_coordinate<ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tupl...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 6\n", + " 0.08737\n", + " \n", + " \n", + " 7340734\n", + " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 0.08737\n", + " \n", + " \n", + " 7340735\n", + " std::__and_<std::is_assignable<ck::tensor_operation::device::DeviceGroupedCo...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 4\n", + " 0.08737\n", + " \n", + " \n", + " 7340736\n", + " ck::TensorDescriptor<ck::Tuple<ck::UnMerge<ck::Tuple<ck::integral_constant<i...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 0.08737\n", + " \n", + " \n", + " 7340737\n", + " 
std::__format::_Formatting_scanner<std::__format::_Sink_iter<char>, char>::_...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 2\n", + " 0.08737\n", " \n", " \n", "\n", "" ], "text/plain": [ - " full_signature \\\n", - "11751348 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n", - "9135552 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n", - "9135553 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", - "9135554 std::_TupleConstraints, ck::Sequence<0, 1>>::operator... \n", - "9135557 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", - "9135558 std::get<1UL, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_... \n", - "9135559 std::_Tuple_impl<0, ck::tensor_operation::device::DeviceConv2dBwdDataXdl_Inp... \n", - "9135560 std::__uniq_ptr_impl,... \n", - "9135565 std::tuple, ck::tensor... \n", - "9135567 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n", - "9135568 ck::StaticBufferTupleOfVector, ck::tensor_... \n", + "7340720 std::_TupleConstraints,... \n", + "7340729 ck::TensorDescriptor,... \n", + "7340731 ck::StaticTensorTupleOfVectorBuffer,... \n", + "7340735 std::__and_, char>::_... 
\n", "\n", - " count total_s mean_ms median_ms depth \\\n", - "11751348 2 135.700625 67850.312500 67850.3125 2 \n", - "9135552 846 132.173182 156.233076 1.0550 6 \n", - "9135553 846 132.173182 156.233076 1.0550 7 \n", - "9135554 846 132.173182 156.233076 1.0550 4 \n", - "9135555 846 132.173182 156.233076 1.0550 3 \n", - "9135556 846 132.173182 156.233076 1.0550 2 \n", - "9135557 846 132.173182 156.233076 1.0550 6 \n", - "9135558 846 132.173182 156.233076 1.0550 4 \n", - "9135559 846 132.173182 156.233076 1.0550 4 \n", - "9135560 846 132.173182 156.233076 1.0550 4 \n", - "9135561 846 132.173182 156.233076 1.0550 6 \n", - "9135562 846 132.173182 156.233076 1.0550 6 \n", - "9135563 846 132.173182 156.233076 1.0550 4 \n", - "9135564 846 132.173182 156.233076 1.0550 5 \n", - "9135565 846 132.173182 156.233076 1.0550 4 \n", - "9135566 846 132.173182 156.233076 1.0550 7 \n", - "9135567 846 132.173182 156.233076 1.0550 7 \n", - "9135568 846 132.173182 156.233076 1.0550 1 \n", - "9135569 846 132.173182 156.233076 1.0550 5 \n", - "9135570 846 132.173182 156.233076 1.0550 3 \n", - "\n", - " pct_template_time \n", - "11751348 0.090631 \n", - "9135552 0.088275 \n", - "9135553 0.088275 \n", - "9135554 0.088275 \n", - "9135555 0.088275 \n", - "9135556 0.088275 \n", - "9135557 0.088275 \n", - "9135558 0.088275 \n", - "9135559 0.088275 \n", - "9135560 0.088275 \n", - "9135561 0.088275 \n", - "9135562 0.088275 \n", - "9135563 0.088275 \n", - "9135564 0.088275 \n", - "9135565 0.088275 \n", - "9135566 0.088275 \n", - "9135567 0.088275 \n", - "9135568 0.088275 \n", - "9135569 0.088275 \n", - "9135570 0.088275 " + " count total_s mean_ms median_ms depth pct_template_time \n", + "7340718 984 105.25365 106.965091 2.2285 4 0.08737 \n", + "7340719 984 105.25365 106.965091 2.2285 6 0.08737 \n", + "7340720 984 105.25365 106.965091 2.2285 4 0.08737 \n", + "7340721 984 105.25365 106.965091 2.2285 8 0.08737 \n", + "7340722 984 105.25365 106.965091 2.2285 4 0.08737 \n", + "7340723 984 105.25365 
106.965091 2.2285 4 0.08737 \n", + "7340724 984 105.25365 106.965091 2.2285 6 0.08737 \n", + "7340725 984 105.25365 106.965091 2.2285 6 0.08737 \n", + "7340726 984 105.25365 106.965091 2.2285 1 0.08737 \n", + "7340727 984 105.25365 106.965091 2.2285 5 0.08737 \n", + "7340728 984 105.25365 106.965091 2.2285 5 0.08737 \n", + "7340729 984 105.25365 106.965091 2.2285 5 0.08737 \n", + "7340730 984 105.25365 106.965091 2.2285 5 0.08737 \n", + "7340731 984 105.25365 106.965091 2.2285 6 0.08737 \n", + "7340732 984 105.25365 106.965091 2.2285 5 0.08737 \n", + "7340733 984 105.25365 106.965091 2.2285 6 0.08737 \n", + "7340734 984 105.25365 106.965091 2.2285 5 0.08737 \n", + "7340735 984 105.25365 106.965091 2.2285 4 0.08737 \n", + "7340736 984 105.25365 106.965091 2.2285 5 0.08737 \n", + "7340737 984 105.25365 106.965091 2.2285 2 0.08737 " ] }, "metadata": {}, @@ -1504,9 +1482,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Filtered to 9,433,574 CK templates (from 11,751,350 total)\n", - "CK template time: 66204864.83s\n", - "Percentage of total template time: 44216.7%\n", + "Filtered to 8,644,580 CK templates (from 10,655,045 total)\n", + "CK template time: 54849416.17s\n", + "Percentage of total template time: 45529.9%\n", "\n", "Top 20 CK Templates by Total Time:\n" ] @@ -1543,253 +1521,253 @@ " \n", " \n", " \n", - " 11751348\n", - " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...\n", - " 2\n", - " 135.700625\n", - " 67850.312500\n", - " 67850.3125\n", - " 2\n", - " 0.000205\n", - " \n", - " \n", - " 9135552\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 7340719\n", + " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 6\n", - " 0.000200\n", + " 0.000192\n", " \n", " \n", - " 9135553\n", - " 
ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 7\n", - " 0.000200\n", + " 7340721\n", + " ck::kernel_batched_elementwise<ck::GridwiseElementwise<ck::Tuple<ck::TensorD...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 8\n", + " 0.000192\n", " \n", " \n", - " 9135555\n", - " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 3\n", - " 0.000200\n", - " \n", - " \n", - " 9135556\n", - " ck::detail::static_ford_impl<ck::Sequence<16>, ck::Sequence<0, 1>>::operator...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 2\n", - " 0.000200\n", - " \n", - " \n", - " 9135557\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<1, 16>, ck::tensor...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 7340725\n", + " ck::GridwiseGemm_xdl_cshuffle_conv_v3<ck::tensor_layout::gemm::RowMajor, ck:...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 6\n", - " 0.000200\n", + " 0.000192\n", " \n", " \n", - " 9135561\n", - " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 6\n", - " 0.000200\n", + " 7340726\n", + " ck::detail::applier<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 1\n", + " 0.000192\n", " \n", " \n", - " 9135562\n", - " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, unsigned sho...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 6\n", - " 0.000200\n", + " 7340727\n", + " ck::make_tensor_coordinate(const ck::TensorDescriptor<ck::Tuple<ck::Embed<ck...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 
0.000192\n", " \n", " \n", - " 9135564\n", + " 7340728\n", " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 5\n", - " 0.000200\n", + " 0.000192\n", " \n", " \n", - " 9135566\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 4>, ck::tensor...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 7\n", - " 0.000200\n", + " 7340729\n", + " ck::TensorDescriptor<ck::Tuple<ck::UnMerge<ck::Tuple<ck::integral_constant<i...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 0.000192\n", " \n", " \n", - " 9135567\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 4>, ck::tensor_...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 7\n", - " 0.000200\n", + " 7340730\n", + " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 0.000192\n", " \n", " \n", - " 9135568\n", - " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 64, 1, t...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 1\n", - " 0.000200\n", + " 7340731\n", + " ck::StaticTensorTupleOfVectorBuffer<ck::AddressSpaceEnum::Vgpr, _Float16, 8,...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 6\n", + " 0.000192\n", " \n", " \n", - " 9135569\n", + " 7340732\n", + " ck::make_tensor_coordinate_step(const ck::TensorDescriptor<ck::Tuple<ck::Emb...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 0.000192\n", + " \n", + " \n", + " 7340733\n", + " ck::make_tensor_coordinate<ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tupl...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 6\n", + " 0.000192\n", + " \n", + " \n", + " 
7340734\n", + " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 0.000192\n", + " \n", + " \n", + " 7340736\n", + " ck::TensorDescriptor<ck::Tuple<ck::UnMerge<ck::Tuple<ck::integral_constant<i...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 5\n", + " 0.000192\n", + " \n", + " \n", + " 7340739\n", + " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 6\n", + " 0.000192\n", + " \n", + " \n", + " 7340740\n", + " ck::BlockwiseGemmXdlops_pipeline_base<64, unsigned short, unsigned short, un...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 6\n", + " 0.000192\n", + " \n", + " \n", + " 7340741\n", + " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<8, 8>, ck::tensor_...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 6\n", + " 0.000192\n", + " \n", + " \n", + " 7340744\n", + " ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<2, ck:...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", + " 2\n", + " 0.000192\n", + " \n", + " \n", + " 7340745\n", " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 5\n", - " 0.000200\n", + " 0.000192\n", " \n", " \n", - " 9135571\n", - " ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 2\n", - " 0.000200\n", - " \n", - " \n", - " 9135577\n", - " ck::make_tensor_coordinate(const ck::TensorDescriptor<ck::Tuple<ck::UnMerge<...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 7340746\n", + " 
ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<int, int, int, int, int>,...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 5\n", - " 0.000200\n", + " 0.000192\n", " \n", " \n", - " 9135579\n", - " ck::Tuple<ck::vector_type<float, 1>, ck::vector_type<float, 1>, ck::vector_t...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 2\n", - " 0.000200\n", - " \n", - " \n", - " 9135582\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 16>, ck::tenso...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", + " 7340747\n", + " ck::sequence_gen<2, ck::lambda_get_up_dim_num<ck::Tuple<ck::UnMerge<ck::Tupl...\n", + " 984\n", + " 105.25365\n", + " 106.965091\n", + " 2.2285\n", " 6\n", - " 0.000200\n", - " \n", - " \n", - " 9135583\n", - " ck::Sequence<5, 6, 7, 8, 9, 10>::ReorderGivenNew2Old<0, 1, 2, 3, 4, 5>\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 1\n", - " 0.000200\n", - " \n", - " \n", - " 9135584\n", - " ck::to_multi_index<ck::Sequence<4, 12>>\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 2\n", - " 0.000200\n", - " \n", - " \n", - " 9135585\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<16, 16>, ck::tenso...\n", - " 846\n", - " 132.173182\n", - " 156.233076\n", - " 1.0550\n", - " 6\n", - " 0.000200\n", + " 0.000192\n", " \n", " \n", "\n", "" ], "text/plain": [ - " full_signature \\\n", - "11751348 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n", - "9135552 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n", - "9135553 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", - "9135555 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V... \n", - "9135556 ck::detail::static_ford_impl, ck::Sequence<0, 1>>::operator... \n", - "9135557 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", - "9135561 ck::StaticTensorTupleOfVectorBuffer,... 
\n", - "9135566 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", - "9135567 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n", - "9135568 ck::StaticBufferTupleOfVector, ck::vector_type, ck::vector_t... \n", - "9135582 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tenso... \n", - "9135583 ck::Sequence<5, 6, 7, 8, 9, 10>::ReorderGivenNew2Old<0, 1, 2, 3, 4, 5> \n", - "9135584 ck::to_multi_index> \n", - "9135585 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tenso... \n", + " full_signature \\\n", + "7340719 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n", + "7340721 ck::kernel_batched_elementwise,... \n", + "7340729 ck::TensorDescriptor,... \n", + "7340731 ck::StaticTensorTupleOfVectorBuffer,... \n", + "7340736 ck::TensorDescriptor, ck::tensor_... \n", + "7340740 ck::BlockwiseGemmXdlops_pipeline_base<64, unsigned short, unsigned short, un... \n", + "7340741 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor_... \n", + "7340744 ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<2, ck:... \n", + "7340745 ck::TensorDescriptor,... 
\n", + "7340747 ck::sequence_gen<2, ck::lambda_get_up_dim_num\n", " \n", " \n", - " 7243825\n", - " ck::Tuple<ck::PassThrough<int>, ck::UnMerge<ck::Tuple<int, int>, false>, ck:...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 3\n", - " \n", - " \n", - " 7243826\n", - " std::__is_implicitly_default_constructible<ck::tensor_operation::device::Dev...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 3\n", - " \n", - " \n", - " 7243827\n", - " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 4, 1, tr...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", + " 6304619\n", + " ck_tile::impl::run_cast_from_f8<_BitInt(8), float, true>\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", " 1\n", " \n", " \n", - " 7243828\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", + " 6304620\n", + " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", " 6\n", " \n", " \n", - " 7243829\n", - " ck::utility::launch_and_time_kernel_with_preprocess<false, ck::GridwiseGemmM...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 5\n", + " 6304621\n", + " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7>, ck::Sequence<19>...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 2\n", " \n", " \n", - " 7243830\n", - " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 4\n", + " 6304622\n", + " ck::operator<<(std::ostream &, const TensorDescriptor<Tuple<Embed<Tuple<int,...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 7\n", " \n", " \n", - " 7243831\n", - " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 4\n", - " \n", - " \n", - " 7243832\n", + " 6304623\n", " 
ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", " 5\n", " \n", " \n", - " 7243833\n", - " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 1\n", - " \n", - " \n", - " 7243834\n", - " std::__uniq_ptr_data<ck::tensor_operation::device::DeviceGroupedConvFwdMulti...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", + " 6304624\n", + " ck::sequence_gen<8, ck::lambda_get_up_dim_num<ck::Tuple<ck::PassThrough<int>...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", " 4\n", " \n", " \n", - " 7243835\n", - " std::allocator_traits<std::allocator<std::_Rb_tree_node<std::pair<const ck::...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 5\n", - " \n", - " \n", - " 7243836\n", + " 6304625\n", " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", " 3\n", " \n", " \n", - " 7243837\n", - " std::__and_<std::__is_implicitly_default_constructible<ck::tensor_operation:...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 5\n", - " \n", - " \n", - " 7243838\n", - " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 4\n", - " \n", - " \n", - " 7243839\n", - " ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 5\n", - " \n", - " \n", - " 7243840\n", - " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 4\n", - " \n", - " \n", - " 7243841\n", - " std::pair<const ck::BlockGemmPipelineScheduler, std::basic_string<char>>::pa...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", + " 
6304626\n", + " ck::container_concat<ck::Sequence<0>, ck::Sequence<2, 1>, ck::Sequence<3>, c...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", " 2\n", " \n", " \n", - " 7243842\n", - " ck::make_tuple<ck::integral_constant<int, 1>, ck::integral_constant<int, 4>,...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 2\n", - " \n", - " \n", - " 7243843\n", - " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, unsigned short, 64...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", + " 6304627\n", + " ck::Sequence<19, 20, 21, 22, 23>::ReorderGivenNew2Old<0, 1, 2, 3, 4>\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", " 1\n", " \n", " \n", - " 7243844\n", - " std::is_nothrow_constructible<std::unique_ptr<ck::tensor_operation::device::...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 4\n", + " 6304628\n", + " ck::make_tuple<ck::Sequence<0>, ck::Sequence<>, ck::Sequence<1>, ck::Sequenc...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 2\n", + " \n", + " \n", + " 6304629\n", + " std::__and_<std::is_convertible<ck::tensor_operation::device::DeviceGroupedC...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 5\n", + " \n", + " \n", + " 6304630\n", + " ck::detail::unpack_impl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 3\n", + " \n", + " \n", + " 6304631\n", + " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 5\n", + " \n", + " \n", + " 6304632\n", + " ck::sequence_gen<3, ck::lambda_get_up_dim_num<ck::Tuple<ck::PassThrough<ck::...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 6\n", + " \n", + " \n", + " 6304633\n", + " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 5\n", + " \n", + " \n", + " 6304634\n", + " 
ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 5\n", + " \n", + " \n", + " 6304635\n", + " ck::sequence_gen<2, ck::lambda_get_up_dim_num<ck::Tuple<ck::Xor<ck::Tuple<ck...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 6\n", + " \n", + " \n", + " 6304636\n", + " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 5\n", + " \n", + " \n", + " 6304637\n", + " ck::make_tensor_coordinate_step<ck::TensorDescriptor<ck::Tuple<ck::Embed<ck:...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 5\n", + " \n", + " \n", + " 6304638\n", + " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 5\n", " \n", " \n", "\n", @@ -2048,48 +2026,48 @@ ], "text/plain": [ " full_signature \\\n", - "7243825 ck::Tuple, ck::UnMerge, false>, ck:... \n", - "7243826 std::__is_implicitly_default_constructible, ck::tensor_... \n", - "7243829 ck::utility::launch_and_time_kernel_with_preprocess>::pa... \n", - "7243842 ck::make_tuple, ck::integral_constant,... \n", - "7243843 ck::StaticBufferTupleOfVector \n", + "6304620 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", + "6304621 ck::detail::TupleImpl, ck::Sequence<19>... \n", + "6304622 ck::operator<<(std::ostream &, const TensorDescriptor... \n", + "6304625 std::unique_ptr, ck::Sequence<2, 1>, ck::Sequence<3>, c... \n", + "6304627 ck::Sequence<19, 20, 21, 22, 23>::ReorderGivenNew2Old<0, 1, 2, 3, 4> \n", + "6304628 ck::make_tuple, ck::Sequence<>, ck::Sequence<1>, ck::Sequenc... 
\n", + "6304629 std::__and_\n", " \n", " \n", - " 7243825\n", - " ck::Tuple<ck::PassThrough<int>, ck::UnMerge<ck::Tuple<int, int>, false>, ck:...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", + " 6304619\n", + " ck_tile::impl::run_cast_from_f8<_BitInt(8), float, true>\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", " \n", " \n", - " 7243826\n", - " std::__is_implicitly_default_constructible<ck::tensor_operation::device::Dev...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", + " 6304620\n", + " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<4, 16>, ck::tensor...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", " \n", " \n", - " 7243827\n", - " ck::StaticBufferTupleOfVector<ck::AddressSpaceEnum::Vgpr, _Float16, 4, 1, tr...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", + " 6304621\n", + " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7>, ck::Sequence<19>...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", " \n", " \n", - " 7243828\n", - " ck::ThreadwiseTensorSliceTransfer_v3r2<const ck::Sequence<2, 2>, ck::tensor_...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", + " 6304622\n", + " ck::operator<<(std::ostream &, const TensorDescriptor<Tuple<Embed<Tuple<int,...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", " \n", " \n", - " 7243829\n", - " ck::utility::launch_and_time_kernel_with_preprocess<false, ck::GridwiseGemmM...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", - " \n", - " \n", - " 7243830\n", - " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", - " \n", - " \n", - " 7243831\n", - " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", - " \n", - " \n", - " 
7243832\n", + " 6304623\n", " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", " \n", " \n", - " 7243833\n", - " ck::generate_tuple<(lambda at /home/AMD/jshumway/composable_kernel/include/c...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", + " 6304624\n", + " ck::sequence_gen<8, ck::lambda_get_up_dim_num<ck::Tuple<ck::PassThrough<int>...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", " \n", " \n", - " 7243834\n", - " std::__uniq_ptr_data<ck::tensor_operation::device::DeviceGroupedConvFwdMulti...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", - " \n", - " \n", - " 7243835\n", - " std::allocator_traits<std::allocator<std::_Rb_tree_node<std::pair<const ck::...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", - " \n", - " \n", - " 7243836\n", + " 6304625\n", " std::unique_ptr<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleAB...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", " \n", " \n", - " 7243837\n", - " std::__and_<std::__is_implicitly_default_constructible<ck::tensor_operation:...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", + " 6304626\n", + " ck::container_concat<ck::Sequence<0>, ck::Sequence<2, 1>, ck::Sequence<3>, c...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", " \n", " \n", - " 7243838\n", - " std::tuple<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", + " 6304627\n", + " ck::Sequence<19, 20, 21, 22, 23>::ReorderGivenNew2Old<0, 1, 2, 3, 4>\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", " \n", " \n", - " 7243839\n", - " 
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V...\n", - " 9406\n", - " 27.985313\n", - " 2.975262\n", - " 0.500016\n", + " 6304628\n", + " ck::make_tuple<ck::Sequence<0>, ck::Sequence<>, ck::Sequence<1>, ck::Sequenc...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", + " \n", + " \n", + " 6304629\n", + " std::__and_<std::is_convertible<ck::tensor_operation::device::DeviceGroupedC...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", + " \n", + " \n", + " 6304630\n", + " ck::detail::unpack_impl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", + " \n", + " \n", + " 6304631\n", + " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", + " \n", + " \n", + " 6304632\n", + " ck::sequence_gen<3, ck::lambda_get_up_dim_num<ck::Tuple<ck::PassThrough<ck::...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", + " \n", + " \n", + " 6304633\n", + " ck::TensorDescriptor<ck::Tuple<ck::Embed<ck::Tuple<ck::integral_constant<int...\n", + " 9773\n", + " 23.119388\n", + " 2.365639\n", + " 0.500023\n", " \n", " \n", "\n", @@ -2286,38 +2264,38 @@ ], "text/plain": [ " full_signature \\\n", - "7243825 ck::Tuple, ck::UnMerge, false>, ck:... \n", - "7243826 std::__is_implicitly_default_constructible, ck::tensor_... \n", - "7243829 ck::utility::launch_and_time_kernel_with_preprocess \n", + "6304620 ck::ThreadwiseTensorSliceTransfer_v3r2, ck::tensor... \n", + "6304621 ck::detail::TupleImpl, ck::Sequence<19>... \n", + "6304622 ck::operator<<(std::ostream &, const TensorDescriptor... \n", + "6304625 std::unique_ptr, ck::Sequence<2, 1>, ck::Sequence<3>, c... \n", + "6304627 ck::Sequence<19, 20, 21, 22, 23>::ReorderGivenNew2Old<0, 1, 2, 3, 4> \n", + "6304628 ck::make_tuple, ck::Sequence<>, ck::Sequence<1>, ck::Sequenc... 
\n", + "6304629 std::__and_\n", " 0\n", " 0\n", - " 348050\n", - " 2.759851e+03\n", - " 8.315829\n", - " 1.22725\n", - " 1.843240\n", + " 67779\n", + " 3.204520e+02\n", + " 5.207827\n", + " 1.19350\n", + " 0.266004\n", " \n", " \n", " 1\n", " 1\n", - " 2007036528\n", - " 9.720831e+06\n", - " 6.516100\n", - " 1.22500\n", - " 6492.316278\n", + " 1749956922\n", + " 7.353030e+06\n", + " 5.587812\n", + " 1.16350\n", + " 6103.673137\n", " \n", " \n", " 2\n", " 2\n", - " 4212093106\n", - " 2.378498e+07\n", - " 7.811773\n", - " 1.28000\n", - " 15885.434415\n", + " 3669923724\n", + " 1.799595e+07\n", + " 6.909845\n", + " 1.26400\n", + " 14938.245952\n", " \n", " \n", " 3\n", " 3\n", - " 2137634038\n", - " 1.261997e+07\n", - " 7.344963\n", - " 1.27700\n", - " 8428.586130\n", + " 2019427209\n", + " 1.064961e+07\n", + " 6.790369\n", + " 1.26300\n", + " 8840.127967\n", " \n", " \n", " 4\n", " 4\n", - " 2777069926\n", - " 1.801647e+07\n", - " 8.056019\n", - " 1.29000\n", - " 12032.783621\n", + " 2442047025\n", + " 1.470272e+07\n", + " 7.563676\n", + " 1.27750\n", + " 12204.571997\n", " \n", " \n", " 5\n", " 5\n", - " 1651653946\n", - " 1.169701e+07\n", - " 8.960966\n", - " 1.31100\n", - " 7812.159446\n", + " 1644211492\n", + " 1.041237e+07\n", + " 8.437405\n", + " 1.27700\n", + " 8643.197486\n", " \n", " \n", " 6\n", " 6\n", - " 781689120\n", - " 8.147449e+06\n", - " 16.840407\n", - " 1.76800\n", - " 5441.491328\n", + " 756309412\n", + " 6.665863e+06\n", + " 15.362248\n", + " 1.70650\n", + " 5533.262772\n", " \n", " \n", " 7\n", " 7\n", - " 69349065\n", - " 7.653282e+05\n", - " 16.927265\n", - " 1.79050\n", - " 511.144843\n", + " 59804841\n", + " 5.608413e+05\n", + " 15.135226\n", + " 1.73000\n", + " 465.548483\n", " \n", " \n", " 8\n", " 8\n", - " 1516615\n", - " 1.622227e+04\n", - " 16.672371\n", - " 1.79625\n", - " 10.834476\n", - " \n", - " \n", - " 9\n", - " 9\n", - " 2150\n", - " 1.411392e+01\n", - " 7.276202\n", - " 2.63050\n", - " 0.009426\n", + " 687425\n", + " 
6.183116e+03\n", + " 14.874348\n", + " 1.76475\n", + " 5.132540\n", " \n", " \n", "\n", @@ -2494,16 +2463,15 @@ ], "text/plain": [ " depth count total_s mean_ms median_ms pct_total\n", - "0 0 348050 2.759851e+03 8.315829 1.22725 1.843240\n", - "1 1 2007036528 9.720831e+06 6.516100 1.22500 6492.316278\n", - "2 2 4212093106 2.378498e+07 7.811773 1.28000 15885.434415\n", - "3 3 2137634038 1.261997e+07 7.344963 1.27700 8428.586130\n", - "4 4 2777069926 1.801647e+07 8.056019 1.29000 12032.783621\n", - "5 5 1651653946 1.169701e+07 8.960966 1.31100 7812.159446\n", - "6 6 781689120 8.147449e+06 16.840407 1.76800 5441.491328\n", - "7 7 69349065 7.653282e+05 16.927265 1.79050 511.144843\n", - "8 8 1516615 1.622227e+04 16.672371 1.79625 10.834476\n", - "9 9 2150 1.411392e+01 7.276202 2.63050 0.009426" + "0 0 67779 3.204520e+02 5.207827 1.19350 0.266004\n", + "1 1 1749956922 7.353030e+06 5.587812 1.16350 6103.673137\n", + "2 2 3669923724 1.799595e+07 6.909845 1.26400 14938.245952\n", + "3 3 2019427209 1.064961e+07 6.790369 1.26300 8840.127967\n", + "4 4 2442047025 1.470272e+07 7.563676 1.27750 12204.571997\n", + "5 5 1644211492 1.041237e+07 8.437405 1.27700 8643.197486\n", + "6 6 756309412 6.665863e+06 15.362248 1.70650 5533.262772\n", + "7 7 59804841 5.608413e+05 15.135226 1.73000 465.548483\n", + "8 8 687425 6.183116e+03 14.874348 1.76475 5.132540" ] }, "metadata": {}, @@ -2571,9 +2539,9 @@ "Template Argument Type Distribution:\n", "Type Count Percentage\n", "----------------------------------------\n", - "template 125,537,546 80.8%\n", - "primitive 19,045,151 12.3%\n", - "unknown 10,699,046 6.9%\n", + "template 116,601,482 80.8%\n", + "primitive 17,641,607 12.2%\n", + "unknown 10,098,716 7.0%\n", "\n", "Templates with Most Arguments:\n" ] @@ -2605,54 +2573,54 @@ " \n", " \n", " \n", - " 231420\n", + " 223300\n", + " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...\n", + " 33367\n", + " \n", + " \n", + " 223301\n", + " 
ck::vector_type<ck::bf8_fnuz_t, 8>::(unnamed union at /home/AMD/jshumway/com...\n", + " 33367\n", + " \n", + " \n", + " 223302\n", + " ck::non_native_vector_base<ck::f8_fnuz_t, 64>::(unnamed union at /home/AMD/j...\n", + " 33367\n", + " \n", + " \n", + " 223303\n", " ck::Tuple<ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8...\n", - " 23716\n", + " 33367\n", " \n", " \n", - " 231421\n", - " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...\n", - " 23716\n", + " 223304\n", + " ck::Tuple<ck::non_native_vector_base<ck::f8_fnuz_t, 4>, ck::non_native_vecto...\n", + " 33367\n", " \n", " \n", - " 231422\n", - " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...\n", - " 23716\n", + " 223305\n", + " ck::Tuple<ck::non_native_vector_base<ck::f8_fnuz_t, 2>, ck::non_native_vecto...\n", + " 33367\n", " \n", " \n", - " 231423\n", - " ck::Tuple<_Float16 __attribute__((ext_vector_type(2))), _Float16 __attribute...\n", - " 23716\n", + " 223306\n", + " ck::non_native_vector_base<ck::f8_fnuz_t, 64>\n", + " 33367\n", " \n", " \n", - " 231424\n", - " ck::non_native_vector_base<ck::f8_fnuz_t, 64>::(unnamed union at /home/AMD/j...\n", - " 23716\n", + " 223307\n", + " ck::Tuple<ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8...\n", + " 33367\n", " \n", " \n", - " 231425\n", - " ck::Tuple<_Float16, _Float16, _Float16, _Float16, _Float16, _Float16, _Float...\n", - " 23716\n", + " 223308\n", + " ck::Tuple<ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8_fnuz_t, ck::f8...\n", + " 33367\n", " \n", " \n", - " 231426\n", - " ck::non_native_vector_base<ck::f8_fnuz_t, 64>::(unnamed union at /home/AMD/j...\n", - " 23716\n", - " \n", - " \n", - " 231427\n", - " ck::detail::TupleImpl<ck::Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...\n", - " 23716\n", - " \n", - " \n", - " 231428\n", - " ck::vector_type<_Float16, 32>::(unnamed union at /home/AMD/jshumway/composab...\n", - " 23716\n", - " \n", - " 
\n", - " 231429\n", - " ck::vector_type<signed char, 4>::(unnamed union at /home/AMD/jshumway/compos...\n", - " 23716\n", + " 223309\n", + " ck::Tuple<ck::non_native_vector_base<ck::f8_fnuz_t, 2>, ck::non_native_vecto...\n", + " 33367\n", " \n", " \n", "\n", @@ -2660,28 +2628,28 @@ ], "text/plain": [ " full_signature \\\n", - "231420 ck::Tuple::(unnamed union at /home/AMD/j... \n", - "231425 ck::Tuple<_Float16, _Float16, _Float16, _Float16, _Float16, _Float16, _Float... \n", - "231426 ck::non_native_vector_base::(unnamed union at /home/AMD/j... \n", - "231427 ck::detail::TupleImpl::(unnamed union at /home/AMD/jshumway/composab... \n", - "231429 ck::vector_type::(unnamed union at /home/AMD/jshumway/compos... \n", + "223300 ck::detail::TupleImpl::(unnamed union at /home/AMD/jshumway/com... \n", + "223302 ck::non_native_vector_base::(unnamed union at /home/AMD/j... \n", + "223303 ck::Tuple, ck::non_native_vecto... \n", + "223305 ck::Tuple, ck::non_native_vecto... \n", + "223306 ck::non_native_vector_base \n", + "223307 ck::Tuple, ck::non_native_vecto... 
\n", "\n", " arg_count \n", - "231420 23716 \n", - "231421 23716 \n", - "231422 23716 \n", - "231423 23716 \n", - "231424 23716 \n", - "231425 23716 \n", - "231426 23716 \n", - "231427 23716 \n", - "231428 23716 \n", - "231429 23716 " + "223300 33367 \n", + "223301 33367 \n", + "223302 33367 \n", + "223303 33367 \n", + "223304 33367 \n", + "223305 33367 \n", + "223306 33367 \n", + "223307 33367 \n", + "223308 33367 \n", + "223309 33367 " ] }, "metadata": {}, @@ -2764,164 +2732,164 @@ " \n", " \n", " \n", - " 47\n", + " 48\n", " InstantiateFunction\n", - " 15006405\n", - " 2069.145884\n", - " 8.273051\n", - " 34.981333\n", + " 13837259\n", + " 1669.984159\n", + " 7.241250\n", + " 24.078188\n", " \n", " \n", - " 30\n", + " 31\n", " ExecuteCompiler\n", - " 1251\n", - " 445.214199\n", - " 21353.198979\n", - " 7.526867\n", + " 1276\n", + " 405.041858\n", + " 19045.855415\n", + " 5.839980\n", " \n", " \n", - " 143\n", + " 145\n", " Total ExecuteCompiler\n", - " 1251\n", - " 445.214188\n", - " 21353.198487\n", - " 7.526867\n", + " 1276\n", + " 405.041848\n", + " 19045.854932\n", + " 5.839980\n", " \n", " \n", - " 46\n", - " InstantiateClass\n", - " 2430595\n", - " 426.324732\n", - " 10.523960\n", - " 7.207519\n", - " \n", - " \n", - " 147\n", + " 149\n", " Total Frontend\n", - " 1251\n", - " 384.832270\n", - " 18457.183201\n", - " 6.506040\n", + " 1276\n", + " 348.010786\n", + " 16364.143560\n", + " 5.017694\n", " \n", " \n", - " 32\n", + " 33\n", " Frontend\n", - " 2419\n", - " 384.832095\n", - " 9545.235926\n", - " 6.506037\n", + " 2462\n", + " 348.010603\n", + " 8481.168229\n", + " 5.017691\n", " \n", " \n", - " 241\n", + " 47\n", + " InstantiateClass\n", + " 1969340\n", + " 337.831372\n", + " 10.292729\n", + " 4.870925\n", + " \n", + " \n", + " 243\n", " Total Source\n", - " 1251\n", - " 232.585137\n", - " 11155.162444\n", - " 3.932124\n", + " 1276\n", + " 220.549562\n", + " 10370.669049\n", + " 3.179931\n", " \n", " \n", - " 168\n", + " 170\n", " Total 
InstantiateFunction\n", - " 1251\n", - " 187.199754\n", - " 8978.405449\n", - " 3.164831\n", + " 1276\n", + " 154.873240\n", + " 7282.440762\n", + " 2.232995\n", " \n", " \n", - " 75\n", + " 78\n", " PerformPendingInstantiations\n", - " 1251\n", - " 109.696772\n", - " 5261.236084\n", - " 1.854552\n", + " 1276\n", + " 94.353463\n", + " 4436.683223\n", + " 1.360408\n", " \n", " \n", - " 217\n", + " 219\n", " Total PerformPendingInstantiations\n", - " 1251\n", - " 109.696762\n", - " 5261.235595\n", - " 1.854552\n", + " 1276\n", + " 94.353453\n", + " 4436.682719\n", + " 1.360408\n", " \n", " \n", - " 167\n", + " 169\n", " Total InstantiateClass\n", - " 1251\n", - " 101.970700\n", - " 4890.681049\n", - " 1.723934\n", - " \n", - " \n", - " 211\n", - " Total ParseClass\n", - " 1251\n", - " 65.367502\n", - " 3135.131968\n", - " 1.105114\n", - " \n", - " \n", - " 69\n", - " ParseClass\n", - " 615160\n", - " 60.689238\n", - " 5.919361\n", - " 1.026023\n", + " 1276\n", + " 84.004119\n", + " 3950.036931\n", + " 1.211189\n", " \n", " \n", " 4\n", " Backend\n", - " 1251\n", - " 57.471167\n", - " 2756.410890\n", - " 0.971617\n", + " 1276\n", + " 55.376770\n", + " 2603.923346\n", + " 0.798434\n", " \n", " \n", - " 104\n", + " 106\n", " Total Backend\n", - " 1251\n", - " 57.471157\n", - " 2756.410412\n", - " 0.971617\n", - " \n", - " \n", - " 70\n", - " ParseDeclarationOrFunctionDefinition\n", - " 377890\n", - " 50.231873\n", - " 7.975634\n", - " 0.849229\n", - " \n", - " \n", - " 212\n", - " Total ParseDeclarationOrFunctionDefinition\n", - " 1251\n", - " 44.632077\n", - " 2140.627196\n", - " 0.754557\n", - " \n", - " \n", - " 71\n", - " ParseFunctionDefinition\n", - " 410465\n", - " 38.754508\n", - " 5.664967\n", - " 0.655190\n", + " 1276\n", + " 55.376759\n", + " 2603.922835\n", + " 0.798434\n", " \n", " \n", " 213\n", - " Total ParseFunctionDefinition\n", - " 1251\n", - " 38.036852\n", - " 1824.309444\n", - " 0.643057\n", + " Total ParseClass\n", + " 1276\n", + " 52.732797\n", + 
" 2479.598586\n", + " 0.760313\n", " \n", " \n", - " 68\n", + " 72\n", + " ParseClass\n", + " 503019\n", + " 47.920679\n", + " 5.715969\n", + " 0.690931\n", + " \n", + " \n", + " 73\n", + " ParseDeclarationOrFunctionDefinition\n", + " 320116\n", + " 44.703333\n", + " 8.378838\n", + " 0.644542\n", + " \n", + " \n", + " 71\n", " Optimizer\n", - " 1251\n", - " 37.729725\n", - " 1809.579141\n", - " 0.637865\n", + " 1276\n", + " 36.732533\n", + " 1727.235099\n", + " 0.529617\n", + " \n", + " \n", + " 209\n", + " Total Optimizer\n", + " 1276\n", + " 36.732523\n", + " 1727.234601\n", + " 0.529617\n", + " \n", + " \n", + " 214\n", + " Total ParseDeclarationOrFunctionDefinition\n", + " 1276\n", + " 35.048983\n", + " 1648.071277\n", + " 0.505344\n", + " \n", + " \n", + " 215\n", + " Total ParseFunctionDefinition\n", + " 1276\n", + " 29.408540\n", + " 1382.846726\n", + " 0.424019\n", " \n", " \n", "\n", @@ -2929,48 +2897,48 @@ ], "text/plain": [ " event_type count total_min \\\n", - "47 InstantiateFunction 15006405 2069.145884 \n", - "30 ExecuteCompiler 1251 445.214199 \n", - "143 Total ExecuteCompiler 1251 445.214188 \n", - "46 InstantiateClass 2430595 426.324732 \n", - "147 Total Frontend 1251 384.832270 \n", - "32 Frontend 2419 384.832095 \n", - "241 Total Source 1251 232.585137 \n", - "168 Total InstantiateFunction 1251 187.199754 \n", - "75 PerformPendingInstantiations 1251 109.696772 \n", - "217 Total PerformPendingInstantiations 1251 109.696762 \n", - "167 Total InstantiateClass 1251 101.970700 \n", - "211 Total ParseClass 1251 65.367502 \n", - "69 ParseClass 615160 60.689238 \n", - "4 Backend 1251 57.471167 \n", - "104 Total Backend 1251 57.471157 \n", - "70 ParseDeclarationOrFunctionDefinition 377890 50.231873 \n", - "212 Total ParseDeclarationOrFunctionDefinition 1251 44.632077 \n", - "71 ParseFunctionDefinition 410465 38.754508 \n", - "213 Total ParseFunctionDefinition 1251 38.036852 \n", - "68 Optimizer 1251 37.729725 \n", + "48 InstantiateFunction 13837259 
1669.984159 \n", + "31 ExecuteCompiler 1276 405.041858 \n", + "145 Total ExecuteCompiler 1276 405.041848 \n", + "149 Total Frontend 1276 348.010786 \n", + "33 Frontend 2462 348.010603 \n", + "47 InstantiateClass 1969340 337.831372 \n", + "243 Total Source 1276 220.549562 \n", + "170 Total InstantiateFunction 1276 154.873240 \n", + "78 PerformPendingInstantiations 1276 94.353463 \n", + "219 Total PerformPendingInstantiations 1276 94.353453 \n", + "169 Total InstantiateClass 1276 84.004119 \n", + "4 Backend 1276 55.376770 \n", + "106 Total Backend 1276 55.376759 \n", + "213 Total ParseClass 1276 52.732797 \n", + "72 ParseClass 503019 47.920679 \n", + "73 ParseDeclarationOrFunctionDefinition 320116 44.703333 \n", + "71 Optimizer 1276 36.732533 \n", + "209 Total Optimizer 1276 36.732523 \n", + "214 Total ParseDeclarationOrFunctionDefinition 1276 35.048983 \n", + "215 Total ParseFunctionDefinition 1276 29.408540 \n", "\n", " mean_ms pct_total \n", - "47 8.273051 34.981333 \n", - "30 21353.198979 7.526867 \n", - "143 21353.198487 7.526867 \n", - "46 10.523960 7.207519 \n", - "147 18457.183201 6.506040 \n", - "32 9545.235926 6.506037 \n", - "241 11155.162444 3.932124 \n", - "168 8978.405449 3.164831 \n", - "75 5261.236084 1.854552 \n", - "217 5261.235595 1.854552 \n", - "167 4890.681049 1.723934 \n", - "211 3135.131968 1.105114 \n", - "69 5.919361 1.026023 \n", - "4 2756.410890 0.971617 \n", - "104 2756.410412 0.971617 \n", - "70 7.975634 0.849229 \n", - "212 2140.627196 0.754557 \n", - "71 5.664967 0.655190 \n", - "213 1824.309444 0.643057 \n", - "68 1809.579141 0.637865 " + "48 7.241250 24.078188 \n", + "31 19045.855415 5.839980 \n", + "145 19045.854932 5.839980 \n", + "149 16364.143560 5.017694 \n", + "33 8481.168229 5.017691 \n", + "47 10.292729 4.870925 \n", + "243 10370.669049 3.179931 \n", + "170 7282.440762 2.232995 \n", + "78 4436.683223 1.360408 \n", + "219 4436.682719 1.360408 \n", + "169 3950.036931 1.211189 \n", + "4 2603.923346 0.798434 \n", + "106 
2603.922835 0.798434 \n", + "213 2479.598586 0.760313 \n", + "72 5.715969 0.690931 \n", + "73 8.378838 0.644542 \n", + "71 1727.235099 0.529617 \n", + "209 1727.234601 0.529617 \n", + "214 1648.071277 0.505344 \n", + "215 1382.846726 0.424019 " ] }, "metadata": {}, @@ -3026,9 +2994,9 @@ "output_type": "stream", "text": [ "Found ninja log: ../../../build-trace/.ninja_log\n", - "Parsed 2,579 build events in 0.004s\n", + "Parsed 2,536 build events in 0.003s\n", "\n", - "Builds DataFrame: 2,579 rows\n" + "Builds DataFrame: 2,536 rows\n" ] }, { @@ -3082,29 +3050,29 @@ " \n", " 2\n", " library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi...\n", - " 228\n", - " 9471\n", + " 354\n", + " 9957\n", " b37d14fd75e2d29c\n", " -1\n", - " 9243\n", + " 9603\n", " \n", " \n", " 3\n", " _deps/gtest-build/googletest/CMakeFiles/gtest.dir/src/gtest-all.cc.o\n", - " 103\n", - " 9527\n", + " 225\n", + " 9931\n", " 330bec9abe978cde\n", " -1\n", - " 9424\n", + " 9706\n", " \n", " \n", " 4\n", " library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeFiles/devi...\n", - " 243\n", - " 9687\n", + " 365\n", + " 229598\n", " 6a5b0a35a99473e8\n", " -1\n", - " 9444\n", + " 229233\n", " \n", " \n", "\n", @@ -3121,9 +3089,9 @@ " start_ms end_ms cmd_hash worker_id duration_ms \n", "0 487 511 20b4a9dba5446149 -1 24 \n", "1 512 159966 80f2b673d9d1b995 -1 159454 \n", - "2 228 9471 b37d14fd75e2d29c -1 9243 \n", - "3 103 9527 330bec9abe978cde -1 9424 \n", - "4 243 9687 6a5b0a35a99473e8 -1 9444 " + "2 354 9957 b37d14fd75e2d29c -1 9603 \n", + "3 225 9931 330bec9abe978cde -1 9706 \n", + "4 365 229598 6a5b0a35a99473e8 -1 229233 " ] }, "metadata": {}, diff --git a/script/analyze_build/trace_analysis/__init__.py b/script/analyze_build/trace_analysis/__init__.py index 22fc3e0b5f..cb43359d71 100644 --- a/script/analyze_build/trace_analysis/__init__.py +++ b/script/analyze_build/trace_analysis/__init__.py @@ -71,6 +71,7 @@ from .parser import TraceParser from .transformer import 
# Package wiring (script/analyze_build/trace_analysis/__init__.py):
#   from .chrome_trace import ChromeTraceExporter
#   __all__ additionally lists "ChromeTraceExporter" under "# Chrome Trace export".

# ---------------------------------------------------------------------------
# script/analyze_build/trace_analysis/chrome_trace.py
# ---------------------------------------------------------------------------
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT

"""
Chrome Trace Event Format export for Perfetto visualization.

Exports ninja build timeline data to Chrome Trace Event Format
for visualization in Perfetto UI within Jupyter notebooks.
"""

import base64
import html
import json
from pathlib import Path
from typing import Any, Dict, Optional

import pandas as pd

# File suffix (lower-cased) -> Chrome Trace category.
_CATEGORY_BY_SUFFIX = {
    ".o": "compile",
    ".obj": "compile",
    ".a": "archive",
    ".lib": "archive",
    ".so": "link_shared",
    ".dll": "link_shared",
    ".dylib": "link_shared",
    ".exe": "link_executable",
    ".out": "link_executable",
}

# Columns of the builds DataFrame that the exporter actually reads.
_TIMELINE_COLUMNS = ("target", "start_ms", "duration_ms", "worker_id", "cmd_hash")


def _trace_envelope(events: list) -> Dict[str, Any]:
    """Wrap *events* in the full trace dictionary, with a fresh metadata dict."""
    return {
        "traceEvents": events,
        "displayTimeUnit": "ms",
        "otherData": {"version": "1.0", "generator": "analyze_build"},
    }


class ChromeTraceExporter:
    """Export trace analysis data to Chrome Trace Event Format."""

    @staticmethod
    def categorize_target(target: str) -> str:
        """
        Categorize a build target based on file extension.

        The extension is matched case-insensitively.  Only targets whose
        extension is not recognised fall through to the substring check for
        "test" anywhere in the (lower-cased) target name.

        Args:
            target: Build target name (e.g., "obj/foo.o")

        Returns:
            Category string for Chrome Trace format: one of "compile",
            "archive", "link_shared", "link_executable", "test" or "other".
        """
        suffix = Path(target).suffix.lower()
        category = _CATEGORY_BY_SUFFIX.get(suffix)
        if category is not None:
            return category
        if "test" in target.lower():
            return "test"
        return "other"

    @staticmethod
    def export_ninja_timeline(
        builds_df: pd.DataFrame, process_id: int = 1, include_metadata: bool = True
    ) -> Dict[str, Any]:
        """
        Export ninja build timeline to Chrome Trace format.

        Creates trace events compatible with Perfetto UI for visualization
        in Jupyter notebooks or chrome://tracing.

        Args:
            builds_df: DataFrame with columns: target, start_ms, end_ms,
                      duration_ms, worker_id, cmd_hash.  Only target,
                      start_ms, duration_ms, worker_id and cmd_hash are read.
                      A DataFrame with zero rows yields an empty event list.
            process_id: Process ID for trace events (default: 1)
            include_metadata: Include trace metadata (default: True).  This
                      flag is honoured for empty input as well.

        Returns:
            Dictionary in Chrome Trace Event Format:
            {
                'traceEvents': [...],
                'displayTimeUnit': 'ms',
                'otherData': {...}
            }
            or just {'traceEvents': [...]} when include_metadata is False.

        Example:
            >>> from trace_analysis import NinjaLogParser, ChromeTraceExporter
            >>> builds = NinjaLogParser.parse(Path('build/.ninja_log'))
            >>> builds_df = NinjaLogParser.to_dataframe(builds)
            >>> builds_df = NinjaLogParser.assign_workers(builds_df)
            >>> trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df)
            >>> # Display in notebook or save to file
        """
        events = []

        # A zero-row frame may not even have the expected columns, so only
        # touch them when there is at least one row.
        if len(builds_df) > 0:
            # .tolist() yields plain Python scalars, which keeps the resulting
            # events JSON-serializable and avoids building a Series per row.
            columns = [builds_df[name].tolist() for name in _TIMELINE_COLUMNS]
            for target, start_ms, duration_ms, worker_id, cmd_hash in zip(*columns):
                events.append(
                    {
                        "name": target,
                        "cat": ChromeTraceExporter.categorize_target(target),
                        "ph": "X",  # Complete event (has duration)
                        "ts": int(start_ms * 1000),  # ms -> microseconds
                        "dur": int(duration_ms * 1000),  # ms -> microseconds
                        "pid": process_id,
                        "tid": int(worker_id),
                        "args": {
                            "output": target,
                            "duration_ms": int(duration_ms),
                            "cmd_hash": cmd_hash,
                        },
                    }
                )

        if not include_metadata:
            # Simple format (just events array)
            return {"traceEvents": events}
        return _trace_envelope(events)

    @staticmethod
    def export_to_file(trace_data: Dict[str, Any], output_path: str) -> None:
        """
        Export trace data to a JSON file.

        The file is written as UTF-8 with two-space indentation; an existing
        file at output_path is overwritten.

        Args:
            trace_data: Chrome Trace format dictionary
            output_path: Path to output file

        Example:
            >>> ChromeTraceExporter.export_to_file(trace_data, 'trace.json')
        """
        # Explicit encoding so the output does not depend on the locale.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(trace_data, f, indent=2)


# ---------------------------------------------------------------------------
# script/analyze_build/trace_analysis/perfetto_display.py
# ---------------------------------------------------------------------------
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
#
# Perfetto UI display utilities for Jupyter notebooks.
#
# Provides functions to display Chrome Trace data in Perfetto UI
# directly within Jupyter notebooks.


def display_perfetto(trace_data: Dict[str, Any], height: int = 600):
    """
    Display Perfetto UI in Jupyter notebook with embedded trace data.

    Args:
        trace_data: Chrome Trace Event Format dictionary
        height: Height of the IFrame in pixels (default: 600)

    Returns:
        IPython IFrame object for display in notebook

    Raises:
        ImportError: If IPython is not installed.

    Example:
        >>> from trace_analysis import NinjaLogParser, ChromeTraceExporter
        >>> from trace_analysis.perfetto_display import display_perfetto
        >>> builds_df = NinjaLogParser.to_dataframe(builds)
        >>> trace_data = ChromeTraceExporter.export_ninja_timeline(builds_df)
        >>> display_perfetto(trace_data)

    Note:
        This function requires IPython to be installed (available in Jupyter).
        The trace data is base64-encoded and embedded in the Perfetto UI URL.
        For very large traces (>10MB), consider using save_and_link() instead.
    """
    try:
        from IPython.display import IFrame
    except ImportError as exc:
        raise ImportError(
            "IPython is required for display_perfetto(). "
            "Install it with: pip install ipython"
        ) from exc

    # Serialize, then base64-encode so the payload can be placed in the URL.
    trace_json = json.dumps(trace_data)
    trace_b64 = base64.b64encode(trace_json.encode()).decode()

    # NOTE(review): confirm that ui.perfetto.dev accepts an inline base64 trace
    # through the `s=` fragment parameter; if it does not, the frame will load
    # the UI without the trace and save_and_link() is the fallback.
    perfetto_url = f"https://ui.perfetto.dev/#!/?s={trace_b64}"

    return IFrame(perfetto_url, width="100%", height=height)


def save_and_link(
    trace_data: Dict[str, Any], output_path: str, link_text: Optional[str] = None
):
    """
    Save trace to file and display a link to open in Perfetto UI.

    This is useful for large traces that are too big to embed in a URL.

    Args:
        trace_data: Chrome Trace Event Format dictionary
        output_path: Path to save the trace file
        link_text: Custom link text (default: "Open trace in Perfetto UI")

    Returns:
        IPython HTML object with download link and instructions

    Raises:
        ImportError: If IPython is not installed (checked before the file
            is written).

    Example:
        >>> save_and_link(trace_data, '../data/build_trace.json')

    Note:
        The user will need to manually upload the saved file to
        https://ui.perfetto.dev
    """
    try:
        from IPython.display import HTML
    except ImportError as exc:
        raise ImportError(
            "IPython is required for save_and_link(). "
            "Install it with: pip install ipython"
        ) from exc

    # Explicit encoding so the output does not depend on the locale.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(trace_data, f, indent=2)

    if link_text is None:
        link_text = "Open trace in Perfetto UI"

    # Both values are caller-supplied text placed inside markup, so escape
    # them instead of letting characters such as "<" or "&" alter the page.
    safe_path = html.escape(str(output_path))
    safe_link_text = html.escape(link_text)

    # NOTE(review): the element structure below is a minimal layout for the
    # visible instruction text; confirm tags/styling against the intended
    # notebook rendering.
    markup = f"""
    <div>
        <p><b>Trace saved to:</b> <code>{safe_path}</code></p>
        <p><b>To view in Perfetto UI:</b></p>
        <ol>
            <li>Go to <a href="https://ui.perfetto.dev" target="_blank">{safe_link_text}</a></li>
            <li>Click "Open trace file" and select: <code>{safe_path}</code></li>
        </ol>
        <p>Or drag and drop the file directly into the Perfetto UI.</p>
    </div>
    """

    return HTML(markup)


def get_trace_summary(trace_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Get summary statistics from trace data.

    Args:
        trace_data: Chrome Trace Event Format dictionary

    Returns:
        Dictionary with summary statistics:
            event_count: number of entries in "traceEvents"
            total_duration_s: sum of every event's "dur" (microseconds)
                converted to seconds.  This is the summed duration of all
                events, not the elapsed wall-clock span of the trace.
            categories: mapping of "cat" value -> number of events
                (events without "cat" are counted as "unknown")
            worker_count: number of distinct non-None "tid" values

    Example:
        >>> summary = get_trace_summary(trace_data)
        >>> print(f"Total events: {summary['event_count']}")
        >>> print(f"Total time: {summary['total_duration_s']:.2f}s")
    """
    events = trace_data.get("traceEvents", [])

    if not events:
        return {
            "event_count": 0,
            "total_duration_s": 0.0,
            "categories": {},
            "worker_count": 0,
        }

    categories: Dict[str, int] = {}
    total_duration_us = 0
    worker_ids = set()

    for event in events:
        cat = event.get("cat", "unknown")
        categories[cat] = categories.get(cat, 0) + 1

        total_duration_us += event.get("dur", 0)

        tid = event.get("tid")
        if tid is not None:
            worker_ids.add(tid)

    return {
        "event_count": len(events),
        "total_duration_s": total_duration_us / 1e6,
        "categories": categories,
        "worker_count": len(worker_ids),
    }


def print_trace_summary(trace_data: Dict[str, Any]) -> None:
    """
    Print a formatted summary of trace data.

    Categories are listed from most to least frequent.

    Args:
        trace_data: Chrome Trace Event Format dictionary

    Example:
        >>> print_trace_summary(trace_data)
        === Trace Summary ===
        Total events: 1,234
        Total duration: 123.45s
        Workers: 8
        ...
    """
    summary = get_trace_summary(trace_data)

    print("=== Trace Summary ===")
    print(f"Total events: {summary['event_count']:,}")
    print(f"Total duration: {summary['total_duration_s']:.2f}s")
    print(f"Workers: {summary['worker_count']}")

    if summary["categories"]:
        print("\nEvents by category:")
        for cat, count in sorted(
            summary["categories"].items(), key=lambda x: x[1], reverse=True
        ):
            print(f"  {cat:15} {count:6,} events")