Files
John Shumway 270b1445b1 [rocm-libraries] ROCm/rocm-libraries#4259 (commit 223d90c)
Add multi-file trace parsing and analysis pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends build time analysis from ROCm/composable_kernel#3644 to handle
multiple trace files across build directories (see #4229):

- pipeline.py: Generic pipeline framework with fluent interface for
composable data processing. Provides parallel processing, progress
tracking, and error handling independent of trace-specific code.
Processes thousands of trace files at default resolution in minutes,
aggregating results into in-memory DataFrames for analysis.
- parse_build.py: Parse all trace files in a build directory
- build_analysis_example.ipynb: Demonstrates pipeline aggregation across
all build files

The pipeline design improves capability (composable operations),
performance (parallel processing), and user-friendliness (fluent API) of
the analysis modules. It enables analyzing compilation patterns across
the entire codebase with all trace data available in pandas DataFrames
for interactive exploration.
2026-02-17 21:14:11 +00:00

97 lines
3.2 KiB
Python

# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
"""
Utility functions for trace analysis.
Helper functions for file discovery, path handling, and other common operations.
"""
import subprocess
import pandas as pd
from pathlib import Path
from typing import List
def find_trace_files(trace_dir: Path) -> List[Path]:
    """
    Find all compiler trace files (``*.cpp.json``) in a directory tree.

    Uses the Unix 'find' command when available (2-5x faster than Python),
    with automatic fallback to Python's rglob for cross-platform compatibility.
    Results are sorted so both code paths return paths in the same,
    deterministic order.

    Args:
        trace_dir: Directory to search recursively for trace files

    Returns:
        Sorted list of Path objects pointing to ``*.cpp.json`` files
        (empty if the directory does not exist or contains none)

    Example:
        >>> from pathlib import Path
        >>> from trace_analysis import find_trace_files
        >>> trace_files = find_trace_files(Path("build/CMakeFiles"))
        >>> print(f"Found {len(trace_files)} trace files")
    """
    try:
        # Try Unix find (2-5x faster than Python). check=True makes a
        # nonzero exit status (e.g. a missing directory) raise
        # CalledProcessError, a SubprocessError, triggering the fallback.
        result = subprocess.run(
            ["find", str(trace_dir), "-name", "*.cpp.json", "-type", "f"],
            capture_output=True,
            text=True,
            timeout=30,
            check=True,
        )
        # 'find' prints one path per line; drop the trailing empty entry.
        json_files = [Path(p) for p in result.stdout.strip().split("\n") if p]
    except (subprocess.SubprocessError, FileNotFoundError, OSError):
        # Fallback to Python (cross-platform: Windows, 'find' absent, timeout)
        print("Using Python to find trace files (this may be slower)...")
        json_files = list(trace_dir.rglob("*.cpp.json"))
    # Sort for deterministic ordering regardless of which branch ran.
    return sorted(json_files)
def read_trace_files(json_files: List[Path], workers: int = -1) -> List["pd.DataFrame"]:
    """
    Parse trace files in parallel and return a list of DataFrames.

    Convenience wrapper around the Pipeline API: parses many trace files
    in parallel, with progress tracking, and collects the results.

    Args:
        json_files: Paths to the trace JSON files to parse
        workers: Parallelism control:
            - -1: Use all available CPUs (default)
            - None: Sequential processing (single-threaded)
            - N > 0: Use N worker processes

    Returns:
        List of parsed DataFrames, one per input file

    Example:
        >>> from pathlib import Path
        >>> from trace_analysis import find_trace_files, read_trace_files
        >>>
        >>> # Find and parse all trace files
        >>> trace_files = find_trace_files(Path("build/CMakeFiles"))
        >>> dataframes = read_trace_files(trace_files, workers=8)
        >>> print(f"Parsed {len(dataframes)} files")
        >>>
        >>> # Use Pipeline directly for more control
        >>> from trace_analysis import Pipeline
        >>> from trace_analysis.parse_file import parse_file
        >>>
        >>> pipeline = Pipeline(trace_files).map(parse_file, workers=8)
        >>> all_events, metadata = pipeline.tee(
        ...     lambda dfs: pd.concat(dfs, ignore_index=True),
        ...     lambda dfs: [get_metadata(df) for df in dfs]
        ... )
    """
    # Imported lazily to avoid a circular import at module load time.
    from trace_analysis.pipeline import Pipeline
    from trace_analysis.parse_file import parse_file

    # Build the pipeline step by step rather than as one chained expression.
    pipeline = Pipeline(json_files)
    parsed = pipeline.map(parse_file, workers=workers, desc="Parsing trace files")
    return parsed.collect()