Files
ComfyUI_frontend/scripts/perf-report.ts
Christian Byrne 7b316eb9a2 feat: add statistical significance to perf report with z-score thresholds (#9305)
## Summary

Replace fixed 10%/20% perf delta thresholds with dynamic σ-based
classification using z-scores, eliminating false alarms from naturally
noisy duration metrics (10-17% CV).

## Changes

- **What**:
- Run each perf test 3× (`--repeat-each=3`) and report the mean,
reducing single-run noise
- Download last 5 successful main branch perf artifacts to compute
historical μ/σ per metric
- Replace fixed threshold flags with z-score significance: `⚠️
regression` (z>2), ` neutral/improvement`, `🔇 noisy` (CV>50%)
  - Add collapsible historical variance table (μ, σ, CV) to PR comment
- Graceful cold start: falls back to simple delta table until ≥2
historical runs exist
- New `scripts/perf-stats.ts` module with `computeStats`, `zScore`,
`classifyChange`
  - 18 unit tests for stats functions

- **CI time impact**: ~3 min → ~5-6 min (repeat-each adds ~2 min,
historical download <10s)

## Review Focus

- The `gh api` call in the new "Download historical perf baselines"
step: it queries the last 5 successful push runs on the base branch. The
`gh` CLI is available natively on `ubuntu-latest` runners and
auto-authenticates with `GITHUB_TOKEN`.
- `getHistoricalStats` averages per-run measurements before computing
cross-run σ — this is intentional since historical artifacts may also
contain repeated measurements after this change lands.
- The `noisy` classification (CV>50%) suppresses metrics like `layouts`
that hover near 0 and have meaningless percentage swings.

┆Issue is synchronized with this [Notion
page](https://www.notion.so/PR-9305-feat-add-statistical-significance-to-perf-report-with-z-score-thresholds-3156d73d3650818d9360eeafd9ae7dc1)
by [Unito](https://www.unito.io)
2026-03-04 16:16:53 -08:00

312 lines
8.7 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { existsSync, readFileSync, readdirSync } from 'node:fs'
import { join } from 'node:path'
import type { MetricStats } from './perf-stats'
import {
classifyChange,
computeStats,
formatSignificance,
isNoteworthy,
zScore
} from './perf-stats'
interface PerfMeasurement {
name: string
durationMs: number
styleRecalcs: number
styleRecalcDurationMs: number
layouts: number
layoutDurationMs: number
taskDurationMs: number
heapDeltaBytes: number
}
interface PerfReport {
timestamp: string
gitSha: string
branch: string
measurements: PerfMeasurement[]
}
const CURRENT_PATH = 'test-results/perf-metrics.json'
const BASELINE_PATH = 'temp/perf-baseline/perf-metrics.json'
const HISTORY_DIR = 'temp/perf-history'
type MetricKey = 'styleRecalcs' | 'layouts' | 'taskDurationMs'
const REPORTED_METRICS: { key: MetricKey; label: string; unit: string }[] = [
{ key: 'styleRecalcs', label: 'style recalcs', unit: '' },
{ key: 'layouts', label: 'layouts', unit: '' },
{ key: 'taskDurationMs', label: 'task duration', unit: 'ms' }
]
function groupByName(
measurements: PerfMeasurement[]
): Map<string, PerfMeasurement[]> {
const map = new Map<string, PerfMeasurement[]>()
for (const m of measurements) {
const list = map.get(m.name) ?? []
list.push(m)
map.set(m.name, list)
}
return map
}
function loadHistoricalReports(): PerfReport[] {
if (!existsSync(HISTORY_DIR)) return []
const reports: PerfReport[] = []
for (const dir of readdirSync(HISTORY_DIR)) {
const filePath = join(HISTORY_DIR, dir, 'perf-metrics.json')
if (!existsSync(filePath)) continue
try {
reports.push(JSON.parse(readFileSync(filePath, 'utf-8')) as PerfReport)
} catch {
console.warn(`Skipping malformed perf history: ${filePath}`)
}
}
return reports
}
function getHistoricalStats(
reports: PerfReport[],
testName: string,
metric: MetricKey
): MetricStats {
const values: number[] = []
for (const r of reports) {
const group = groupByName(r.measurements)
const samples = group.get(testName)
if (samples) {
const mean =
samples.reduce((sum, s) => sum + s[metric], 0) / samples.length
values.push(mean)
}
}
return computeStats(values)
}
function computeCV(stats: MetricStats): number {
return stats.mean > 0 ? (stats.stddev / stats.mean) * 100 : 0
}
function formatValue(value: number, unit: string): string {
return unit === 'ms' ? `${value.toFixed(0)}ms` : `${value.toFixed(0)}`
}
function formatDelta(pct: number | null): string {
if (pct === null) return '—'
const sign = pct >= 0 ? '+' : ''
return `${sign}${pct.toFixed(0)}%`
}
function meanMetric(samples: PerfMeasurement[], key: MetricKey): number {
return samples.reduce((sum, s) => sum + s[key], 0) / samples.length
}
function formatBytes(bytes: number): string {
if (Math.abs(bytes) < 1024) return `${bytes} B`
if (Math.abs(bytes) < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
}
function renderFullReport(
prGroups: Map<string, PerfMeasurement[]>,
baseline: PerfReport,
historical: PerfReport[]
): string[] {
const lines: string[] = []
const baselineGroups = groupByName(baseline.measurements)
const tableHeader = [
'| Metric | Baseline | PR (n=3) | Δ | Sig |',
'|--------|----------|----------|---|-----|'
]
const flaggedRows: string[] = []
const allRows: string[] = []
for (const [testName, prSamples] of prGroups) {
const baseSamples = baselineGroups.get(testName)
for (const { key, label, unit } of REPORTED_METRICS) {
const prValues = prSamples.map((s) => s[key])
const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
const histStats = getHistoricalStats(historical, testName, key)
const cv = computeCV(histStats)
if (!baseSamples?.length) {
allRows.push(
`| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
)
continue
}
const baseVal = meanMetric(baseSamples, key)
const deltaPct =
baseVal === 0
? prMean === 0
? 0
: null
: ((prMean - baseVal) / baseVal) * 100
const z = zScore(prMean, histStats)
const sig = classifyChange(z, cv)
const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
allRows.push(row)
if (isNoteworthy(sig)) {
flaggedRows.push(row)
}
}
}
if (flaggedRows.length > 0) {
lines.push(
`⚠️ **${flaggedRows.length} regression${flaggedRows.length > 1 ? 's' : ''} detected**`,
'',
...tableHeader,
...flaggedRows,
''
)
} else {
lines.push('No regressions detected.', '')
}
lines.push(
`<details><summary>All metrics</summary>`,
'',
...tableHeader,
...allRows,
'',
'</details>',
''
)
lines.push(
`<details><summary>Historical variance (last ${historical.length} runs)</summary>`,
'',
'| Metric | μ | σ | CV |',
'|--------|---|---|-----|'
)
for (const [testName] of prGroups) {
for (const { key, label, unit } of REPORTED_METRICS) {
const stats = getHistoricalStats(historical, testName, key)
if (stats.n < 2) continue
const cv = computeCV(stats)
lines.push(
`| ${testName}: ${label} | ${formatValue(stats.mean, unit)} | ${formatValue(stats.stddev, unit)} | ${cv.toFixed(1)}% |`
)
}
}
lines.push('', '</details>')
return lines
}
function renderColdStartReport(
prGroups: Map<string, PerfMeasurement[]>,
baseline: PerfReport,
historicalCount: number
): string[] {
const lines: string[] = []
const baselineGroups = groupByName(baseline.measurements)
lines.push(
`> Collecting baseline variance data (${historicalCount}/5 runs). Significance will appear after 2 main branch runs.`,
'',
'| Metric | Baseline | PR | Δ |',
'|--------|----------|-----|---|'
)
for (const [testName, prSamples] of prGroups) {
const baseSamples = baselineGroups.get(testName)
for (const { key, label, unit } of REPORTED_METRICS) {
const prValues = prSamples.map((s) => s[key])
const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
if (!baseSamples?.length) {
lines.push(
`| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
)
continue
}
const baseVal = meanMetric(baseSamples, key)
const deltaPct =
baseVal === 0
? prMean === 0
? 0
: null
: ((prMean - baseVal) / baseVal) * 100
lines.push(
`| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} |`
)
}
}
return lines
}
function renderNoBaselineReport(
prGroups: Map<string, PerfMeasurement[]>
): string[] {
const lines: string[] = []
lines.push(
'No baseline found — showing absolute values.\n',
'| Metric | Value |',
'|--------|-------|'
)
for (const [testName, prSamples] of prGroups) {
const prMean = (key: MetricKey) =>
prSamples.reduce((sum, s) => sum + s[key], 0) / prSamples.length
lines.push(
`| ${testName}: style recalcs | ${prMean('styleRecalcs').toFixed(0)} |`
)
lines.push(`| ${testName}: layouts | ${prMean('layouts').toFixed(0)} |`)
lines.push(
`| ${testName}: task duration | ${prMean('taskDurationMs').toFixed(0)}ms |`
)
const heapMean =
prSamples.reduce((sum, s) => sum + s.heapDeltaBytes, 0) / prSamples.length
lines.push(`| ${testName}: heap delta | ${formatBytes(heapMean)} |`)
}
return lines
}
function main() {
if (!existsSync(CURRENT_PATH)) {
process.stdout.write(
'## ⚡ Performance Report\n\nNo perf metrics found. Perf tests may not have run.\n'
)
process.exit(0)
}
const current: PerfReport = JSON.parse(readFileSync(CURRENT_PATH, 'utf-8'))
const baseline: PerfReport | null = existsSync(BASELINE_PATH)
? JSON.parse(readFileSync(BASELINE_PATH, 'utf-8'))
: null
const historical = loadHistoricalReports()
const prGroups = groupByName(current.measurements)
const lines: string[] = []
lines.push('## ⚡ Performance Report\n')
if (baseline && historical.length >= 2) {
lines.push(...renderFullReport(prGroups, baseline, historical))
} else if (baseline) {
lines.push(...renderColdStartReport(prGroups, baseline, historical.length))
} else {
lines.push(...renderNoBaselineReport(prGroups))
}
lines.push('\n<details><summary>Raw data</summary>\n')
lines.push('```json')
lines.push(JSON.stringify(current, null, 2))
lines.push('```')
lines.push('\n</details>')
process.stdout.write(lines.join('\n') + '\n')
}
main()