feat: add statistical significance to perf report with z-score thresholds (#9305)

## Summary

Replace fixed 10%/20% perf delta thresholds with dynamic σ-based
classification using z-scores, eliminating false alarms from naturally
noisy duration metrics (10-17% CV).

## Changes

- **What**:
- Run each perf test 3× (`--repeat-each=3`) and report the mean,
reducing single-run noise
- Download last 5 successful main branch perf artifacts to compute
historical μ/σ per metric
- Replace fixed threshold flags with z-score significance: `⚠️
regression` (z>2), ` neutral/improvement`, `🔇 noisy` (CV>50%)
  - Add collapsible historical variance table (μ, σ, CV) to PR comment
- Graceful cold start: falls back to simple delta table until ≥2
historical runs exist
- New `scripts/perf-stats.ts` module with `computeStats`, `zScore`,
`classifyChange`
  - 18 unit tests for stats functions

- **CI time impact**: ~3 min → ~5-6 min (repeat-each adds ~2 min,
historical download <10s)

## Review Focus

- The `gh api` call in the new "Download historical perf baselines"
step: it queries the last 5 successful push runs on the base branch. The
`gh` CLI is available natively on `ubuntu-latest` runners and
auto-authenticates with `GITHUB_TOKEN`.
- `getHistoricalStats` averages per-run measurements before computing
cross-run σ — this is intentional since historical artifacts may also
contain repeated measurements after this change lands.
- The `noisy` classification (CV>50%) suppresses metrics like `layouts`
that hover near 0 and have meaningless percentage swings.

┆Issue is synchronized with this [Notion
page](https://www.notion.so/PR-9305-feat-add-statistical-significance-to-perf-report-with-z-score-thresholds-3156d73d3650818d9360eeafd9ae7dc1)
by [Unito](https://www.unito.io)
This commit is contained in:
Christian Byrne
2026-03-04 16:16:53 -08:00
committed by GitHub
parent b8edb11ac1
commit 7b316eb9a2
5 changed files with 471 additions and 62 deletions

View File

@@ -1,4 +1,14 @@
import { existsSync, readFileSync } from 'node:fs'
import { existsSync, readFileSync, readdirSync } from 'node:fs'
import { join } from 'node:path'
import type { MetricStats } from './perf-stats'
import {
classifyChange,
computeStats,
formatSignificance,
isNoteworthy,
zScore
} from './perf-stats'
interface PerfMeasurement {
name: string
@@ -20,12 +30,76 @@ interface PerfReport {
const CURRENT_PATH = 'test-results/perf-metrics.json'
const BASELINE_PATH = 'temp/perf-baseline/perf-metrics.json'
const HISTORY_DIR = 'temp/perf-history'
function formatDelta(pct: number): string {
if (pct >= 20) return `+${pct.toFixed(0)}% 🔴`
if (pct >= 10) return `+${pct.toFixed(0)}% 🟠`
if (pct > -10) return `${pct >= 0 ? '+' : ''}${pct.toFixed(0)}% ⚪`
return `${pct.toFixed(0)}% 🟢`
type MetricKey = 'styleRecalcs' | 'layouts' | 'taskDurationMs'
const REPORTED_METRICS: { key: MetricKey; label: string; unit: string }[] = [
{ key: 'styleRecalcs', label: 'style recalcs', unit: '' },
{ key: 'layouts', label: 'layouts', unit: '' },
{ key: 'taskDurationMs', label: 'task duration', unit: 'ms' }
]
function groupByName(
measurements: PerfMeasurement[]
): Map<string, PerfMeasurement[]> {
const map = new Map<string, PerfMeasurement[]>()
for (const m of measurements) {
const list = map.get(m.name) ?? []
list.push(m)
map.set(m.name, list)
}
return map
}
function loadHistoricalReports(): PerfReport[] {
if (!existsSync(HISTORY_DIR)) return []
const reports: PerfReport[] = []
for (const dir of readdirSync(HISTORY_DIR)) {
const filePath = join(HISTORY_DIR, dir, 'perf-metrics.json')
if (!existsSync(filePath)) continue
try {
reports.push(JSON.parse(readFileSync(filePath, 'utf-8')) as PerfReport)
} catch {
console.warn(`Skipping malformed perf history: ${filePath}`)
}
}
return reports
}
function getHistoricalStats(
reports: PerfReport[],
testName: string,
metric: MetricKey
): MetricStats {
const values: number[] = []
for (const r of reports) {
const group = groupByName(r.measurements)
const samples = group.get(testName)
if (samples) {
const mean =
samples.reduce((sum, s) => sum + s[metric], 0) / samples.length
values.push(mean)
}
}
return computeStats(values)
}
function computeCV(stats: MetricStats): number {
return stats.mean > 0 ? (stats.stddev / stats.mean) * 100 : 0
}
function formatValue(value: number, unit: string): string {
return unit === 'ms' ? `${value.toFixed(0)}ms` : `${value.toFixed(0)}`
}
function formatDelta(pct: number | null): string {
if (pct === null) return '—'
const sign = pct >= 0 ? '+' : ''
return `${sign}${pct.toFixed(0)}%`
}
function meanMetric(samples: PerfMeasurement[], key: MetricKey): number {
return samples.reduce((sum, s) => sum + s[key], 0) / samples.length
}
function formatBytes(bytes: number): string {
@@ -34,18 +108,167 @@ function formatBytes(bytes: number): string {
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
}
function calcDelta(
baseline: number,
current: number
): { pct: number; isNew: boolean } {
if (baseline > 0) {
return { pct: ((current - baseline) / baseline) * 100, isNew: false }
function renderFullReport(
prGroups: Map<string, PerfMeasurement[]>,
baseline: PerfReport,
historical: PerfReport[]
): string[] {
const lines: string[] = []
const baselineGroups = groupByName(baseline.measurements)
const tableHeader = [
'| Metric | Baseline | PR (n=3) | Δ | Sig |',
'|--------|----------|----------|---|-----|'
]
const flaggedRows: string[] = []
const allRows: string[] = []
for (const [testName, prSamples] of prGroups) {
const baseSamples = baselineGroups.get(testName)
for (const { key, label, unit } of REPORTED_METRICS) {
const prValues = prSamples.map((s) => s[key])
const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
const histStats = getHistoricalStats(historical, testName, key)
const cv = computeCV(histStats)
if (!baseSamples?.length) {
allRows.push(
`| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
)
continue
}
const baseVal = meanMetric(baseSamples, key)
const deltaPct =
baseVal === 0
? prMean === 0
? 0
: null
: ((prMean - baseVal) / baseVal) * 100
const z = zScore(prMean, histStats)
const sig = classifyChange(z, cv)
const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
allRows.push(row)
if (isNoteworthy(sig)) {
flaggedRows.push(row)
}
}
}
return current > 0 ? { pct: Infinity, isNew: true } : { pct: 0, isNew: false }
if (flaggedRows.length > 0) {
lines.push(
`⚠️ **${flaggedRows.length} regression${flaggedRows.length > 1 ? 's' : ''} detected**`,
'',
...tableHeader,
...flaggedRows,
''
)
} else {
lines.push('No regressions detected.', '')
}
lines.push(
`<details><summary>All metrics</summary>`,
'',
...tableHeader,
...allRows,
'',
'</details>',
''
)
lines.push(
`<details><summary>Historical variance (last ${historical.length} runs)</summary>`,
'',
'| Metric | μ | σ | CV |',
'|--------|---|---|-----|'
)
for (const [testName] of prGroups) {
for (const { key, label, unit } of REPORTED_METRICS) {
const stats = getHistoricalStats(historical, testName, key)
if (stats.n < 2) continue
const cv = computeCV(stats)
lines.push(
`| ${testName}: ${label} | ${formatValue(stats.mean, unit)} | ${formatValue(stats.stddev, unit)} | ${cv.toFixed(1)}% |`
)
}
}
lines.push('', '</details>')
return lines
}
function formatDeltaCell(delta: { pct: number; isNew: boolean }): string {
return delta.isNew ? 'new 🔴' : formatDelta(delta.pct)
function renderColdStartReport(
prGroups: Map<string, PerfMeasurement[]>,
baseline: PerfReport,
historicalCount: number
): string[] {
const lines: string[] = []
const baselineGroups = groupByName(baseline.measurements)
lines.push(
`> Collecting baseline variance data (${historicalCount}/5 runs). Significance will appear after 2 main branch runs.`,
'',
'| Metric | Baseline | PR | Δ |',
'|--------|----------|-----|---|'
)
for (const [testName, prSamples] of prGroups) {
const baseSamples = baselineGroups.get(testName)
for (const { key, label, unit } of REPORTED_METRICS) {
const prValues = prSamples.map((s) => s[key])
const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
if (!baseSamples?.length) {
lines.push(
`| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
)
continue
}
const baseVal = meanMetric(baseSamples, key)
const deltaPct =
baseVal === 0
? prMean === 0
? 0
: null
: ((prMean - baseVal) / baseVal) * 100
lines.push(
`| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} |`
)
}
}
return lines
}
function renderNoBaselineReport(
prGroups: Map<string, PerfMeasurement[]>
): string[] {
const lines: string[] = []
lines.push(
'No baseline found — showing absolute values.\n',
'| Metric | Value |',
'|--------|-------|'
)
for (const [testName, prSamples] of prGroups) {
const prMean = (key: MetricKey) =>
prSamples.reduce((sum, s) => sum + s[key], 0) / prSamples.length
lines.push(
`| ${testName}: style recalcs | ${prMean('styleRecalcs').toFixed(0)} |`
)
lines.push(`| ${testName}: layouts | ${prMean('layouts').toFixed(0)} |`)
lines.push(
`| ${testName}: task duration | ${prMean('taskDurationMs').toFixed(0)}ms |`
)
const heapMean =
prSamples.reduce((sum, s) => sum + s.heapDeltaBytes, 0) / prSamples.length
lines.push(`| ${testName}: heap delta | ${formatBytes(heapMean)} |`)
}
return lines
}
function main() {
@@ -62,55 +285,18 @@ function main() {
? JSON.parse(readFileSync(BASELINE_PATH, 'utf-8'))
: null
const historical = loadHistoricalReports()
const prGroups = groupByName(current.measurements)
const lines: string[] = []
lines.push('## ⚡ Performance Report\n')
if (baseline) {
lines.push(
'| Metric | Baseline | PR | Δ |',
'|--------|----------|-----|---|'
)
for (const m of current.measurements) {
const base = baseline.measurements.find((b) => b.name === m.name)
if (!base) {
lines.push(`| ${m.name}: style recalcs | — | ${m.styleRecalcs} | new |`)
lines.push(`| ${m.name}: layouts | — | ${m.layouts} | new |`)
lines.push(
`| ${m.name}: task duration | — | ${m.taskDurationMs.toFixed(0)}ms | new |`
)
continue
}
const recalcDelta = calcDelta(base.styleRecalcs, m.styleRecalcs)
lines.push(
`| ${m.name}: style recalcs | ${base.styleRecalcs} | ${m.styleRecalcs} | ${formatDeltaCell(recalcDelta)} |`
)
const layoutDelta = calcDelta(base.layouts, m.layouts)
lines.push(
`| ${m.name}: layouts | ${base.layouts} | ${m.layouts} | ${formatDeltaCell(layoutDelta)} |`
)
const taskDelta = calcDelta(base.taskDurationMs, m.taskDurationMs)
lines.push(
`| ${m.name}: task duration | ${base.taskDurationMs.toFixed(0)}ms | ${m.taskDurationMs.toFixed(0)}ms | ${formatDeltaCell(taskDelta)} |`
)
}
if (baseline && historical.length >= 2) {
lines.push(...renderFullReport(prGroups, baseline, historical))
} else if (baseline) {
lines.push(...renderColdStartReport(prGroups, baseline, historical.length))
} else {
lines.push(
'No baseline found — showing absolute values.\n',
'| Metric | Value |',
'|--------|-------|'
)
for (const m of current.measurements) {
lines.push(`| ${m.name}: style recalcs | ${m.styleRecalcs} |`)
lines.push(`| ${m.name}: layouts | ${m.layouts} |`)
lines.push(
`| ${m.name}: task duration | ${m.taskDurationMs.toFixed(0)}ms |`
)
lines.push(`| ${m.name}: heap delta | ${formatBytes(m.heapDeltaBytes)} |`)
}
lines.push(...renderNoBaselineReport(prGroups))
}
lines.push('\n<details><summary>Raw data</summary>\n')