feat: add statistical significance to perf report with z-score thresholds (#9305)

## Summary

Replace the fixed 10%/20% perf delta thresholds with dynamic σ-based
classification using z-scores, eliminating false alarms from naturally
noisy duration metrics (10-17% CV). At ~15% CV, a single run can land
outside a ±10% band roughly half the time, so the fixed thresholds fired
on pure noise.

## Changes

- **What**:
  - Run each perf test 3× (`--repeat-each=3`) and report the mean,
    reducing single-run noise
  - Download the last 5 successful main-branch perf artifacts to compute
    historical μ/σ per metric
  - Replace the fixed-threshold flags with z-score significance labels:
    `⚠️ regression` (z > 2), plain `neutral`/`improvement`, and `🔇 noisy`
    (CV > 50%); a short sketch of the classification follows this list
  - Add a collapsible historical variance table (μ, σ, CV) to the PR comment
  - Graceful cold start: fall back to a simple delta table until ≥2
    historical runs exist
  - New `scripts/perf-stats.ts` module with `computeStats`, `zScore`, and
    `classifyChange`
  - 18 unit tests for the stats functions

- **CI time impact**: ~3 min → ~5-6 min (`--repeat-each=3` adds ~2 min;
  the historical download takes <10 s)
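
To make the new classification concrete, here is a rough sketch of how the
pieces fit together, using invented numbers (the real wiring lives in
`scripts/perf-report.ts` below):

```ts
import { classifyChange, computeStats, formatSignificance, zScore } from './perf-stats'

// Per-run means for one metric across 5 historical main-branch runs:
const history = computeStats([100, 104, 98, 102, 96]) // mean=100, stddev≈3.16

// A PR mean of 112 sits ~3.8σ above the historical mean:
const z = zScore(112, history) // ≈ 3.8
const cv = (history.stddev / history.mean) * 100 // ≈ 3.2%, well under the 50% cutoff

console.log(classifyChange(z, cv)) // 'regression' (z > 2)
console.log(formatSignificance('regression', z)) // '⚠️ z=3.8'
```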

## Review Focus

- The `gh api` call in the new "Download historical perf baselines" step:
  it queries the last 5 successful push runs on the base branch. The `gh`
  CLI comes preinstalled on `ubuntu-latest` runners and authenticates via
  the `GH_TOKEN` env var, which the step sets from `secrets.GITHUB_TOKEN`.
- `getHistoricalStats` averages each run's repeated measurements before
  computing cross-run σ; this is intentional, since historical artifacts
  will also contain repeated measurements once this change lands. See the
  worked example after this list.
- The `noisy` classification (CV > 50%) suppresses metrics like `layouts`
  that hover near 0, where percentage swings are meaningless.
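
A small worked example of the two points above, with invented `layouts`
samples (mirroring what `getHistoricalStats` does):

```ts
import { classifyChange, computeStats, zScore } from './perf-stats'

// Three historical runs, each carrying 3 repeated measurements
// (post --repeat-each=3). Average within each run first, then take
// σ across the per-run means:
const runs = [
  [0, 1, 2],
  [0, 0, 0],
  [1, 2, 3]
]
const perRunMeans = runs.map((r) => r.reduce((a, b) => a + b, 0) / r.length) // [1, 0, 2]
const stats = computeStats(perRunMeans) // mean=1, stddev=1

// CV is 100% here, so any delta on this metric is suppressed as noise:
const cv = (stats.stddev / stats.mean) * 100 // 100 > 50
console.log(classifyChange(zScore(4, stats), cv)) // 'noisy', even though z = 3
```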

Commit 7b316eb9a2 (parent b8edb11ac1), authored by Christian Byrne,
committed by GitHub, 2026-03-04 16:16:53 -08:00
5 changed files with 471 additions and 62 deletions

.github/workflows/ci-perf-report.yaml

@@ -45,7 +45,7 @@ jobs:
       - name: Run performance tests
         id: perf
         continue-on-error: true
-        run: pnpm exec playwright test --project=performance --workers=1
+        run: pnpm exec playwright test --project=performance --workers=1 --repeat-each=3
       - name: Upload perf metrics
         if: always()
@@ -61,6 +61,7 @@ jobs:
     if: github.event_name == 'pull_request'
     runs-on: ubuntu-latest
     permissions:
+      actions: read
       contents: read
       pull-requests: write
@@ -90,6 +91,31 @@ jobs:
           path: temp/perf-baseline/
           if_no_artifact_found: warn
+      - name: Download historical perf baselines
+        continue-on-error: true
+        run: |
+          RUNS=$(gh api \
+            "/repos/${{ github.repository }}/actions/workflows/ci-perf-report.yaml/runs?branch=${{ github.event.pull_request.base.ref }}&event=push&status=success&per_page=5" \
+            --jq '.workflow_runs[].id' || true)
+          if [ -z "$RUNS" ]; then
+            echo "No historical runs available"
+            exit 0
+          fi
+          mkdir -p temp/perf-history
+          INDEX=0
+          for RUN_ID in $RUNS; do
+            DIR="temp/perf-history/$INDEX"
+            mkdir -p "$DIR"
+            gh run download "$RUN_ID" -n perf-metrics -D "$DIR/" 2>/dev/null || true
+            INDEX=$((INDEX + 1))
+          done
+          echo "Downloaded $(ls temp/perf-history/*/perf-metrics.json 2>/dev/null | wc -l) historical baselines"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Generate perf report
         run: npx --yes tsx scripts/perf-report.ts > perf-report.md

scripts/perf-report.ts

@@ -1,4 +1,14 @@
-import { existsSync, readFileSync } from 'node:fs'
+import { existsSync, readFileSync, readdirSync } from 'node:fs'
+import { join } from 'node:path'
+import type { MetricStats } from './perf-stats'
+import {
+  classifyChange,
+  computeStats,
+  formatSignificance,
+  isNoteworthy,
+  zScore
+} from './perf-stats'
 
 interface PerfMeasurement {
   name: string
@@ -20,12 +30,76 @@ interface PerfReport {
 const CURRENT_PATH = 'test-results/perf-metrics.json'
 const BASELINE_PATH = 'temp/perf-baseline/perf-metrics.json'
+const HISTORY_DIR = 'temp/perf-history'
 
-function formatDelta(pct: number): string {
-  if (pct >= 20) return `+${pct.toFixed(0)}% 🔴`
-  if (pct >= 10) return `+${pct.toFixed(0)}% 🟠`
-  if (pct > -10) return `${pct >= 0 ? '+' : ''}${pct.toFixed(0)}% ⚪`
-  return `${pct.toFixed(0)}% 🟢`
-}
+type MetricKey = 'styleRecalcs' | 'layouts' | 'taskDurationMs'
+
+const REPORTED_METRICS: { key: MetricKey; label: string; unit: string }[] = [
+  { key: 'styleRecalcs', label: 'style recalcs', unit: '' },
+  { key: 'layouts', label: 'layouts', unit: '' },
+  { key: 'taskDurationMs', label: 'task duration', unit: 'ms' }
+]
+
+function groupByName(
+  measurements: PerfMeasurement[]
+): Map<string, PerfMeasurement[]> {
+  const map = new Map<string, PerfMeasurement[]>()
+  for (const m of measurements) {
+    const list = map.get(m.name) ?? []
+    list.push(m)
+    map.set(m.name, list)
+  }
+  return map
+}
+
+function loadHistoricalReports(): PerfReport[] {
+  if (!existsSync(HISTORY_DIR)) return []
+  const reports: PerfReport[] = []
+  for (const dir of readdirSync(HISTORY_DIR)) {
+    const filePath = join(HISTORY_DIR, dir, 'perf-metrics.json')
+    if (!existsSync(filePath)) continue
+    try {
+      reports.push(JSON.parse(readFileSync(filePath, 'utf-8')) as PerfReport)
+    } catch {
+      console.warn(`Skipping malformed perf history: ${filePath}`)
+    }
+  }
+  return reports
+}
+
+function getHistoricalStats(
+  reports: PerfReport[],
+  testName: string,
+  metric: MetricKey
+): MetricStats {
+  const values: number[] = []
+  for (const r of reports) {
+    const group = groupByName(r.measurements)
+    const samples = group.get(testName)
+    if (samples) {
+      const mean =
+        samples.reduce((sum, s) => sum + s[metric], 0) / samples.length
+      values.push(mean)
+    }
+  }
+  return computeStats(values)
+}
+
+function computeCV(stats: MetricStats): number {
+  return stats.mean > 0 ? (stats.stddev / stats.mean) * 100 : 0
+}
+
+function formatValue(value: number, unit: string): string {
+  return unit === 'ms' ? `${value.toFixed(0)}ms` : `${value.toFixed(0)}`
+}
+
+function formatDelta(pct: number | null): string {
+  if (pct === null) return '—'
+  const sign = pct >= 0 ? '+' : ''
+  return `${sign}${pct.toFixed(0)}%`
+}
+
+function meanMetric(samples: PerfMeasurement[], key: MetricKey): number {
+  return samples.reduce((sum, s) => sum + s[key], 0) / samples.length
+}
 
 function formatBytes(bytes: number): string {
@@ -34,18 +108,167 @@ function formatBytes(bytes: number): string {
   return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
 }
 
-function calcDelta(
-  baseline: number,
-  current: number
-): { pct: number; isNew: boolean } {
-  if (baseline > 0) {
-    return { pct: ((current - baseline) / baseline) * 100, isNew: false }
-  }
-  return current > 0 ? { pct: Infinity, isNew: true } : { pct: 0, isNew: false }
-}
-
-function formatDeltaCell(delta: { pct: number; isNew: boolean }): string {
-  return delta.isNew ? 'new 🔴' : formatDelta(delta.pct)
-}
+function renderFullReport(
+  prGroups: Map<string, PerfMeasurement[]>,
+  baseline: PerfReport,
+  historical: PerfReport[]
+): string[] {
+  const lines: string[] = []
+  const baselineGroups = groupByName(baseline.measurements)
+
+  const tableHeader = [
+    '| Metric | Baseline | PR (n=3) | Δ | Sig |',
+    '|--------|----------|----------|---|-----|'
+  ]
+  const flaggedRows: string[] = []
+  const allRows: string[] = []
+
+  for (const [testName, prSamples] of prGroups) {
+    const baseSamples = baselineGroups.get(testName)
+    for (const { key, label, unit } of REPORTED_METRICS) {
+      const prValues = prSamples.map((s) => s[key])
+      const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
+      const histStats = getHistoricalStats(historical, testName, key)
+      const cv = computeCV(histStats)
+
+      if (!baseSamples?.length) {
+        allRows.push(
+          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
+        )
+        continue
+      }
+
+      const baseVal = meanMetric(baseSamples, key)
+      const deltaPct =
+        baseVal === 0
+          ? prMean === 0
+            ? 0
+            : null
+          : ((prMean - baseVal) / baseVal) * 100
+      const z = zScore(prMean, histStats)
+      const sig = classifyChange(z, cv)
+
+      const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
+      allRows.push(row)
+      if (isNoteworthy(sig)) {
+        flaggedRows.push(row)
+      }
+    }
+  }
+
+  if (flaggedRows.length > 0) {
+    lines.push(
+      `⚠️ **${flaggedRows.length} regression${flaggedRows.length > 1 ? 's' : ''} detected**`,
+      '',
+      ...tableHeader,
+      ...flaggedRows,
+      ''
+    )
+  } else {
+    lines.push('No regressions detected.', '')
+  }
+
+  lines.push(
+    `<details><summary>All metrics</summary>`,
+    '',
+    ...tableHeader,
+    ...allRows,
+    '',
+    '</details>',
+    ''
+  )
+
+  lines.push(
+    `<details><summary>Historical variance (last ${historical.length} runs)</summary>`,
+    '',
+    '| Metric | μ | σ | CV |',
+    '|--------|---|---|-----|'
+  )
+  for (const [testName] of prGroups) {
+    for (const { key, label, unit } of REPORTED_METRICS) {
+      const stats = getHistoricalStats(historical, testName, key)
+      if (stats.n < 2) continue
+      const cv = computeCV(stats)
+      lines.push(
+        `| ${testName}: ${label} | ${formatValue(stats.mean, unit)} | ${formatValue(stats.stddev, unit)} | ${cv.toFixed(1)}% |`
+      )
+    }
+  }
+  lines.push('', '</details>')
+  return lines
+}
+
+function renderColdStartReport(
+  prGroups: Map<string, PerfMeasurement[]>,
+  baseline: PerfReport,
+  historicalCount: number
+): string[] {
+  const lines: string[] = []
+  const baselineGroups = groupByName(baseline.measurements)
+
+  lines.push(
+    `> Collecting baseline variance data (${historicalCount}/5 runs). Significance will appear after 2 main branch runs.`,
+    '',
+    '| Metric | Baseline | PR | Δ |',
+    '|--------|----------|-----|---|'
+  )
+  for (const [testName, prSamples] of prGroups) {
+    const baseSamples = baselineGroups.get(testName)
+    for (const { key, label, unit } of REPORTED_METRICS) {
+      const prValues = prSamples.map((s) => s[key])
+      const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
+      if (!baseSamples?.length) {
+        lines.push(
+          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
+        )
+        continue
+      }
+      const baseVal = meanMetric(baseSamples, key)
+      const deltaPct =
+        baseVal === 0
+          ? prMean === 0
+            ? 0
+            : null
+          : ((prMean - baseVal) / baseVal) * 100
+      lines.push(
+        `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} |`
+      )
+    }
+  }
+  return lines
+}
+
+function renderNoBaselineReport(
+  prGroups: Map<string, PerfMeasurement[]>
+): string[] {
+  const lines: string[] = []
+  lines.push(
+    'No baseline found — showing absolute values.\n',
+    '| Metric | Value |',
+    '|--------|-------|'
+  )
+  for (const [testName, prSamples] of prGroups) {
+    const prMean = (key: MetricKey) =>
+      prSamples.reduce((sum, s) => sum + s[key], 0) / prSamples.length
+    lines.push(
+      `| ${testName}: style recalcs | ${prMean('styleRecalcs').toFixed(0)} |`
+    )
+    lines.push(`| ${testName}: layouts | ${prMean('layouts').toFixed(0)} |`)
+    lines.push(
+      `| ${testName}: task duration | ${prMean('taskDurationMs').toFixed(0)}ms |`
+    )
+    const heapMean =
+      prSamples.reduce((sum, s) => sum + s.heapDeltaBytes, 0) / prSamples.length
+    lines.push(`| ${testName}: heap delta | ${formatBytes(heapMean)} |`)
+  }
+  return lines
+}
@@ -62,55 +285,18 @@ function main() {
     ? JSON.parse(readFileSync(BASELINE_PATH, 'utf-8'))
     : null
+  const historical = loadHistoricalReports()
+  const prGroups = groupByName(current.measurements)
 
   const lines: string[] = []
   lines.push('## ⚡ Performance Report\n')
 
-  if (baseline) {
-    lines.push(
-      '| Metric | Baseline | PR | Δ |',
-      '|--------|----------|-----|---|'
-    )
-    for (const m of current.measurements) {
-      const base = baseline.measurements.find((b) => b.name === m.name)
-      if (!base) {
-        lines.push(`| ${m.name}: style recalcs | — | ${m.styleRecalcs} | new |`)
-        lines.push(`| ${m.name}: layouts | — | ${m.layouts} | new |`)
-        lines.push(
-          `| ${m.name}: task duration | — | ${m.taskDurationMs.toFixed(0)}ms | new |`
-        )
-        continue
-      }
-      const recalcDelta = calcDelta(base.styleRecalcs, m.styleRecalcs)
-      lines.push(
-        `| ${m.name}: style recalcs | ${base.styleRecalcs} | ${m.styleRecalcs} | ${formatDeltaCell(recalcDelta)} |`
-      )
-      const layoutDelta = calcDelta(base.layouts, m.layouts)
-      lines.push(
-        `| ${m.name}: layouts | ${base.layouts} | ${m.layouts} | ${formatDeltaCell(layoutDelta)} |`
-      )
-      const taskDelta = calcDelta(base.taskDurationMs, m.taskDurationMs)
-      lines.push(
-        `| ${m.name}: task duration | ${base.taskDurationMs.toFixed(0)}ms | ${m.taskDurationMs.toFixed(0)}ms | ${formatDeltaCell(taskDelta)} |`
-      )
-    }
-  } else {
-    lines.push(
-      'No baseline found — showing absolute values.\n',
-      '| Metric | Value |',
-      '|--------|-------|'
-    )
-    for (const m of current.measurements) {
-      lines.push(`| ${m.name}: style recalcs | ${m.styleRecalcs} |`)
-      lines.push(`| ${m.name}: layouts | ${m.layouts} |`)
-      lines.push(
-        `| ${m.name}: task duration | ${m.taskDurationMs.toFixed(0)}ms |`
-      )
-      lines.push(`| ${m.name}: heap delta | ${formatBytes(m.heapDeltaBytes)} |`)
-    }
-  }
+  if (baseline && historical.length >= 2) {
+    lines.push(...renderFullReport(prGroups, baseline, historical))
+  } else if (baseline) {
+    lines.push(...renderColdStartReport(prGroups, baseline, historical.length))
+  } else {
+    lines.push(...renderNoBaselineReport(prGroups))
+  }
 
   lines.push('\n<details><summary>Raw data</summary>\n')

scripts/perf-stats.test.ts (new file, 133 lines)

@@ -0,0 +1,133 @@
import { describe, expect, it } from 'vitest'
import {
classifyChange,
computeStats,
formatSignificance,
isNoteworthy,
zScore
} from './perf-stats'
describe('computeStats', () => {
it('returns zeros for empty array', () => {
const stats = computeStats([])
expect(stats).toEqual({ mean: 0, stddev: 0, min: 0, max: 0, n: 0 })
})
it('returns value with zero stddev for single element', () => {
const stats = computeStats([42])
expect(stats).toEqual({ mean: 42, stddev: 0, min: 42, max: 42, n: 1 })
})
it('computes correct stats for known values', () => {
// Values: [2, 4, 4, 4, 5, 5, 7, 9]
// Mean = 5, sample variance ≈ 4.57, sample stddev ≈ 2.14
const stats = computeStats([2, 4, 4, 4, 5, 5, 7, 9])
expect(stats.mean).toBe(5)
expect(stats.stddev).toBeCloseTo(2.138, 2)
expect(stats.min).toBe(2)
expect(stats.max).toBe(9)
expect(stats.n).toBe(8)
})
it('uses sample stddev (n-1 denominator)', () => {
// [10, 20] → mean=15, variance=(25+25)/1=50, stddev≈7.07
const stats = computeStats([10, 20])
expect(stats.mean).toBe(15)
expect(stats.stddev).toBeCloseTo(7.071, 2)
expect(stats.n).toBe(2)
})
it('handles identical values', () => {
const stats = computeStats([5, 5, 5, 5])
expect(stats.mean).toBe(5)
expect(stats.stddev).toBe(0)
})
})
describe('zScore', () => {
it('returns null when stddev is 0', () => {
const stats = computeStats([5, 5, 5])
expect(zScore(10, stats)).toBeNull()
})
it('returns null when n < 2', () => {
const stats = computeStats([5])
expect(zScore(10, stats)).toBeNull()
})
it('computes correct z-score', () => {
const stats = { mean: 100, stddev: 10, min: 80, max: 120, n: 5 }
expect(zScore(120, stats)).toBe(2)
expect(zScore(80, stats)).toBe(-2)
expect(zScore(100, stats)).toBe(0)
})
})
describe('classifyChange', () => {
it('returns noisy when CV > 50%', () => {
expect(classifyChange(3, 60)).toBe('noisy')
expect(classifyChange(-3, 51)).toBe('noisy')
})
it('does not classify as noisy when CV is exactly 50%', () => {
expect(classifyChange(3, 50)).toBe('regression')
expect(classifyChange(-3, 50)).toBe('improvement')
})
it('returns neutral when z is null', () => {
expect(classifyChange(null, 10)).toBe('neutral')
})
it('returns regression when z > 2', () => {
expect(classifyChange(2.1, 10)).toBe('regression')
expect(classifyChange(5, 10)).toBe('regression')
})
it('returns improvement when z < -2', () => {
expect(classifyChange(-2.1, 10)).toBe('improvement')
expect(classifyChange(-5, 10)).toBe('improvement')
})
it('returns neutral when z is within [-2, 2]', () => {
expect(classifyChange(0, 10)).toBe('neutral')
expect(classifyChange(1.9, 10)).toBe('neutral')
expect(classifyChange(-1.9, 10)).toBe('neutral')
expect(classifyChange(2, 10)).toBe('neutral')
expect(classifyChange(-2, 10)).toBe('neutral')
})
})
describe('formatSignificance', () => {
it('formats regression with z-score and emoji', () => {
expect(formatSignificance('regression', 3.2)).toBe('⚠️ z=3.2')
})
it('formats improvement with z-score without emoji', () => {
expect(formatSignificance('improvement', -2.5)).toBe('z=-2.5')
})
it('formats noisy as descriptive text', () => {
expect(formatSignificance('noisy', null)).toBe('variance too high')
})
it('formats neutral with z-score without emoji', () => {
expect(formatSignificance('neutral', 0.5)).toBe('z=0.5')
})
it('formats neutral without z-score as dash', () => {
expect(formatSignificance('neutral', null)).toBe('—')
})
})
describe('isNoteworthy', () => {
it('returns true for regressions', () => {
expect(isNoteworthy('regression')).toBe(true)
})
it('returns false for non-regressions', () => {
expect(isNoteworthy('improvement')).toBe(false)
expect(isNoteworthy('neutral')).toBe(false)
expect(isNoteworthy('noisy')).toBe(false)
})
})

scripts/perf-stats.ts (new file, 63 lines)

@@ -0,0 +1,63 @@
export interface MetricStats {
  mean: number
  stddev: number
  min: number
  max: number
  n: number
}

export function computeStats(values: number[]): MetricStats {
  const n = values.length
  if (n === 0) return { mean: 0, stddev: 0, min: 0, max: 0, n: 0 }
  if (n === 1)
    return { mean: values[0], stddev: 0, min: values[0], max: values[0], n: 1 }
  const mean = values.reduce((a, b) => a + b, 0) / n
  // Sample variance (n-1 denominator): historical runs are a sample.
  const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / (n - 1)
  return {
    mean,
    stddev: Math.sqrt(variance),
    min: Math.min(...values),
    max: Math.max(...values),
    n
  }
}

// Returns null when a z-score is undefined (no spread, or fewer than 2 runs).
export function zScore(value: number, stats: MetricStats): number | null {
  if (stats.stddev === 0 || stats.n < 2) return null
  return (value - stats.mean) / stats.stddev
}

export type Significance = 'regression' | 'improvement' | 'neutral' | 'noisy'

export function classifyChange(
  z: number | null,
  historicalCV: number
): Significance {
  if (historicalCV > 50) return 'noisy'
  if (z === null) return 'neutral'
  if (z > 2) return 'regression'
  if (z < -2) return 'improvement'
  return 'neutral'
}

export function formatSignificance(
  sig: Significance,
  z: number | null
): string {
  switch (sig) {
    case 'regression':
      // z is non-null here: classifyChange only yields
      // 'regression'/'improvement' for non-null z.
      return `⚠️ z=${z!.toFixed(1)}`
    case 'improvement':
      return `z=${z!.toFixed(1)}`
    case 'noisy':
      return 'variance too high'
    case 'neutral':
      return z !== null ? `z=${z.toFixed(1)}` : '—'
  }
}

export function isNoteworthy(sig: Significance): boolean {
  return sig === 'regression'
}

vitest config

@@ -618,7 +618,8 @@ export default defineConfig({
     retry: process.env.CI ? 2 : 0,
     include: [
       'src/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}',
-      'packages/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'
+      'packages/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}',
+      'scripts/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'
     ],
     coverage: {
       reporter: ['text', 'json', 'html']