From 7b316eb9a286c44a74a3684f53e4de4cf2467c9a Mon Sep 17 00:00:00 2001
From: Christian Byrne
Date: Wed, 4 Mar 2026 16:16:53 -0800
Subject: [PATCH] feat: add statistical significance to perf report with
 z-score thresholds (#9305)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Replace fixed 10%/20% perf delta thresholds with dynamic σ-based classification using z-scores, eliminating false alarms from naturally noisy duration metrics (10-17% CV).

## Changes

- **What**:
  - Run each perf test 3× (`--repeat-each=3`) and report the mean, reducing single-run noise
  - Download last 5 successful main branch perf artifacts to compute historical μ/σ per metric
  - Replace fixed threshold flags with z-score significance: `⚠️ regression` (z>2), `✅ neutral/improvement`, `🔇 noisy` (CV>50%)
  - Add collapsible historical variance table (μ, σ, CV) to PR comment
  - Graceful cold start: falls back to simple delta table until ≥2 historical runs exist
  - New `scripts/perf-stats.ts` module with `computeStats`, `zScore`, `classifyChange`
  - 18 unit tests for stats functions
- **CI time impact**: ~3 min → ~5-6 min (repeat-each adds ~2 min, historical download <10s)

## Review Focus

- The `gh api` call in the new "Download historical perf baselines" step: it queries the last 5 successful push runs on the base branch. The `gh` CLI is available natively on `ubuntu-latest` runners and auto-authenticates with `GITHUB_TOKEN`.
- `getHistoricalStats` averages per-run measurements before computing cross-run σ — this is intentional since historical artifacts may also contain repeated measurements after this change lands.
- The `noisy` classification (CV>50%) suppresses metrics like `layouts` that hover near 0 and have meaningless percentage swings. 
┆Issue is synchronized with this [Notion page](https://www.notion.so/PR-9305-feat-add-statistical-significance-to-perf-report-with-z-score-thresholds-3156d73d3650818d9360eeafd9ae7dc1) by [Unito](https://www.unito.io) --- .github/workflows/ci-perf-report.yaml | 28 ++- scripts/perf-report.ts | 306 +++++++++++++++++++++----- scripts/perf-stats.test.ts | 133 +++++++++++ scripts/perf-stats.ts | 63 ++++++ vite.config.mts | 3 +- 5 files changed, 471 insertions(+), 62 deletions(-) create mode 100644 scripts/perf-stats.test.ts create mode 100644 scripts/perf-stats.ts diff --git a/.github/workflows/ci-perf-report.yaml b/.github/workflows/ci-perf-report.yaml index 88640f1c70..c0de7b8fff 100644 --- a/.github/workflows/ci-perf-report.yaml +++ b/.github/workflows/ci-perf-report.yaml @@ -45,7 +45,7 @@ jobs: - name: Run performance tests id: perf continue-on-error: true - run: pnpm exec playwright test --project=performance --workers=1 + run: pnpm exec playwright test --project=performance --workers=1 --repeat-each=3 - name: Upload perf metrics if: always() @@ -61,6 +61,7 @@ jobs: if: github.event_name == 'pull_request' runs-on: ubuntu-latest permissions: + actions: read contents: read pull-requests: write @@ -90,6 +91,31 @@ jobs: path: temp/perf-baseline/ if_no_artifact_found: warn + - name: Download historical perf baselines + continue-on-error: true + run: | + RUNS=$(gh api \ + "/repos/${{ github.repository }}/actions/workflows/ci-perf-report.yaml/runs?branch=${{ github.event.pull_request.base.ref }}&event=push&status=success&per_page=5" \ + --jq '.workflow_runs[].id' || true) + + if [ -z "$RUNS" ]; then + echo "No historical runs available" + exit 0 + fi + + mkdir -p temp/perf-history + INDEX=0 + for RUN_ID in $RUNS; do + DIR="temp/perf-history/$INDEX" + mkdir -p "$DIR" + gh run download "$RUN_ID" -n perf-metrics -D "$DIR/" 2>/dev/null || true + INDEX=$((INDEX + 1)) + done + + echo "Downloaded $(ls temp/perf-history/*/perf-metrics.json 2>/dev/null | wc -l) historical 
baselines" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Generate perf report run: npx --yes tsx scripts/perf-report.ts > perf-report.md diff --git a/scripts/perf-report.ts b/scripts/perf-report.ts index b8d288f101..5967c5a59c 100644 --- a/scripts/perf-report.ts +++ b/scripts/perf-report.ts @@ -1,4 +1,14 @@ -import { existsSync, readFileSync } from 'node:fs' +import { existsSync, readFileSync, readdirSync } from 'node:fs' +import { join } from 'node:path' + +import type { MetricStats } from './perf-stats' +import { + classifyChange, + computeStats, + formatSignificance, + isNoteworthy, + zScore +} from './perf-stats' interface PerfMeasurement { name: string @@ -20,12 +30,76 @@ interface PerfReport { const CURRENT_PATH = 'test-results/perf-metrics.json' const BASELINE_PATH = 'temp/perf-baseline/perf-metrics.json' +const HISTORY_DIR = 'temp/perf-history' -function formatDelta(pct: number): string { - if (pct >= 20) return `+${pct.toFixed(0)}% 🔴` - if (pct >= 10) return `+${pct.toFixed(0)}% 🟠` - if (pct > -10) return `${pct >= 0 ? '+' : ''}${pct.toFixed(0)}% ⚪` - return `${pct.toFixed(0)}% 🟢` +type MetricKey = 'styleRecalcs' | 'layouts' | 'taskDurationMs' +const REPORTED_METRICS: { key: MetricKey; label: string; unit: string }[] = [ + { key: 'styleRecalcs', label: 'style recalcs', unit: '' }, + { key: 'layouts', label: 'layouts', unit: '' }, + { key: 'taskDurationMs', label: 'task duration', unit: 'ms' } +] + +function groupByName( + measurements: PerfMeasurement[] +): Map { + const map = new Map() + for (const m of measurements) { + const list = map.get(m.name) ?? 
[] + list.push(m) + map.set(m.name, list) + } + return map +} + +function loadHistoricalReports(): PerfReport[] { + if (!existsSync(HISTORY_DIR)) return [] + const reports: PerfReport[] = [] + for (const dir of readdirSync(HISTORY_DIR)) { + const filePath = join(HISTORY_DIR, dir, 'perf-metrics.json') + if (!existsSync(filePath)) continue + try { + reports.push(JSON.parse(readFileSync(filePath, 'utf-8')) as PerfReport) + } catch { + console.warn(`Skipping malformed perf history: ${filePath}`) + } + } + return reports +} + +function getHistoricalStats( + reports: PerfReport[], + testName: string, + metric: MetricKey +): MetricStats { + const values: number[] = [] + for (const r of reports) { + const group = groupByName(r.measurements) + const samples = group.get(testName) + if (samples) { + const mean = + samples.reduce((sum, s) => sum + s[metric], 0) / samples.length + values.push(mean) + } + } + return computeStats(values) +} + +function computeCV(stats: MetricStats): number { + return stats.mean > 0 ? (stats.stddev / stats.mean) * 100 : 0 +} + +function formatValue(value: number, unit: string): string { + return unit === 'ms' ? `${value.toFixed(0)}ms` : `${value.toFixed(0)}` +} + +function formatDelta(pct: number | null): string { + if (pct === null) return '—' + const sign = pct >= 0 ? 
'+' : '' + return `${sign}${pct.toFixed(0)}%` +} + +function meanMetric(samples: PerfMeasurement[], key: MetricKey): number { + return samples.reduce((sum, s) => sum + s[key], 0) / samples.length } function formatBytes(bytes: number): string { @@ -34,18 +108,167 @@ function formatBytes(bytes: number): string { return `${(bytes / (1024 * 1024)).toFixed(1)} MB` } -function calcDelta( - baseline: number, - current: number -): { pct: number; isNew: boolean } { - if (baseline > 0) { - return { pct: ((current - baseline) / baseline) * 100, isNew: false } +function renderFullReport( + prGroups: Map, + baseline: PerfReport, + historical: PerfReport[] +): string[] { + const lines: string[] = [] + const baselineGroups = groupByName(baseline.measurements) + const tableHeader = [ + '| Metric | Baseline | PR (n=3) | Δ | Sig |', + '|--------|----------|----------|---|-----|' + ] + + const flaggedRows: string[] = [] + const allRows: string[] = [] + + for (const [testName, prSamples] of prGroups) { + const baseSamples = baselineGroups.get(testName) + + for (const { key, label, unit } of REPORTED_METRICS) { + const prValues = prSamples.map((s) => s[key]) + const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length + const histStats = getHistoricalStats(historical, testName, key) + const cv = computeCV(histStats) + + if (!baseSamples?.length) { + allRows.push( + `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |` + ) + continue + } + + const baseVal = meanMetric(baseSamples, key) + const deltaPct = + baseVal === 0 + ? prMean === 0 + ? 0 + : null + : ((prMean - baseVal) / baseVal) * 100 + const z = zScore(prMean, histStats) + const sig = classifyChange(z, cv) + + const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |` + allRows.push(row) + if (isNoteworthy(sig)) { + flaggedRows.push(row) + } + } } - return current > 0 ? 
{ pct: Infinity, isNew: true } : { pct: 0, isNew: false } + + if (flaggedRows.length > 0) { + lines.push( + `⚠️ **${flaggedRows.length} regression${flaggedRows.length > 1 ? 's' : ''} detected**`, + '', + ...tableHeader, + ...flaggedRows, + '' + ) + } else { + lines.push('No regressions detected.', '') + } + + lines.push( + `
All metrics`, + '', + ...tableHeader, + ...allRows, + '', + '
', + '' + ) + + lines.push( + `
Historical variance (last ${historical.length} runs)`, + '', + '| Metric | μ | σ | CV |', + '|--------|---|---|-----|' + ) + for (const [testName] of prGroups) { + for (const { key, label, unit } of REPORTED_METRICS) { + const stats = getHistoricalStats(historical, testName, key) + if (stats.n < 2) continue + const cv = computeCV(stats) + lines.push( + `| ${testName}: ${label} | ${formatValue(stats.mean, unit)} | ${formatValue(stats.stddev, unit)} | ${cv.toFixed(1)}% |` + ) + } + } + lines.push('', '
') + + return lines } -function formatDeltaCell(delta: { pct: number; isNew: boolean }): string { - return delta.isNew ? 'new 🔴' : formatDelta(delta.pct) +function renderColdStartReport( + prGroups: Map, + baseline: PerfReport, + historicalCount: number +): string[] { + const lines: string[] = [] + const baselineGroups = groupByName(baseline.measurements) + lines.push( + `> ℹ️ Collecting baseline variance data (${historicalCount}/5 runs). Significance will appear after 2 main branch runs.`, + '', + '| Metric | Baseline | PR | Δ |', + '|--------|----------|-----|---|' + ) + + for (const [testName, prSamples] of prGroups) { + const baseSamples = baselineGroups.get(testName) + + for (const { key, label, unit } of REPORTED_METRICS) { + const prValues = prSamples.map((s) => s[key]) + const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length + + if (!baseSamples?.length) { + lines.push( + `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |` + ) + continue + } + + const baseVal = meanMetric(baseSamples, key) + const deltaPct = + baseVal === 0 + ? prMean === 0 + ? 
0 + : null + : ((prMean - baseVal) / baseVal) * 100 + lines.push( + `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} |` + ) + } + } + + return lines +} + +function renderNoBaselineReport( + prGroups: Map +): string[] { + const lines: string[] = [] + lines.push( + 'No baseline found — showing absolute values.\n', + '| Metric | Value |', + '|--------|-------|' + ) + for (const [testName, prSamples] of prGroups) { + const prMean = (key: MetricKey) => + prSamples.reduce((sum, s) => sum + s[key], 0) / prSamples.length + + lines.push( + `| ${testName}: style recalcs | ${prMean('styleRecalcs').toFixed(0)} |` + ) + lines.push(`| ${testName}: layouts | ${prMean('layouts').toFixed(0)} |`) + lines.push( + `| ${testName}: task duration | ${prMean('taskDurationMs').toFixed(0)}ms |` + ) + const heapMean = + prSamples.reduce((sum, s) => sum + s.heapDeltaBytes, 0) / prSamples.length + lines.push(`| ${testName}: heap delta | ${formatBytes(heapMean)} |`) + } + return lines } function main() { @@ -62,55 +285,18 @@ function main() { ? 
JSON.parse(readFileSync(BASELINE_PATH, 'utf-8')) : null + const historical = loadHistoricalReports() + const prGroups = groupByName(current.measurements) + const lines: string[] = [] lines.push('## ⚡ Performance Report\n') - if (baseline) { - lines.push( - '| Metric | Baseline | PR | Δ |', - '|--------|----------|-----|---|' - ) - - for (const m of current.measurements) { - const base = baseline.measurements.find((b) => b.name === m.name) - if (!base) { - lines.push(`| ${m.name}: style recalcs | — | ${m.styleRecalcs} | new |`) - lines.push(`| ${m.name}: layouts | — | ${m.layouts} | new |`) - lines.push( - `| ${m.name}: task duration | — | ${m.taskDurationMs.toFixed(0)}ms | new |` - ) - continue - } - - const recalcDelta = calcDelta(base.styleRecalcs, m.styleRecalcs) - lines.push( - `| ${m.name}: style recalcs | ${base.styleRecalcs} | ${m.styleRecalcs} | ${formatDeltaCell(recalcDelta)} |` - ) - - const layoutDelta = calcDelta(base.layouts, m.layouts) - lines.push( - `| ${m.name}: layouts | ${base.layouts} | ${m.layouts} | ${formatDeltaCell(layoutDelta)} |` - ) - - const taskDelta = calcDelta(base.taskDurationMs, m.taskDurationMs) - lines.push( - `| ${m.name}: task duration | ${base.taskDurationMs.toFixed(0)}ms | ${m.taskDurationMs.toFixed(0)}ms | ${formatDeltaCell(taskDelta)} |` - ) - } + if (baseline && historical.length >= 2) { + lines.push(...renderFullReport(prGroups, baseline, historical)) + } else if (baseline) { + lines.push(...renderColdStartReport(prGroups, baseline, historical.length)) } else { - lines.push( - 'No baseline found — showing absolute values.\n', - '| Metric | Value |', - '|--------|-------|' - ) - for (const m of current.measurements) { - lines.push(`| ${m.name}: style recalcs | ${m.styleRecalcs} |`) - lines.push(`| ${m.name}: layouts | ${m.layouts} |`) - lines.push( - `| ${m.name}: task duration | ${m.taskDurationMs.toFixed(0)}ms |` - ) - lines.push(`| ${m.name}: heap delta | ${formatBytes(m.heapDeltaBytes)} |`) - } + 
lines.push(...renderNoBaselineReport(prGroups)) } lines.push('\n
Raw data\n') diff --git a/scripts/perf-stats.test.ts b/scripts/perf-stats.test.ts new file mode 100644 index 0000000000..e78340f1a5 --- /dev/null +++ b/scripts/perf-stats.test.ts @@ -0,0 +1,133 @@ +import { describe, expect, it } from 'vitest' + +import { + classifyChange, + computeStats, + formatSignificance, + isNoteworthy, + zScore +} from './perf-stats' + +describe('computeStats', () => { + it('returns zeros for empty array', () => { + const stats = computeStats([]) + expect(stats).toEqual({ mean: 0, stddev: 0, min: 0, max: 0, n: 0 }) + }) + + it('returns value with zero stddev for single element', () => { + const stats = computeStats([42]) + expect(stats).toEqual({ mean: 42, stddev: 0, min: 42, max: 42, n: 1 }) + }) + + it('computes correct stats for known values', () => { + // Values: [2, 4, 4, 4, 5, 5, 7, 9] + // Mean = 5, sample variance ≈ 4.57, sample stddev ≈ 2.14 + const stats = computeStats([2, 4, 4, 4, 5, 5, 7, 9]) + expect(stats.mean).toBe(5) + expect(stats.stddev).toBeCloseTo(2.138, 2) + expect(stats.min).toBe(2) + expect(stats.max).toBe(9) + expect(stats.n).toBe(8) + }) + + it('uses sample stddev (n-1 denominator)', () => { + // [10, 20] → mean=15, variance=(25+25)/1=50, stddev≈7.07 + const stats = computeStats([10, 20]) + expect(stats.mean).toBe(15) + expect(stats.stddev).toBeCloseTo(7.071, 2) + expect(stats.n).toBe(2) + }) + + it('handles identical values', () => { + const stats = computeStats([5, 5, 5, 5]) + expect(stats.mean).toBe(5) + expect(stats.stddev).toBe(0) + }) +}) + +describe('zScore', () => { + it('returns null when stddev is 0', () => { + const stats = computeStats([5, 5, 5]) + expect(zScore(10, stats)).toBeNull() + }) + + it('returns null when n < 2', () => { + const stats = computeStats([5]) + expect(zScore(10, stats)).toBeNull() + }) + + it('computes correct z-score', () => { + const stats = { mean: 100, stddev: 10, min: 80, max: 120, n: 5 } + expect(zScore(120, stats)).toBe(2) + expect(zScore(80, stats)).toBe(-2) + 
expect(zScore(100, stats)).toBe(0) + }) +}) + +describe('classifyChange', () => { + it('returns noisy when CV > 50%', () => { + expect(classifyChange(3, 60)).toBe('noisy') + expect(classifyChange(-3, 51)).toBe('noisy') + }) + + it('does not classify as noisy when CV is exactly 50%', () => { + expect(classifyChange(3, 50)).toBe('regression') + expect(classifyChange(-3, 50)).toBe('improvement') + }) + + it('returns neutral when z is null', () => { + expect(classifyChange(null, 10)).toBe('neutral') + }) + + it('returns regression when z > 2', () => { + expect(classifyChange(2.1, 10)).toBe('regression') + expect(classifyChange(5, 10)).toBe('regression') + }) + + it('returns improvement when z < -2', () => { + expect(classifyChange(-2.1, 10)).toBe('improvement') + expect(classifyChange(-5, 10)).toBe('improvement') + }) + + it('returns neutral when z is within [-2, 2]', () => { + expect(classifyChange(0, 10)).toBe('neutral') + expect(classifyChange(1.9, 10)).toBe('neutral') + expect(classifyChange(-1.9, 10)).toBe('neutral') + expect(classifyChange(2, 10)).toBe('neutral') + expect(classifyChange(-2, 10)).toBe('neutral') + }) +}) + +describe('formatSignificance', () => { + it('formats regression with z-score and emoji', () => { + expect(formatSignificance('regression', 3.2)).toBe('⚠️ z=3.2') + }) + + it('formats improvement with z-score without emoji', () => { + expect(formatSignificance('improvement', -2.5)).toBe('z=-2.5') + }) + + it('formats noisy as descriptive text', () => { + expect(formatSignificance('noisy', null)).toBe('variance too high') + }) + + it('formats neutral with z-score without emoji', () => { + expect(formatSignificance('neutral', 0.5)).toBe('z=0.5') + }) + + it('formats neutral without z-score as dash', () => { + expect(formatSignificance('neutral', null)).toBe('—') + }) +}) + +describe('isNoteworthy', () => { + it('returns true for regressions', () => { + expect(isNoteworthy('regression')).toBe(true) + }) + + it('returns false for non-regressions', 
() => { + expect(isNoteworthy('improvement')).toBe(false) + expect(isNoteworthy('neutral')).toBe(false) + expect(isNoteworthy('noisy')).toBe(false) + }) +}) diff --git a/scripts/perf-stats.ts b/scripts/perf-stats.ts new file mode 100644 index 0000000000..1aab994521 --- /dev/null +++ b/scripts/perf-stats.ts @@ -0,0 +1,63 @@ +export interface MetricStats { + mean: number + stddev: number + min: number + max: number + n: number +} + +export function computeStats(values: number[]): MetricStats { + const n = values.length + if (n === 0) return { mean: 0, stddev: 0, min: 0, max: 0, n: 0 } + if (n === 1) + return { mean: values[0], stddev: 0, min: values[0], max: values[0], n: 1 } + + const mean = values.reduce((a, b) => a + b, 0) / n + const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / (n - 1) + + return { + mean, + stddev: Math.sqrt(variance), + min: Math.min(...values), + max: Math.max(...values), + n + } +} + +export function zScore(value: number, stats: MetricStats): number | null { + if (stats.stddev === 0 || stats.n < 2) return null + return (value - stats.mean) / stats.stddev +} + +export type Significance = 'regression' | 'improvement' | 'neutral' | 'noisy' + +export function classifyChange( + z: number | null, + historicalCV: number +): Significance { + if (historicalCV > 50) return 'noisy' + if (z === null) return 'neutral' + if (z > 2) return 'regression' + if (z < -2) return 'improvement' + return 'neutral' +} + +export function formatSignificance( + sig: Significance, + z: number | null +): string { + switch (sig) { + case 'regression': + return `⚠️ z=${z!.toFixed(1)}` + case 'improvement': + return `z=${z!.toFixed(1)}` + case 'noisy': + return 'variance too high' + case 'neutral': + return z !== null ? 
`z=${z.toFixed(1)}` : '—' + } +} + +export function isNoteworthy(sig: Significance): boolean { + return sig === 'regression' +} diff --git a/vite.config.mts b/vite.config.mts index 834f6d6794..9033ea16ab 100644 --- a/vite.config.mts +++ b/vite.config.mts @@ -618,7 +618,8 @@ export default defineConfig({ retry: process.env.CI ? 2 : 0, include: [ 'src/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}', - 'packages/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}' + 'packages/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}', + 'scripts/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}' ], coverage: { reporter: ['text', 'json', 'html']