feat: add statistical significance to perf report with z-score thresholds (#9305)

## Summary

Replace the fixed 10%/20% perf delta thresholds with dynamic σ-based
classification using z-scores, eliminating false alarms from naturally
noisy duration metrics (10-17% CV). At ~15% CV, a single run can land
outside a ±10% band roughly half the time, so the fixed thresholds fired
on pure noise.

## Changes

- **What**:
  - Run each perf test 3× (`--repeat-each=3`) and report the mean,
    reducing single-run noise
  - Download the last 5 successful main-branch perf artifacts to compute
    historical μ/σ per metric
  - Replace the fixed-threshold flags with z-score significance labels:
    `⚠️ regression` (z > 2), plain `neutral`/`improvement`, and `🔇 noisy`
    (CV > 50%); a short sketch of the classification follows this list
  - Add a collapsible historical variance table (μ, σ, CV) to the PR comment
  - Graceful cold start: fall back to a simple delta table until ≥2
    historical runs exist
  - New `scripts/perf-stats.ts` module with `computeStats`, `zScore`, and
    `classifyChange`
  - 18 unit tests for the stats functions

- **CI time impact**: ~3 min → ~5-6 min (`--repeat-each=3` adds ~2 min;
  the historical download takes <10 s)
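
To make the new classification concrete, here is a rough sketch of how the
pieces fit together, using invented numbers (the real wiring lives in
`scripts/perf-report.ts` below):

```ts
import { classifyChange, computeStats, formatSignificance, zScore } from './perf-stats'

// Per-run means for one metric across 5 historical main-branch runs:
const history = computeStats([100, 104, 98, 102, 96]) // mean=100, stddev≈3.16

// A PR mean of 112 sits ~3.8σ above the historical mean:
const z = zScore(112, history) // ≈ 3.8
const cv = (history.stddev / history.mean) * 100 // ≈ 3.2%, well under the 50% cutoff

console.log(classifyChange(z, cv)) // 'regression' (z > 2)
console.log(formatSignificance('regression', z)) // '⚠️ z=3.8'
```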

## Review Focus

- The `gh api` call in the new "Download historical perf baselines" step:
  it queries the last 5 successful push runs on the base branch. The `gh`
  CLI comes preinstalled on `ubuntu-latest` runners and authenticates via
  the `GH_TOKEN` env var, which the step sets from `secrets.GITHUB_TOKEN`.
- `getHistoricalStats` averages each run's repeated measurements before
  computing cross-run σ; this is intentional, since historical artifacts
  will also contain repeated measurements once this change lands. See the
  worked example after this list.
- The `noisy` classification (CV > 50%) suppresses metrics like `layouts`
  that hover near 0, where percentage swings are meaningless.
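
A small worked example of the two points above, with invented `layouts`
samples (mirroring what `getHistoricalStats` does):

```ts
import { classifyChange, computeStats, zScore } from './perf-stats'

// Three historical runs, each carrying 3 repeated measurements
// (post --repeat-each=3). Average within each run first, then take
// σ across the per-run means:
const runs = [
  [0, 1, 2],
  [0, 0, 0],
  [1, 2, 3]
]
const perRunMeans = runs.map((r) => r.reduce((a, b) => a + b, 0) / r.length) // [1, 0, 2]
const stats = computeStats(perRunMeans) // mean=1, stddev=1

// CV is 100% here, so any delta on this metric is suppressed as noise:
const cv = (stats.stddev / stats.mean) * 100 // 100 > 50
console.log(classifyChange(zScore(4, stats), cv)) // 'noisy', even though z = 3
```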

Commit 7b316eb9a2 (parent b8edb11ac1), authored by Christian Byrne,
committed by GitHub, 2026-03-04 16:16:53 -08:00
5 changed files with 471 additions and 62 deletions

.github/workflows/ci-perf-report.yaml

@@ -45,7 +45,7 @@ jobs:
       - name: Run performance tests
         id: perf
         continue-on-error: true
-        run: pnpm exec playwright test --project=performance --workers=1
+        run: pnpm exec playwright test --project=performance --workers=1 --repeat-each=3
       - name: Upload perf metrics
         if: always()
@@ -61,6 +61,7 @@ jobs:
     if: github.event_name == 'pull_request'
     runs-on: ubuntu-latest
     permissions:
+      actions: read
       contents: read
       pull-requests: write
@@ -90,6 +91,31 @@ jobs:
           path: temp/perf-baseline/
           if_no_artifact_found: warn
+      - name: Download historical perf baselines
+        continue-on-error: true
+        run: |
+          RUNS=$(gh api \
+            "/repos/${{ github.repository }}/actions/workflows/ci-perf-report.yaml/runs?branch=${{ github.event.pull_request.base.ref }}&event=push&status=success&per_page=5" \
+            --jq '.workflow_runs[].id' || true)
+          if [ -z "$RUNS" ]; then
+            echo "No historical runs available"
+            exit 0
+          fi
+          mkdir -p temp/perf-history
+          INDEX=0
+          for RUN_ID in $RUNS; do
+            DIR="temp/perf-history/$INDEX"
+            mkdir -p "$DIR"
+            gh run download "$RUN_ID" -n perf-metrics -D "$DIR/" 2>/dev/null || true
+            INDEX=$((INDEX + 1))
+          done
+          echo "Downloaded $(ls temp/perf-history/*/perf-metrics.json 2>/dev/null | wc -l) historical baselines"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Generate perf report
         run: npx --yes tsx scripts/perf-report.ts > perf-report.md

scripts/perf-report.ts

@@ -1,4 +1,14 @@
-import { existsSync, readFileSync } from 'node:fs'
+import { existsSync, readFileSync, readdirSync } from 'node:fs'
+import { join } from 'node:path'
+import type { MetricStats } from './perf-stats'
+import {
+  classifyChange,
+  computeStats,
+  formatSignificance,
+  isNoteworthy,
+  zScore
+} from './perf-stats'
 
 interface PerfMeasurement {
   name: string
@@ -20,12 +30,76 @@ interface PerfReport {
 const CURRENT_PATH = 'test-results/perf-metrics.json'
 const BASELINE_PATH = 'temp/perf-baseline/perf-metrics.json'
+const HISTORY_DIR = 'temp/perf-history'
 
-function formatDelta(pct: number): string {
-  if (pct >= 20) return `+${pct.toFixed(0)}% 🔴`
-  if (pct >= 10) return `+${pct.toFixed(0)}% 🟠`
-  if (pct > -10) return `${pct >= 0 ? '+' : ''}${pct.toFixed(0)}% ⚪`
-  return `${pct.toFixed(0)}% 🟢`
-}
+type MetricKey = 'styleRecalcs' | 'layouts' | 'taskDurationMs'
+
+const REPORTED_METRICS: { key: MetricKey; label: string; unit: string }[] = [
+  { key: 'styleRecalcs', label: 'style recalcs', unit: '' },
+  { key: 'layouts', label: 'layouts', unit: '' },
+  { key: 'taskDurationMs', label: 'task duration', unit: 'ms' }
+]
+
+function groupByName(
+  measurements: PerfMeasurement[]
+): Map<string, PerfMeasurement[]> {
+  const map = new Map<string, PerfMeasurement[]>()
+  for (const m of measurements) {
+    const list = map.get(m.name) ?? []
+    list.push(m)
+    map.set(m.name, list)
+  }
+  return map
+}
+
+function loadHistoricalReports(): PerfReport[] {
+  if (!existsSync(HISTORY_DIR)) return []
+  const reports: PerfReport[] = []
+  for (const dir of readdirSync(HISTORY_DIR)) {
+    const filePath = join(HISTORY_DIR, dir, 'perf-metrics.json')
+    if (!existsSync(filePath)) continue
+    try {
+      reports.push(JSON.parse(readFileSync(filePath, 'utf-8')) as PerfReport)
+    } catch {
+      console.warn(`Skipping malformed perf history: ${filePath}`)
+    }
+  }
+  return reports
+}
+
+function getHistoricalStats(
+  reports: PerfReport[],
+  testName: string,
+  metric: MetricKey
+): MetricStats {
+  const values: number[] = []
+  for (const r of reports) {
+    const group = groupByName(r.measurements)
+    const samples = group.get(testName)
+    if (samples) {
+      const mean =
+        samples.reduce((sum, s) => sum + s[metric], 0) / samples.length
+      values.push(mean)
+    }
+  }
+  return computeStats(values)
+}
+
+function computeCV(stats: MetricStats): number {
+  return stats.mean > 0 ? (stats.stddev / stats.mean) * 100 : 0
+}
+
+function formatValue(value: number, unit: string): string {
+  return unit === 'ms' ? `${value.toFixed(0)}ms` : `${value.toFixed(0)}`
+}
+
+function formatDelta(pct: number | null): string {
+  if (pct === null) return '—'
+  const sign = pct >= 0 ? '+' : ''
+  return `${sign}${pct.toFixed(0)}%`
+}
+
+function meanMetric(samples: PerfMeasurement[], key: MetricKey): number {
+  return samples.reduce((sum, s) => sum + s[key], 0) / samples.length
+}
 
 function formatBytes(bytes: number): string {
@@ -34,18 +108,167 @@ function formatBytes(bytes: number): string {
   return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
 }
 
-function calcDelta(
-  baseline: number,
-  current: number
-): { pct: number; isNew: boolean } {
-  if (baseline > 0) {
-    return { pct: ((current - baseline) / baseline) * 100, isNew: false }
-  }
-  return current > 0 ? { pct: Infinity, isNew: true } : { pct: 0, isNew: false }
-}
-
-function formatDeltaCell(delta: { pct: number; isNew: boolean }): string {
-  return delta.isNew ? 'new 🔴' : formatDelta(delta.pct)
-}
+function renderFullReport(
+  prGroups: Map<string, PerfMeasurement[]>,
+  baseline: PerfReport,
+  historical: PerfReport[]
+): string[] {
+  const lines: string[] = []
+  const baselineGroups = groupByName(baseline.measurements)
+
+  const tableHeader = [
+    '| Metric | Baseline | PR (n=3) | Δ | Sig |',
+    '|--------|----------|----------|---|-----|'
+  ]
+  const flaggedRows: string[] = []
+  const allRows: string[] = []
+
+  for (const [testName, prSamples] of prGroups) {
+    const baseSamples = baselineGroups.get(testName)
+    for (const { key, label, unit } of REPORTED_METRICS) {
+      const prValues = prSamples.map((s) => s[key])
+      const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
+      const histStats = getHistoricalStats(historical, testName, key)
+      const cv = computeCV(histStats)
+
+      if (!baseSamples?.length) {
+        allRows.push(
+          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
+        )
+        continue
+      }
+
+      const baseVal = meanMetric(baseSamples, key)
+      const deltaPct =
+        baseVal === 0
+          ? prMean === 0
+            ? 0
+            : null
+          : ((prMean - baseVal) / baseVal) * 100
+      const z = zScore(prMean, histStats)
+      const sig = classifyChange(z, cv)
+
+      const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
+      allRows.push(row)
+      if (isNoteworthy(sig)) {
+        flaggedRows.push(row)
+      }
+    }
+  }
+
+  if (flaggedRows.length > 0) {
+    lines.push(
+      `⚠️ **${flaggedRows.length} regression${flaggedRows.length > 1 ? 's' : ''} detected**`,
+      '',
+      ...tableHeader,
+      ...flaggedRows,
+      ''
+    )
+  } else {
+    lines.push('No regressions detected.', '')
+  }
+
+  lines.push(
+    `<details><summary>All metrics</summary>`,
+    '',
+    ...tableHeader,
+    ...allRows,
+    '',
+    '</details>',
+    ''
+  )
+
+  lines.push(
+    `<details><summary>Historical variance (last ${historical.length} runs)</summary>`,
+    '',
+    '| Metric | μ | σ | CV |',
+    '|--------|---|---|-----|'
+  )
+  for (const [testName] of prGroups) {
+    for (const { key, label, unit } of REPORTED_METRICS) {
+      const stats = getHistoricalStats(historical, testName, key)
+      if (stats.n < 2) continue
+      const cv = computeCV(stats)
+      lines.push(
+        `| ${testName}: ${label} | ${formatValue(stats.mean, unit)} | ${formatValue(stats.stddev, unit)} | ${cv.toFixed(1)}% |`
+      )
+    }
+  }
+  lines.push('', '</details>')
+  return lines
+}
+
+function renderColdStartReport(
+  prGroups: Map<string, PerfMeasurement[]>,
+  baseline: PerfReport,
+  historicalCount: number
+): string[] {
+  const lines: string[] = []
+  const baselineGroups = groupByName(baseline.measurements)
+
+  lines.push(
+    `> Collecting baseline variance data (${historicalCount}/5 runs). Significance will appear after 2 main branch runs.`,
+    '',
+    '| Metric | Baseline | PR | Δ |',
+    '|--------|----------|-----|---|'
+  )
+  for (const [testName, prSamples] of prGroups) {
+    const baseSamples = baselineGroups.get(testName)
+    for (const { key, label, unit } of REPORTED_METRICS) {
+      const prValues = prSamples.map((s) => s[key])
+      const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
+      if (!baseSamples?.length) {
+        lines.push(
+          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
+        )
+        continue
+      }
+      const baseVal = meanMetric(baseSamples, key)
+      const deltaPct =
+        baseVal === 0
+          ? prMean === 0
+            ? 0
+            : null
+          : ((prMean - baseVal) / baseVal) * 100
+      lines.push(
+        `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} |`
+      )
+    }
+  }
+  return lines
+}
+
+function renderNoBaselineReport(
+  prGroups: Map<string, PerfMeasurement[]>
+): string[] {
+  const lines: string[] = []
+  lines.push(
+    'No baseline found — showing absolute values.\n',
+    '| Metric | Value |',
+    '|--------|-------|'
+  )
+  for (const [testName, prSamples] of prGroups) {
+    const prMean = (key: MetricKey) =>
+      prSamples.reduce((sum, s) => sum + s[key], 0) / prSamples.length
+    lines.push(
+      `| ${testName}: style recalcs | ${prMean('styleRecalcs').toFixed(0)} |`
+    )
+    lines.push(`| ${testName}: layouts | ${prMean('layouts').toFixed(0)} |`)
+    lines.push(
+      `| ${testName}: task duration | ${prMean('taskDurationMs').toFixed(0)}ms |`
+    )
+    const heapMean =
+      prSamples.reduce((sum, s) => sum + s.heapDeltaBytes, 0) / prSamples.length
+    lines.push(`| ${testName}: heap delta | ${formatBytes(heapMean)} |`)
+  }
+  return lines
+}
@@ -62,55 +285,18 @@ function main() {
     ? JSON.parse(readFileSync(BASELINE_PATH, 'utf-8'))
     : null
+  const historical = loadHistoricalReports()
+  const prGroups = groupByName(current.measurements)
 
   const lines: string[] = []
   lines.push('## ⚡ Performance Report\n')
 
-  if (baseline) {
-    lines.push(
-      '| Metric | Baseline | PR | Δ |',
-      '|--------|----------|-----|---|'
-    )
-    for (const m of current.measurements) {
-      const base = baseline.measurements.find((b) => b.name === m.name)
-      if (!base) {
-        lines.push(`| ${m.name}: style recalcs | — | ${m.styleRecalcs} | new |`)
-        lines.push(`| ${m.name}: layouts | — | ${m.layouts} | new |`)
-        lines.push(
-          `| ${m.name}: task duration | — | ${m.taskDurationMs.toFixed(0)}ms | new |`
-        )
-        continue
-      }
-      const recalcDelta = calcDelta(base.styleRecalcs, m.styleRecalcs)
-      lines.push(
-        `| ${m.name}: style recalcs | ${base.styleRecalcs} | ${m.styleRecalcs} | ${formatDeltaCell(recalcDelta)} |`
-      )
-      const layoutDelta = calcDelta(base.layouts, m.layouts)
-      lines.push(
-        `| ${m.name}: layouts | ${base.layouts} | ${m.layouts} | ${formatDeltaCell(layoutDelta)} |`
-      )
-      const taskDelta = calcDelta(base.taskDurationMs, m.taskDurationMs)
-      lines.push(
-        `| ${m.name}: task duration | ${base.taskDurationMs.toFixed(0)}ms | ${m.taskDurationMs.toFixed(0)}ms | ${formatDeltaCell(taskDelta)} |`
-      )
-    }
-  } else {
-    lines.push(
-      'No baseline found — showing absolute values.\n',
-      '| Metric | Value |',
-      '|--------|-------|'
-    )
-    for (const m of current.measurements) {
-      lines.push(`| ${m.name}: style recalcs | ${m.styleRecalcs} |`)
-      lines.push(`| ${m.name}: layouts | ${m.layouts} |`)
-      lines.push(
-        `| ${m.name}: task duration | ${m.taskDurationMs.toFixed(0)}ms |`
-      )
-      lines.push(`| ${m.name}: heap delta | ${formatBytes(m.heapDeltaBytes)} |`)
-    }
-  }
+  if (baseline && historical.length >= 2) {
+    lines.push(...renderFullReport(prGroups, baseline, historical))
+  } else if (baseline) {
+    lines.push(...renderColdStartReport(prGroups, baseline, historical.length))
+  } else {
+    lines.push(...renderNoBaselineReport(prGroups))
+  }
 
   lines.push('\n<details><summary>Raw data</summary>\n')

scripts/perf-stats.test.ts (new file, 133 lines)

@@ -0,0 +1,133 @@
import { describe, expect, it } from 'vitest'
import {
classifyChange,
computeStats,
formatSignificance,
isNoteworthy,
zScore
} from './perf-stats'
describe('computeStats', () => {
it('returns zeros for empty array', () => {
const stats = computeStats([])
expect(stats).toEqual({ mean: 0, stddev: 0, min: 0, max: 0, n: 0 })
})
it('returns value with zero stddev for single element', () => {
const stats = computeStats([42])
expect(stats).toEqual({ mean: 42, stddev: 0, min: 42, max: 42, n: 1 })
})
it('computes correct stats for known values', () => {
// Values: [2, 4, 4, 4, 5, 5, 7, 9]
// Mean = 5, sample variance ≈ 4.57, sample stddev ≈ 2.14
const stats = computeStats([2, 4, 4, 4, 5, 5, 7, 9])
expect(stats.mean).toBe(5)
expect(stats.stddev).toBeCloseTo(2.138, 2)
expect(stats.min).toBe(2)
expect(stats.max).toBe(9)
expect(stats.n).toBe(8)
})
it('uses sample stddev (n-1 denominator)', () => {
// [10, 20] → mean=15, variance=(25+25)/1=50, stddev≈7.07
const stats = computeStats([10, 20])
expect(stats.mean).toBe(15)
expect(stats.stddev).toBeCloseTo(7.071, 2)
expect(stats.n).toBe(2)
})
it('handles identical values', () => {
const stats = computeStats([5, 5, 5, 5])
expect(stats.mean).toBe(5)
expect(stats.stddev).toBe(0)
})
})
describe('zScore', () => {
it('returns null when stddev is 0', () => {
const stats = computeStats([5, 5, 5])
expect(zScore(10, stats)).toBeNull()
})
it('returns null when n < 2', () => {
const stats = computeStats([5])
expect(zScore(10, stats)).toBeNull()
})
it('computes correct z-score', () => {
const stats = { mean: 100, stddev: 10, min: 80, max: 120, n: 5 }
expect(zScore(120, stats)).toBe(2)
expect(zScore(80, stats)).toBe(-2)
expect(zScore(100, stats)).toBe(0)
})
})
describe('classifyChange', () => {
it('returns noisy when CV > 50%', () => {
expect(classifyChange(3, 60)).toBe('noisy')
expect(classifyChange(-3, 51)).toBe('noisy')
})
it('does not classify as noisy when CV is exactly 50%', () => {
expect(classifyChange(3, 50)).toBe('regression')
expect(classifyChange(-3, 50)).toBe('improvement')
})
it('returns neutral when z is null', () => {
expect(classifyChange(null, 10)).toBe('neutral')
})
it('returns regression when z > 2', () => {
expect(classifyChange(2.1, 10)).toBe('regression')
expect(classifyChange(5, 10)).toBe('regression')
})
it('returns improvement when z < -2', () => {
expect(classifyChange(-2.1, 10)).toBe('improvement')
expect(classifyChange(-5, 10)).toBe('improvement')
})
it('returns neutral when z is within [-2, 2]', () => {
expect(classifyChange(0, 10)).toBe('neutral')
expect(classifyChange(1.9, 10)).toBe('neutral')
expect(classifyChange(-1.9, 10)).toBe('neutral')
expect(classifyChange(2, 10)).toBe('neutral')
expect(classifyChange(-2, 10)).toBe('neutral')
})
})
describe('formatSignificance', () => {
it('formats regression with z-score and emoji', () => {
expect(formatSignificance('regression', 3.2)).toBe('⚠️ z=3.2')
})
it('formats improvement with z-score without emoji', () => {
expect(formatSignificance('improvement', -2.5)).toBe('z=-2.5')
})
it('formats noisy as descriptive text', () => {
expect(formatSignificance('noisy', null)).toBe('variance too high')
})
it('formats neutral with z-score without emoji', () => {
expect(formatSignificance('neutral', 0.5)).toBe('z=0.5')
})
it('formats neutral without z-score as dash', () => {
expect(formatSignificance('neutral', null)).toBe('—')
})
})
describe('isNoteworthy', () => {
it('returns true for regressions', () => {
expect(isNoteworthy('regression')).toBe(true)
})
it('returns false for non-regressions', () => {
expect(isNoteworthy('improvement')).toBe(false)
expect(isNoteworthy('neutral')).toBe(false)
expect(isNoteworthy('noisy')).toBe(false)
})
})

scripts/perf-stats.ts (new file, 63 lines)

@@ -0,0 +1,63 @@
export interface MetricStats {
  mean: number
  stddev: number
  min: number
  max: number
  n: number
}

export function computeStats(values: number[]): MetricStats {
  const n = values.length
  if (n === 0) return { mean: 0, stddev: 0, min: 0, max: 0, n: 0 }
  if (n === 1)
    return { mean: values[0], stddev: 0, min: values[0], max: values[0], n: 1 }
  const mean = values.reduce((a, b) => a + b, 0) / n
  // Sample variance (n-1 denominator): historical runs are a sample.
  const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / (n - 1)
  return {
    mean,
    stddev: Math.sqrt(variance),
    min: Math.min(...values),
    max: Math.max(...values),
    n
  }
}

// Returns null when a z-score is undefined (no spread, or fewer than 2 runs).
export function zScore(value: number, stats: MetricStats): number | null {
  if (stats.stddev === 0 || stats.n < 2) return null
  return (value - stats.mean) / stats.stddev
}

export type Significance = 'regression' | 'improvement' | 'neutral' | 'noisy'

export function classifyChange(
  z: number | null,
  historicalCV: number
): Significance {
  if (historicalCV > 50) return 'noisy'
  if (z === null) return 'neutral'
  if (z > 2) return 'regression'
  if (z < -2) return 'improvement'
  return 'neutral'
}

export function formatSignificance(
  sig: Significance,
  z: number | null
): string {
  switch (sig) {
    case 'regression':
      // z is non-null here: classifyChange only yields
      // 'regression'/'improvement' for non-null z.
      return `⚠️ z=${z!.toFixed(1)}`
    case 'improvement':
      return `z=${z!.toFixed(1)}`
    case 'noisy':
      return 'variance too high'
    case 'neutral':
      return z !== null ? `z=${z.toFixed(1)}` : '—'
  }
}

export function isNoteworthy(sig: Significance): boolean {
  return sig === 'regression'
}

vitest config

@@ -618,7 +618,8 @@ export default defineConfig({
     retry: process.env.CI ? 2 : 0,
     include: [
       'src/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}',
-      'packages/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'
+      'packages/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}',
+      'scripts/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'
     ],
     coverage: {
       reporter: ['text', 'json', 'html']