feat: add statistical significance to perf report with z-score thresholds (#9305)

## Summary Replace fixed 10%/20% perf delta thresholds with dynamic σ-based classification using z-scores, eliminating false alarms from naturally noisy duration metrics (10-17% CV). ## Changes - **What**: - Run each perf test 3× (`--repeat-each=3`) and report the mean, reducing single-run noise - Download last 5 successful main branch perf artifacts to compute historical μ/σ per metric - Replace fixed threshold flags with z-score significance: `⚠️ regression` (z>2), `✅ neutral/improvement`, `🔇 noisy` (CV>50%) - Add collapsible historical variance table (μ, σ, CV) to PR comment - Graceful cold start: falls back to simple delta table until ≥2 historical runs exist - New `scripts/perf-stats.ts` module with `computeStats`, `zScore`, `classifyChange` - 18 unit tests for stats functions - **CI time impact**: ~3 min → ~5-6 min (repeat-each adds ~2 min, historical download <10s) ## Review Focus - The `gh api` call in the new "Download historical perf baselines" step: it queries the last 5 successful push runs on the base branch. The `gh` CLI is available natively on `ubuntu-latest` runners and auto-authenticates with `GITHUB_TOKEN`. - `getHistoricalStats` averages per-run measurements before computing cross-run σ — this is intentional since historical artifacts may also contain repeated measurements after this change lands. - The `noisy` classification (CV>50%) suppresses metrics like `layouts` that hover near 0 and have meaningless percentage swings. ┆Issue is synchronized with this [Notion page](https://www.notion.so/PR-9305-feat-add-statistical-significance-to-perf-report-with-z-score-thresholds-3156d73d3650818d9360eeafd9ae7dc1) by [Unito](https://www.unito.io)
2026-04-22 07:19:41 +00:00 · 2026-03-04 16:16:53 -08:00
parent b8edb11ac1
commit 7b316eb9a2
5 changed files with 471 additions and 62 deletions
--- a/scripts/perf-report.ts
+++ b/scripts/perf-report.ts
@@ -1,4 +1,14 @@
-import { existsSync, readFileSync } from 'node:fs'
+import { existsSync, readFileSync, readdirSync } from 'node:fs'
+import { join } from 'node:path'
+
+import type { MetricStats } from './perf-stats'
+import {
+  classifyChange,
+  computeStats,
+  formatSignificance,
+  isNoteworthy,
+  zScore
+} from './perf-stats'

 interface PerfMeasurement {
  name: string
@@ -20,12 +30,76 @@ interface PerfReport {

 const CURRENT_PATH = 'test-results/perf-metrics.json'
 const BASELINE_PATH = 'temp/perf-baseline/perf-metrics.json'
+const HISTORY_DIR = 'temp/perf-history'

-function formatDelta(pct: number): string {
-  if (pct >= 20) return `+${pct.toFixed(0)}% 🔴`
-  if (pct >= 10) return `+${pct.toFixed(0)}% 🟠`
-  if (pct > -10) return `${pct >= 0 ? '+' : ''}${pct.toFixed(0)}% ⚪`
-  return `${pct.toFixed(0)}% 🟢`
+type MetricKey = 'styleRecalcs' | 'layouts' | 'taskDurationMs'
+const REPORTED_METRICS: { key: MetricKey; label: string; unit: string }[] = [
+  { key: 'styleRecalcs', label: 'style recalcs', unit: '' },
+  { key: 'layouts', label: 'layouts', unit: '' },
+  { key: 'taskDurationMs', label: 'task duration', unit: 'ms' }
+]
+
+function groupByName(
+  measurements: PerfMeasurement[]
+): Map<string, PerfMeasurement[]> {
+  const map = new Map<string, PerfMeasurement[]>()
+  for (const m of measurements) {
+    const list = map.get(m.name) ?? []
+    list.push(m)
+    map.set(m.name, list)
+  }
+  return map
+}
+
+function loadHistoricalReports(): PerfReport[] {
+  if (!existsSync(HISTORY_DIR)) return []
+  const reports: PerfReport[] = []
+  for (const dir of readdirSync(HISTORY_DIR)) {
+    const filePath = join(HISTORY_DIR, dir, 'perf-metrics.json')
+    if (!existsSync(filePath)) continue
+    try {
+      reports.push(JSON.parse(readFileSync(filePath, 'utf-8')) as PerfReport)
+    } catch {
+      console.warn(`Skipping malformed perf history: ${filePath}`)
+    }
+  }
+  return reports
+}
+
+function getHistoricalStats(
+  reports: PerfReport[],
+  testName: string,
+  metric: MetricKey
+): MetricStats {
+  const values: number[] = []
+  for (const r of reports) {
+    const group = groupByName(r.measurements)
+    const samples = group.get(testName)
+    if (samples) {
+      const mean =
+        samples.reduce((sum, s) => sum + s[metric], 0) / samples.length
+      values.push(mean)
+    }
+  }
+  return computeStats(values)
+}
+
+function computeCV(stats: MetricStats): number {
+  return stats.mean > 0 ? (stats.stddev / stats.mean) * 100 : 0
+}
+
+function formatValue(value: number, unit: string): string {
+  return unit === 'ms' ? `${value.toFixed(0)}ms` : `${value.toFixed(0)}`
+}
+
+function formatDelta(pct: number | null): string {
+  if (pct === null) return '—'
+  const sign = pct >= 0 ? '+' : ''
+  return `${sign}${pct.toFixed(0)}%`
+}
+
+function meanMetric(samples: PerfMeasurement[], key: MetricKey): number {
+  return samples.reduce((sum, s) => sum + s[key], 0) / samples.length
 }

 function formatBytes(bytes: number): string {
@@ -34,18 +108,167 @@ function formatBytes(bytes: number): string {
  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
 }

-function calcDelta(
-  baseline: number,
-  current: number
-): { pct: number; isNew: boolean } {
-  if (baseline > 0) {
-    return { pct: ((current - baseline) / baseline) * 100, isNew: false }
+function renderFullReport(
+  prGroups: Map<string, PerfMeasurement[]>,
+  baseline: PerfReport,
+  historical: PerfReport[]
+): string[] {
+  const lines: string[] = []
+  const baselineGroups = groupByName(baseline.measurements)
+  const tableHeader = [
+    '| Metric | Baseline | PR (n=3) | Δ | Sig |',
+    '|--------|----------|----------|---|-----|'
+  ]
+
+  const flaggedRows: string[] = []
+  const allRows: string[] = []
+
+  for (const [testName, prSamples] of prGroups) {
+    const baseSamples = baselineGroups.get(testName)
+
+    for (const { key, label, unit } of REPORTED_METRICS) {
+      const prValues = prSamples.map((s) => s[key])
+      const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
+      const histStats = getHistoricalStats(historical, testName, key)
+      const cv = computeCV(histStats)
+
+      if (!baseSamples?.length) {
+        allRows.push(
+          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
+        )
+        continue
+      }
+
+      const baseVal = meanMetric(baseSamples, key)
+      const deltaPct =
+        baseVal === 0
+          ? prMean === 0
+            ? 0
+            : null
+          : ((prMean - baseVal) / baseVal) * 100
+      const z = zScore(prMean, histStats)
+      const sig = classifyChange(z, cv)
+
+      const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
+      allRows.push(row)
+      if (isNoteworthy(sig)) {
+        flaggedRows.push(row)
+      }
+    }
  }
-  return current > 0 ? { pct: Infinity, isNew: true } : { pct: 0, isNew: false }
+
+  if (flaggedRows.length > 0) {
+    lines.push(
+      `⚠️ **${flaggedRows.length} regression${flaggedRows.length > 1 ? 's' : ''} detected**`,
+      '',
+      ...tableHeader,
+      ...flaggedRows,
+      ''
+    )
+  } else {
+    lines.push('No regressions detected.', '')
+  }
+
+  lines.push(
+    `<details><summary>All metrics</summary>`,
+    '',
+    ...tableHeader,
+    ...allRows,
+    '',
+    '</details>',
+    ''
+  )
+
+  lines.push(
+    `<details><summary>Historical variance (last ${historical.length} runs)</summary>`,
+    '',
+    '| Metric | μ | σ | CV |',
+    '|--------|---|---|-----|'
+  )
+  for (const [testName] of prGroups) {
+    for (const { key, label, unit } of REPORTED_METRICS) {
+      const stats = getHistoricalStats(historical, testName, key)
+      if (stats.n < 2) continue
+      const cv = computeCV(stats)
+      lines.push(
+        `| ${testName}: ${label} | ${formatValue(stats.mean, unit)} | ${formatValue(stats.stddev, unit)} | ${cv.toFixed(1)}% |`
+      )
+    }
+  }
+  lines.push('', '</details>')
+
+  return lines
 }

-function formatDeltaCell(delta: { pct: number; isNew: boolean }): string {
-  return delta.isNew ? 'new 🔴' : formatDelta(delta.pct)
+function renderColdStartReport(
+  prGroups: Map<string, PerfMeasurement[]>,
+  baseline: PerfReport,
+  historicalCount: number
+): string[] {
+  const lines: string[] = []
+  const baselineGroups = groupByName(baseline.measurements)
+  lines.push(
+    `> ℹ️ Collecting baseline variance data (${historicalCount}/5 runs). Significance will appear after 2 main branch runs.`,
+    '',
+    '| Metric | Baseline | PR | Δ |',
+    '|--------|----------|-----|---|'
+  )
+
+  for (const [testName, prSamples] of prGroups) {
+    const baseSamples = baselineGroups.get(testName)
+
+    for (const { key, label, unit } of REPORTED_METRICS) {
+      const prValues = prSamples.map((s) => s[key])
+      const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
+
+      if (!baseSamples?.length) {
+        lines.push(
+          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
+        )
+        continue
+      }
+
+      const baseVal = meanMetric(baseSamples, key)
+      const deltaPct =
+        baseVal === 0
+          ? prMean === 0
+            ? 0
+            : null
+          : ((prMean - baseVal) / baseVal) * 100
+      lines.push(
+        `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} |`
+      )
+    }
+  }
+
+  return lines
+}
+
+function renderNoBaselineReport(
+  prGroups: Map<string, PerfMeasurement[]>
+): string[] {
+  const lines: string[] = []
+  lines.push(
+    'No baseline found — showing absolute values.\n',
+    '| Metric | Value |',
+    '|--------|-------|'
+  )
+  for (const [testName, prSamples] of prGroups) {
+    const prMean = (key: MetricKey) =>
+      prSamples.reduce((sum, s) => sum + s[key], 0) / prSamples.length
+
+    lines.push(
+      `| ${testName}: style recalcs | ${prMean('styleRecalcs').toFixed(0)} |`
+    )
+    lines.push(`| ${testName}: layouts | ${prMean('layouts').toFixed(0)} |`)
+    lines.push(
+      `| ${testName}: task duration | ${prMean('taskDurationMs').toFixed(0)}ms |`
+    )
+    const heapMean =
+      prSamples.reduce((sum, s) => sum + s.heapDeltaBytes, 0) / prSamples.length
+    lines.push(`| ${testName}: heap delta | ${formatBytes(heapMean)} |`)
+  }
+  return lines
 }

 function main() {
@@ -62,55 +285,18 @@ function main() {
    ? JSON.parse(readFileSync(BASELINE_PATH, 'utf-8'))
    : null

+  const historical = loadHistoricalReports()
+  const prGroups = groupByName(current.measurements)
+
  const lines: string[] = []
  lines.push('## ⚡ Performance Report\n')

-  if (baseline) {
-    lines.push(
-      '| Metric | Baseline | PR | Δ |',
-      '|--------|----------|-----|---|'
-    )
-
-    for (const m of current.measurements) {
-      const base = baseline.measurements.find((b) => b.name === m.name)
-      if (!base) {
-        lines.push(`| ${m.name}: style recalcs | — | ${m.styleRecalcs} | new |`)
-        lines.push(`| ${m.name}: layouts | — | ${m.layouts} | new |`)
-        lines.push(
-          `| ${m.name}: task duration | — | ${m.taskDurationMs.toFixed(0)}ms | new |`
-        )
-        continue
-      }
-
-      const recalcDelta = calcDelta(base.styleRecalcs, m.styleRecalcs)
-      lines.push(
-        `| ${m.name}: style recalcs | ${base.styleRecalcs} | ${m.styleRecalcs} | ${formatDeltaCell(recalcDelta)} |`
-      )
-
-      const layoutDelta = calcDelta(base.layouts, m.layouts)
-      lines.push(
-        `| ${m.name}: layouts | ${base.layouts} | ${m.layouts} | ${formatDeltaCell(layoutDelta)} |`
-      )
-
-      const taskDelta = calcDelta(base.taskDurationMs, m.taskDurationMs)
-      lines.push(
-        `| ${m.name}: task duration | ${base.taskDurationMs.toFixed(0)}ms | ${m.taskDurationMs.toFixed(0)}ms | ${formatDeltaCell(taskDelta)} |`
-      )
-    }
+  if (baseline && historical.length >= 2) {
+    lines.push(...renderFullReport(prGroups, baseline, historical))
+  } else if (baseline) {
+    lines.push(...renderColdStartReport(prGroups, baseline, historical.length))
  } else {
-    lines.push(
-      'No baseline found — showing absolute values.\n',
-      '| Metric | Value |',
-      '|--------|-------|'
-    )
-    for (const m of current.measurements) {
-      lines.push(`| ${m.name}: style recalcs | ${m.styleRecalcs} |`)
-      lines.push(`| ${m.name}: layouts | ${m.layouts} |`)
-      lines.push(
-        `| ${m.name}: task duration | ${m.taskDurationMs.toFixed(0)}ms |`
-      )
-      lines.push(`| ${m.name}: heap delta | ${formatBytes(m.heapDeltaBytes)} |`)
-    }
+    lines.push(...renderNoBaselineReport(prGroups))
  }

  lines.push('\n<details><summary>Raw data</summary>\n')