perf: add layout/GC metrics + reduce false positives in regression detection (#10477)

## Summary

Add layout duration, style recalc duration, and heap usage metrics to CI
perf reports, and tighten the statistics to reduce false-positive
regression flags.

## Changes

- **What**:
  - Collect `layoutDurationMs`, `styleRecalcDurationMs`, and `heapUsedBytes`
    (absolute snapshot) alongside the existing metrics
  - Add an effect size gate (`minAbsDelta`) for integer-quantized count
    metrics (style recalcs, layouts, DOM nodes, event listeners), preventing
    z=7.2 false positives from e.g. an 11→12 style recalc bump (see the
    sketch after this list)
  - Switch from mean to **median** for PR metric aggregation: robust to
    the outlier CI runs that otherwise dominate an n=3 mean
  - Increase the historical baseline window from **5 to 15 runs** for more
    stable σ estimates
  - Reorder reported metrics: layout/style durations first (actionable),
    counts and heap after (informational)
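
A minimal sketch of the failure mode the gate addresses, with illustrative numbers (`sigma` here is hypothetical, back-derived from the z=7.2 figure; the real logic lives in `classifyChange` in the diff below):

```ts
// Integer-quantized metrics have near-zero historical variance, so a
// one-unit change is worth many "standard deviations".
const baselineMean = 11 // style recalcs, nearly constant across runs
const sigma = 0.14 // tiny σ, implied by z ≈ 7.2 for a one-unit bump
const z = (12 - baselineMean) / sigma // ≈ 7.2

// Effect size gate: |delta| = 1 < minAbsDelta = 5, so the change is
// reported as neutral no matter how large z gets.
const absDelta = 12 - baselineMean // 1
const flagged = z > 2 && Math.abs(absDelta) >= 5 // false
console.log({ z: z.toFixed(1), flagged }) // { z: '7.1', flagged: false }
```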

## Review Focus

The effect size gate in `classifyChange()`: it now requires both z > 2
and an absolute delta ≥ `minAbsDelta` (when configured) before flagging a
regression. This addresses the core false-positive issue: integer metrics
with near-zero historical variance produce extreme z-scores for trivial
changes (a one-unit 11→12 bump over an implied σ ≈ 0.14 already scores
z ≈ 7.2).

Median vs. mean tradeoff: the median is more robust to outliers but less
sensitive to real shifts, which is acceptable given n=3 and current CI
noise levels.
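
For intuition, a toy n=3 example of what one slow CI run does to each aggregate (numbers are made up):

```ts
// One outlier run dominates the mean but leaves the median untouched.
const runs = [102, 98, 160] // ms; 160 is a noisy outlier run
const mean = runs.reduce((sum, v) => sum + v, 0) / runs.length // 120
const median = [...runs].sort((a, b) => a - b)[1] // 102

// Against a ~100ms baseline, the mean reports a ~20% regression that no
// typical run exhibits; the median reports ~2%.
```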

Commit `437f41c553` (parent `975393b48b`), authored by Christian Byrne on 2026-03-25 and committed via GitHub. 5 changed files, 110 additions, 37 deletions.

**CI workflow: fetch more historical baselines**

```diff
@@ -180,7 +180,7 @@ jobs:
 if git ls-remote --exit-code origin perf-data >/dev/null 2>&1; then
   git fetch origin perf-data --depth=1
   mkdir -p temp/perf-history
-  for file in $(git ls-tree --name-only origin/perf-data baselines/ 2>/dev/null | sort -r | head -10); do
+  for file in $(git ls-tree --name-only origin/perf-data baselines/ 2>/dev/null | sort -r | head -15); do
     git show "origin/perf-data:${file}" > "temp/perf-history/$(basename "$file")" 2>/dev/null || true
   done
   echo "Loaded $(ls temp/perf-history/*.json 2>/dev/null | wc -l) historical baselines"
```

**Perf test helper: collect the new heap snapshot metric**

```diff
@@ -23,6 +23,7 @@ export interface PerfMeasurement {
   layoutDurationMs: number
   taskDurationMs: number
   heapDeltaBytes: number
+  heapUsedBytes: number
   domNodes: number
   jsHeapTotalBytes: number
   scriptDurationMs: number
@@ -190,6 +191,7 @@ export class PerformanceHelper {
       layoutDurationMs: delta('LayoutDuration') * 1000,
       taskDurationMs: delta('TaskDuration') * 1000,
       heapDeltaBytes: delta('JSHeapUsedSize'),
+      heapUsedBytes: after.JSHeapUsedSize,
       domNodes: delta('Nodes'),
       jsHeapTotalBytes: delta('JSHeapTotalSize'),
       scriptDurationMs: delta('ScriptDuration') * 1000,
```

**Report generator: metric definitions, median aggregation, effect size wiring**

```diff
@@ -22,6 +22,7 @@ interface PerfMeasurement {
   layoutDurationMs: number
   taskDurationMs: number
   heapDeltaBytes: number
+  heapUsedBytes: number
   domNodes: number
   jsHeapTotalBytes: number
   scriptDurationMs: number
@@ -43,22 +44,46 @@ const HISTORY_DIR = 'temp/perf-history'
 type MetricKey =
   | 'styleRecalcs'
   | 'styleRecalcDurationMs'
   | 'layouts'
   | 'layoutDurationMs'
   | 'taskDurationMs'
   | 'domNodes'
   | 'scriptDurationMs'
   | 'eventListeners'
   | 'totalBlockingTimeMs'
   | 'frameDurationMs'
-const REPORTED_METRICS: { key: MetricKey; label: string; unit: string }[] = [
-  { key: 'styleRecalcs', label: 'style recalcs', unit: '' },
-  { key: 'layouts', label: 'layouts', unit: '' },
+  | 'heapUsedBytes'
+
+interface MetricDef {
+  key: MetricKey
+  label: string
+  unit: string
+  /** Minimum absolute delta to consider meaningful (effect size gate) */
+  minAbsDelta?: number
+}
+
+const REPORTED_METRICS: MetricDef[] = [
+  { key: 'layoutDurationMs', label: 'layout duration', unit: 'ms' },
+  {
+    key: 'styleRecalcDurationMs',
+    label: 'style recalc duration',
+    unit: 'ms'
+  },
+  { key: 'layouts', label: 'layout count', unit: '', minAbsDelta: 5 },
+  {
+    key: 'styleRecalcs',
+    label: 'style recalc count',
+    unit: '',
+    minAbsDelta: 5
+  },
   { key: 'taskDurationMs', label: 'task duration', unit: 'ms' },
-  { key: 'domNodes', label: 'DOM nodes', unit: '' },
   { key: 'scriptDurationMs', label: 'script duration', unit: 'ms' },
-  { key: 'eventListeners', label: 'event listeners', unit: '' },
   { key: 'totalBlockingTimeMs', label: 'TBT', unit: 'ms' },
-  { key: 'frameDurationMs', label: 'frame duration', unit: 'ms' }
+  { key: 'frameDurationMs', label: 'frame duration', unit: 'ms' },
+  { key: 'heapUsedBytes', label: 'heap used', unit: 'bytes' },
+  { key: 'domNodes', label: 'DOM nodes', unit: '', minAbsDelta: 5 },
+  { key: 'eventListeners', label: 'event listeners', unit: '', minAbsDelta: 5 }
 ]

 function groupByName(
@@ -134,7 +159,9 @@ function computeCV(stats: MetricStats): number {
 }

 function formatValue(value: number, unit: string): string {
-  return unit === 'ms' ? `${value.toFixed(0)}ms` : `${value.toFixed(0)}`
+  if (unit === 'ms') return `${value.toFixed(0)}ms`
+  if (unit === 'bytes') return formatBytes(value)
+  return `${value.toFixed(0)}`
 }

 function formatDelta(pct: number | null): string {
@@ -159,6 +186,21 @@ function meanMetric(samples: PerfMeasurement[], key: MetricKey): number | null {
   return values.reduce((sum, v) => sum + v, 0) / values.length
 }

+function medianMetric(
+  samples: PerfMeasurement[],
+  key: MetricKey
+): number | null {
+  const values = samples
+    .map((s) => getMetricValue(s, key))
+    .filter((v): v is number => v !== null)
+    .sort((a, b) => a - b)
+  if (values.length === 0) return null
+  const mid = Math.floor(values.length / 2)
+  return values.length % 2 === 0
+    ? (values[mid - 1] + values[mid]) / 2
+    : values[mid]
+}
+
 function formatBytes(bytes: number): string {
   if (Math.abs(bytes) < 1024) return `${bytes} B`
   if (Math.abs(bytes) < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`
@@ -173,7 +215,7 @@ function renderFullReport(
   const lines: string[] = []
   const baselineGroups = groupByName(baseline.measurements)

   const tableHeader = [
-    '| Metric | Baseline | PR (n=3) | Δ | Sig |',
+    '| Metric | Baseline | PR (median) | Δ | Sig |',
     '|--------|----------|----------|---|-----|'
   ]
@@ -183,36 +225,38 @@ function renderFullReport(
   for (const [testName, prSamples] of prGroups) {
     const baseSamples = baselineGroups.get(testName)

-    for (const { key, label, unit } of REPORTED_METRICS) {
-      const prMean = meanMetric(prSamples, key)
-      if (prMean === null) continue
+    for (const { key, label, unit, minAbsDelta } of REPORTED_METRICS) {
+      // Use median for PR values — robust to outlier runs in CI
+      const prVal = medianMetric(prSamples, key)
+      if (prVal === null) continue

       const histStats = getHistoricalStats(historical, testName, key)
       const cv = computeCV(histStats)

       if (!baseSamples?.length) {
         allRows.push(
-          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
+          `| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new | — |`
         )
         continue
       }

-      const baseVal = meanMetric(baseSamples, key)
+      const baseVal = medianMetric(baseSamples, key)
       if (baseVal === null) {
         allRows.push(
-          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
+          `| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new | — |`
         )
         continue
       }

+      const absDelta = prVal - baseVal
       const deltaPct =
         baseVal === 0
-          ? prMean === 0
+          ? prVal === 0
             ? 0
             : null
-          : ((prMean - baseVal) / baseVal) * 100
+          : ((prVal - baseVal) / baseVal) * 100

-      const z = zScore(prMean, histStats)
-      const sig = classifyChange(z, cv)
-
-      const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
+      const z = zScore(prVal, histStats)
+      const sig = classifyChange(z, cv, absDelta, minAbsDelta)
+
+      const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prVal, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
       allRows.push(row)

       if (isNoteworthy(sig)) {
         flaggedRows.push(row)
@@ -299,7 +343,7 @@ function renderColdStartReport(
   const lines: string[] = []
   const baselineGroups = groupByName(baseline.measurements)

   lines.push(
-    `> Collecting baseline variance data (${historicalCount}/5 runs). Significance will appear after 2 main branch runs.`,
+    `> Collecting baseline variance data (${historicalCount}/15 runs). Significance will appear after 2 main branch runs.`,
     '',
     '| Metric | Baseline | PR | Δ |',
     '|--------|----------|-----|---|'
@@ -309,31 +353,31 @@ function renderColdStartReport(
     const baseSamples = baselineGroups.get(testName)

     for (const { key, label, unit } of REPORTED_METRICS) {
-      const prMean = meanMetric(prSamples, key)
-      if (prMean === null) continue
+      const prVal = medianMetric(prSamples, key)
+      if (prVal === null) continue

       if (!baseSamples?.length) {
         lines.push(
-          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
+          `| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new |`
         )
         continue
       }

-      const baseVal = meanMetric(baseSamples, key)
+      const baseVal = medianMetric(baseSamples, key)
       if (baseVal === null) {
         lines.push(
-          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
+          `| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new |`
         )
         continue
       }

       const deltaPct =
         baseVal === 0
-          ? prMean === 0
+          ? prVal === 0
             ? 0
             : null
-          : ((prMean - baseVal) / baseVal) * 100
+          : ((prVal - baseVal) / baseVal) * 100

       lines.push(
-        `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} |`
+        `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prVal, unit)} | ${formatDelta(deltaPct)} |`
       )
     }
   }
@@ -352,14 +396,10 @@ function renderNoBaselineReport(
   )

   for (const [testName, prSamples] of prGroups) {
     for (const { key, label, unit } of REPORTED_METRICS) {
-      const prMean = meanMetric(prSamples, key)
-      if (prMean === null) continue
-      lines.push(`| ${testName}: ${label} | ${formatValue(prMean, unit)} |`)
+      const prVal = medianMetric(prSamples, key)
+      if (prVal === null) continue
+      lines.push(`| ${testName}: ${label} | ${formatValue(prVal, unit)} |`)
     }
-
-    const heapMean =
-      prSamples.reduce((sum, s) => sum + (s.heapDeltaBytes ?? 0), 0) /
-      prSamples.length
-    lines.push(`| ${testName}: heap delta | ${formatBytes(heapMean)} |`)
   }

   return lines
 }
```

**Unit tests: effect size gate**

```diff
@@ -99,6 +99,21 @@ describe('classifyChange', () => {
     expect(classifyChange(2, 10)).toBe('neutral')
     expect(classifyChange(-2, 10)).toBe('neutral')
   })

+  it('returns neutral when absDelta below minAbsDelta despite high z', () => {
+    // z=7.2 but only 1 unit change with minAbsDelta=5
+    expect(classifyChange(7.2, 10, 1, 5)).toBe('neutral')
+    expect(classifyChange(-7.2, 10, -1, 5)).toBe('neutral')
+  })
+
+  it('returns regression when absDelta meets minAbsDelta', () => {
+    expect(classifyChange(3, 10, 10, 5)).toBe('regression')
+  })
+
+  it('ignores effect size gate when minAbsDelta not provided', () => {
+    expect(classifyChange(3, 10)).toBe('regression')
+    expect(classifyChange(3, 10, 1)).toBe('regression')
+  })
 })

 describe('formatSignificance', () => {
```

**Significance module: `classifyChange` gate**

```diff
@@ -31,12 +31,28 @@ export function zScore(value: number, stats: MetricStats): number | null {
 export type Significance = 'regression' | 'improvement' | 'neutral' | 'noisy'

 /**
  * Classify a metric change as regression/improvement/neutral/noisy.
+ *
+ * Uses both statistical significance (z-score) and practical significance
+ * (effect size gate via minAbsDelta) to reduce false positives from
+ * integer-quantized metrics with near-zero variance.
  */
 export function classifyChange(
   z: number | null,
-  historicalCV: number
+  historicalCV: number,
+  absDelta?: number,
+  minAbsDelta?: number
 ): Significance {
   if (historicalCV > 50) return 'noisy'
   if (z === null) return 'neutral'
+
+  // Effect size gate: require minimum absolute change for count metrics
+  // to avoid flagging e.g. 11→12 style recalcs as z=7.2 regression.
+  if (minAbsDelta !== undefined && absDelta !== undefined) {
+    if (Math.abs(absDelta) < minAbsDelta) return 'neutral'
+  }
+
   if (z > 2) return 'regression'
   if (z < -2) return 'improvement'
   return 'neutral'
 }
```
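
As a quick reviewer sanity check, the new signature behaves like this (mirroring the unit tests above; the import path is illustrative):

```ts
import { classifyChange } from './significance' // hypothetical path

// Huge z but trivial delta on a gated count metric: neutral.
classifyChange(7.2, 10, 1, 5) // 'neutral'
// Both statistical and practical significance cleared: regression.
classifyChange(3, 10, 10, 5) // 'regression'
// Duration metrics configure no minAbsDelta, so old behavior is preserved.
classifyChange(3, 10) // 'regression'
```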