perf: add layout/GC metrics + reduce false positives in regression detection (#10477)

## Summary

Add layout duration, style recalc duration, and heap usage metrics to CI
perf reports, and tighten the statistics to reduce false-positive
regression flags.

## Changes

- **What**:
  - Collect `layoutDurationMs`, `styleRecalcDurationMs`, and `heapUsedBytes`
    (absolute snapshot) alongside the existing metrics
  - Add an effect size gate (`minAbsDelta`) for integer-quantized count
    metrics (style recalcs, layouts, DOM nodes, event listeners), preventing
    z=7.2 false positives from e.g. an 11→12 style recalc bump (see the
    sketch after this list)
  - Switch from mean to **median** for PR metric aggregation: robust to
    the outlier CI runs that otherwise dominate an n=3 mean
  - Increase the historical baseline window from **5 to 15 runs** for more
    stable σ estimates
  - Reorder reported metrics: layout/style durations first (actionable),
    counts and heap after (informational)
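
A minimal sketch of the failure mode the gate addresses, with illustrative numbers (`sigma` here is hypothetical, back-derived from the z=7.2 figure; the real logic lives in `classifyChange` in the diff below):

```ts
// Integer-quantized metrics have near-zero historical variance, so a
// one-unit change is worth many "standard deviations".
const baselineMean = 11 // style recalcs, nearly constant across runs
const sigma = 0.14 // tiny σ, implied by z ≈ 7.2 for a one-unit bump
const z = (12 - baselineMean) / sigma // ≈ 7.2

// Effect size gate: |delta| = 1 < minAbsDelta = 5, so the change is
// reported as neutral no matter how large z gets.
const absDelta = 12 - baselineMean // 1
const flagged = z > 2 && Math.abs(absDelta) >= 5 // false
console.log({ z: z.toFixed(1), flagged }) // { z: '7.1', flagged: false }
```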

## Review Focus

The effect size gate in `classifyChange()`: it now requires both z > 2
and an absolute delta ≥ `minAbsDelta` (when configured) before flagging a
regression. This addresses the core false-positive issue: integer metrics
with near-zero historical variance produce extreme z-scores for trivial
changes (a one-unit 11→12 bump over an implied σ ≈ 0.14 already scores
z ≈ 7.2).

Median vs. mean tradeoff: the median is more robust to outliers but less
sensitive to real shifts, which is acceptable given n=3 and current CI
noise levels.
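
For intuition, a toy n=3 example of what one slow CI run does to each aggregate (numbers are made up):

```ts
// One outlier run dominates the mean but leaves the median untouched.
const runs = [102, 98, 160] // ms; 160 is a noisy outlier run
const mean = runs.reduce((sum, v) => sum + v, 0) / runs.length // 120
const median = [...runs].sort((a, b) => a - b)[1] // 102

// Against a ~100ms baseline, the mean reports a ~20% regression that no
// typical run exhibits; the median reports ~2%.
```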

Commit `437f41c553` (parent `975393b48b`), authored by Christian Byrne on 2026-03-25 and committed via GitHub. 5 changed files, 110 additions, 37 deletions.

**CI workflow: fetch more historical baselines**

```diff
@@ -180,7 +180,7 @@ jobs:
 if git ls-remote --exit-code origin perf-data >/dev/null 2>&1; then
   git fetch origin perf-data --depth=1
   mkdir -p temp/perf-history
-  for file in $(git ls-tree --name-only origin/perf-data baselines/ 2>/dev/null | sort -r | head -10); do
+  for file in $(git ls-tree --name-only origin/perf-data baselines/ 2>/dev/null | sort -r | head -15); do
     git show "origin/perf-data:${file}" > "temp/perf-history/$(basename "$file")" 2>/dev/null || true
   done
   echo "Loaded $(ls temp/perf-history/*.json 2>/dev/null | wc -l) historical baselines"
```

**Perf test helper: collect the new heap snapshot metric**

```diff
@@ -23,6 +23,7 @@ export interface PerfMeasurement {
   layoutDurationMs: number
   taskDurationMs: number
   heapDeltaBytes: number
+  heapUsedBytes: number
   domNodes: number
   jsHeapTotalBytes: number
   scriptDurationMs: number
@@ -190,6 +191,7 @@ export class PerformanceHelper {
       layoutDurationMs: delta('LayoutDuration') * 1000,
       taskDurationMs: delta('TaskDuration') * 1000,
       heapDeltaBytes: delta('JSHeapUsedSize'),
+      heapUsedBytes: after.JSHeapUsedSize,
       domNodes: delta('Nodes'),
       jsHeapTotalBytes: delta('JSHeapTotalSize'),
       scriptDurationMs: delta('ScriptDuration') * 1000,
```

**Report generator: metric definitions, median aggregation, effect size wiring**

```diff
@@ -22,6 +22,7 @@ interface PerfMeasurement {
   layoutDurationMs: number
   taskDurationMs: number
   heapDeltaBytes: number
+  heapUsedBytes: number
   domNodes: number
   jsHeapTotalBytes: number
   scriptDurationMs: number
@@ -43,22 +44,46 @@ const HISTORY_DIR = 'temp/perf-history'
 type MetricKey =
   | 'styleRecalcs'
   | 'styleRecalcDurationMs'
   | 'layouts'
   | 'layoutDurationMs'
   | 'taskDurationMs'
   | 'domNodes'
   | 'scriptDurationMs'
   | 'eventListeners'
   | 'totalBlockingTimeMs'
   | 'frameDurationMs'
-const REPORTED_METRICS: { key: MetricKey; label: string; unit: string }[] = [
-  { key: 'styleRecalcs', label: 'style recalcs', unit: '' },
-  { key: 'layouts', label: 'layouts', unit: '' },
+  | 'heapUsedBytes'
+
+interface MetricDef {
+  key: MetricKey
+  label: string
+  unit: string
+  /** Minimum absolute delta to consider meaningful (effect size gate) */
+  minAbsDelta?: number
+}
+
+const REPORTED_METRICS: MetricDef[] = [
+  { key: 'layoutDurationMs', label: 'layout duration', unit: 'ms' },
+  {
+    key: 'styleRecalcDurationMs',
+    label: 'style recalc duration',
+    unit: 'ms'
+  },
+  { key: 'layouts', label: 'layout count', unit: '', minAbsDelta: 5 },
+  {
+    key: 'styleRecalcs',
+    label: 'style recalc count',
+    unit: '',
+    minAbsDelta: 5
+  },
   { key: 'taskDurationMs', label: 'task duration', unit: 'ms' },
-  { key: 'domNodes', label: 'DOM nodes', unit: '' },
   { key: 'scriptDurationMs', label: 'script duration', unit: 'ms' },
-  { key: 'eventListeners', label: 'event listeners', unit: '' },
   { key: 'totalBlockingTimeMs', label: 'TBT', unit: 'ms' },
-  { key: 'frameDurationMs', label: 'frame duration', unit: 'ms' }
+  { key: 'frameDurationMs', label: 'frame duration', unit: 'ms' },
+  { key: 'heapUsedBytes', label: 'heap used', unit: 'bytes' },
+  { key: 'domNodes', label: 'DOM nodes', unit: '', minAbsDelta: 5 },
+  { key: 'eventListeners', label: 'event listeners', unit: '', minAbsDelta: 5 }
 ]

 function groupByName(
@@ -134,7 +159,9 @@ function computeCV(stats: MetricStats): number {
 }

 function formatValue(value: number, unit: string): string {
-  return unit === 'ms' ? `${value.toFixed(0)}ms` : `${value.toFixed(0)}`
+  if (unit === 'ms') return `${value.toFixed(0)}ms`
+  if (unit === 'bytes') return formatBytes(value)
+  return `${value.toFixed(0)}`
 }

 function formatDelta(pct: number | null): string {
@@ -159,6 +186,21 @@ function meanMetric(samples: PerfMeasurement[], key: MetricKey): number | null {
   return values.reduce((sum, v) => sum + v, 0) / values.length
 }

+function medianMetric(
+  samples: PerfMeasurement[],
+  key: MetricKey
+): number | null {
+  const values = samples
+    .map((s) => getMetricValue(s, key))
+    .filter((v): v is number => v !== null)
+    .sort((a, b) => a - b)
+  if (values.length === 0) return null
+  const mid = Math.floor(values.length / 2)
+  return values.length % 2 === 0
+    ? (values[mid - 1] + values[mid]) / 2
+    : values[mid]
+}
+
 function formatBytes(bytes: number): string {
   if (Math.abs(bytes) < 1024) return `${bytes} B`
   if (Math.abs(bytes) < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`
@@ -173,7 +215,7 @@ function renderFullReport(
   const lines: string[] = []
   const baselineGroups = groupByName(baseline.measurements)

   const tableHeader = [
-    '| Metric | Baseline | PR (n=3) | Δ | Sig |',
+    '| Metric | Baseline | PR (median) | Δ | Sig |',
     '|--------|----------|----------|---|-----|'
   ]
@@ -183,36 +225,38 @@ function renderFullReport(
   for (const [testName, prSamples] of prGroups) {
     const baseSamples = baselineGroups.get(testName)

-    for (const { key, label, unit } of REPORTED_METRICS) {
-      const prMean = meanMetric(prSamples, key)
-      if (prMean === null) continue
+    for (const { key, label, unit, minAbsDelta } of REPORTED_METRICS) {
+      // Use median for PR values — robust to outlier runs in CI
+      const prVal = medianMetric(prSamples, key)
+      if (prVal === null) continue

       const histStats = getHistoricalStats(historical, testName, key)
       const cv = computeCV(histStats)

       if (!baseSamples?.length) {
         allRows.push(
-          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
+          `| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new | — |`
         )
         continue
       }

-      const baseVal = meanMetric(baseSamples, key)
+      const baseVal = medianMetric(baseSamples, key)
       if (baseVal === null) {
         allRows.push(
-          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
+          `| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new | — |`
         )
         continue
       }

+      const absDelta = prVal - baseVal
       const deltaPct =
         baseVal === 0
-          ? prMean === 0
+          ? prVal === 0
             ? 0
             : null
-          : ((prMean - baseVal) / baseVal) * 100
+          : ((prVal - baseVal) / baseVal) * 100

-      const z = zScore(prMean, histStats)
-      const sig = classifyChange(z, cv)
-
-      const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
+      const z = zScore(prVal, histStats)
+      const sig = classifyChange(z, cv, absDelta, minAbsDelta)
+
+      const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prVal, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
       allRows.push(row)

       if (isNoteworthy(sig)) {
         flaggedRows.push(row)
@@ -299,7 +343,7 @@ function renderColdStartReport(
   const lines: string[] = []
   const baselineGroups = groupByName(baseline.measurements)

   lines.push(
-    `> Collecting baseline variance data (${historicalCount}/5 runs). Significance will appear after 2 main branch runs.`,
+    `> Collecting baseline variance data (${historicalCount}/15 runs). Significance will appear after 2 main branch runs.`,
     '',
     '| Metric | Baseline | PR | Δ |',
     '|--------|----------|-----|---|'
@@ -309,31 +353,31 @@ function renderColdStartReport(
     const baseSamples = baselineGroups.get(testName)

     for (const { key, label, unit } of REPORTED_METRICS) {
-      const prMean = meanMetric(prSamples, key)
-      if (prMean === null) continue
+      const prVal = medianMetric(prSamples, key)
+      if (prVal === null) continue

       if (!baseSamples?.length) {
         lines.push(
-          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
+          `| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new |`
         )
         continue
       }

-      const baseVal = meanMetric(baseSamples, key)
+      const baseVal = medianMetric(baseSamples, key)
       if (baseVal === null) {
         lines.push(
-          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
+          `| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new |`
         )
         continue
       }

       const deltaPct =
         baseVal === 0
-          ? prMean === 0
+          ? prVal === 0
             ? 0
             : null
-          : ((prMean - baseVal) / baseVal) * 100
+          : ((prVal - baseVal) / baseVal) * 100

       lines.push(
-        `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} |`
+        `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prVal, unit)} | ${formatDelta(deltaPct)} |`
       )
     }
   }
@@ -352,14 +396,10 @@ function renderNoBaselineReport(
   )

   for (const [testName, prSamples] of prGroups) {
     for (const { key, label, unit } of REPORTED_METRICS) {
-      const prMean = meanMetric(prSamples, key)
-      if (prMean === null) continue
-      lines.push(`| ${testName}: ${label} | ${formatValue(prMean, unit)} |`)
+      const prVal = medianMetric(prSamples, key)
+      if (prVal === null) continue
+      lines.push(`| ${testName}: ${label} | ${formatValue(prVal, unit)} |`)
     }
-
-    const heapMean =
-      prSamples.reduce((sum, s) => sum + (s.heapDeltaBytes ?? 0), 0) /
-      prSamples.length
-    lines.push(`| ${testName}: heap delta | ${formatBytes(heapMean)} |`)
   }

   return lines
 }
```

**Unit tests: effect size gate**

```diff
@@ -99,6 +99,21 @@ describe('classifyChange', () => {
     expect(classifyChange(2, 10)).toBe('neutral')
     expect(classifyChange(-2, 10)).toBe('neutral')
   })

+  it('returns neutral when absDelta below minAbsDelta despite high z', () => {
+    // z=7.2 but only 1 unit change with minAbsDelta=5
+    expect(classifyChange(7.2, 10, 1, 5)).toBe('neutral')
+    expect(classifyChange(-7.2, 10, -1, 5)).toBe('neutral')
+  })
+
+  it('returns regression when absDelta meets minAbsDelta', () => {
+    expect(classifyChange(3, 10, 10, 5)).toBe('regression')
+  })
+
+  it('ignores effect size gate when minAbsDelta not provided', () => {
+    expect(classifyChange(3, 10)).toBe('regression')
+    expect(classifyChange(3, 10, 1)).toBe('regression')
+  })
 })

 describe('formatSignificance', () => {
```

**Significance module: `classifyChange` gate**

```diff
@@ -31,12 +31,28 @@ export function zScore(value: number, stats: MetricStats): number | null {
 export type Significance = 'regression' | 'improvement' | 'neutral' | 'noisy'

 /**
  * Classify a metric change as regression/improvement/neutral/noisy.
+ *
+ * Uses both statistical significance (z-score) and practical significance
+ * (effect size gate via minAbsDelta) to reduce false positives from
+ * integer-quantized metrics with near-zero variance.
  */
 export function classifyChange(
   z: number | null,
-  historicalCV: number
+  historicalCV: number,
+  absDelta?: number,
+  minAbsDelta?: number
 ): Significance {
   if (historicalCV > 50) return 'noisy'
   if (z === null) return 'neutral'
+
+  // Effect size gate: require minimum absolute change for count metrics
+  // to avoid flagging e.g. 11→12 style recalcs as z=7.2 regression.
+  if (minAbsDelta !== undefined && absDelta !== undefined) {
+    if (Math.abs(absDelta) < minAbsDelta) return 'neutral'
+  }
+
   if (z > 2) return 'regression'
   if (z < -2) return 'improvement'
   return 'neutral'
 }
```
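
As a quick reviewer sanity check, the new signature behaves like this (mirroring the unit tests above; the import path is illustrative):

```ts
import { classifyChange } from './significance' // hypothetical path

// Huge z but trivial delta on a gated count metric: neutral.
classifyChange(7.2, 10, 1, 5) // 'neutral'
// Both statistical and practical significance cleared: regression.
classifyChange(3, 10, 10, 5) // 'regression'
// Duration metrics configure no minAbsDelta, so old behavior is preserved.
classifyChange(3, 10) // 'regression'
```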