mirror of
https://github.com/Comfy-Org/ComfyUI_frontend.git
synced 2026-04-20 06:20:11 +00:00
perf: add layout/GC metrics + reduce false positives in regression detection (#10477)
## Summary Add layout duration, style recalc duration, and heap usage metrics to CI perf reports, while improving statistical reliability to reduce false positive regressions. ## Changes - **What**: - Collect `layoutDurationMs`, `styleRecalcDurationMs`, `heapUsedBytes` (absolute snapshot) alongside existing metrics - Add effect size gate (`minAbsDelta`) for integer-quantized count metrics (style recalcs, layouts, DOM nodes, event listeners) — prevents z=7.2 false positives from e.g. 11→12 style recalcs - Switch from mean to **median** for PR metric aggregation — robust to outlier CI runs that dominate n=3 mean - Increase historical baseline window from **5 to 15 runs** for more stable σ estimates - Reorder reported metrics: layout/style duration first (actionable), counts and heap after (informational) ## Review Focus The effect size gate in `classifyChange()` — it now requires both z > 2 AND absolute delta ≥ `minAbsDelta` (when configured) to flag a regression. This addresses the core false positive issue where integer metrics with near-zero historical variance produce extreme z-scores for trivial changes. Median vs mean tradeoff: median is more robust to outliers but less sensitive to real shifts — acceptable given n=3 and CI noise levels. ┆Issue is synchronized with this [Notion page](https://www.notion.so/PR-10477-perf-add-layout-GC-metrics-reduce-false-positives-in-regression-detection-32d6d73d365081daa72cec96d8a07b90) by [Unito](https://www.unito.io)
This commit is contained in:
2
.github/workflows/pr-report.yaml
vendored
2
.github/workflows/pr-report.yaml
vendored
@@ -180,7 +180,7 @@ jobs:
|
||||
if git ls-remote --exit-code origin perf-data >/dev/null 2>&1; then
|
||||
git fetch origin perf-data --depth=1
|
||||
mkdir -p temp/perf-history
|
||||
for file in $(git ls-tree --name-only origin/perf-data baselines/ 2>/dev/null | sort -r | head -10); do
|
||||
for file in $(git ls-tree --name-only origin/perf-data baselines/ 2>/dev/null | sort -r | head -15); do
|
||||
git show "origin/perf-data:${file}" > "temp/perf-history/$(basename "$file")" 2>/dev/null || true
|
||||
done
|
||||
echo "Loaded $(ls temp/perf-history/*.json 2>/dev/null | wc -l) historical baselines"
|
||||
|
||||
@@ -23,6 +23,7 @@ export interface PerfMeasurement {
|
||||
layoutDurationMs: number
|
||||
taskDurationMs: number
|
||||
heapDeltaBytes: number
|
||||
heapUsedBytes: number
|
||||
domNodes: number
|
||||
jsHeapTotalBytes: number
|
||||
scriptDurationMs: number
|
||||
@@ -190,6 +191,7 @@ export class PerformanceHelper {
|
||||
layoutDurationMs: delta('LayoutDuration') * 1000,
|
||||
taskDurationMs: delta('TaskDuration') * 1000,
|
||||
heapDeltaBytes: delta('JSHeapUsedSize'),
|
||||
heapUsedBytes: after.JSHeapUsedSize,
|
||||
domNodes: delta('Nodes'),
|
||||
jsHeapTotalBytes: delta('JSHeapTotalSize'),
|
||||
scriptDurationMs: delta('ScriptDuration') * 1000,
|
||||
|
||||
@@ -22,6 +22,7 @@ interface PerfMeasurement {
|
||||
layoutDurationMs: number
|
||||
taskDurationMs: number
|
||||
heapDeltaBytes: number
|
||||
heapUsedBytes: number
|
||||
domNodes: number
|
||||
jsHeapTotalBytes: number
|
||||
scriptDurationMs: number
|
||||
@@ -43,22 +44,46 @@ const HISTORY_DIR = 'temp/perf-history'
|
||||
|
||||
type MetricKey =
|
||||
| 'styleRecalcs'
|
||||
| 'styleRecalcDurationMs'
|
||||
| 'layouts'
|
||||
| 'layoutDurationMs'
|
||||
| 'taskDurationMs'
|
||||
| 'domNodes'
|
||||
| 'scriptDurationMs'
|
||||
| 'eventListeners'
|
||||
| 'totalBlockingTimeMs'
|
||||
| 'frameDurationMs'
|
||||
const REPORTED_METRICS: { key: MetricKey; label: string; unit: string }[] = [
|
||||
{ key: 'styleRecalcs', label: 'style recalcs', unit: '' },
|
||||
{ key: 'layouts', label: 'layouts', unit: '' },
|
||||
| 'heapUsedBytes'
|
||||
|
||||
interface MetricDef {
|
||||
key: MetricKey
|
||||
label: string
|
||||
unit: string
|
||||
/** Minimum absolute delta to consider meaningful (effect size gate) */
|
||||
minAbsDelta?: number
|
||||
}
|
||||
|
||||
const REPORTED_METRICS: MetricDef[] = [
|
||||
{ key: 'layoutDurationMs', label: 'layout duration', unit: 'ms' },
|
||||
{
|
||||
key: 'styleRecalcDurationMs',
|
||||
label: 'style recalc duration',
|
||||
unit: 'ms'
|
||||
},
|
||||
{ key: 'layouts', label: 'layout count', unit: '', minAbsDelta: 5 },
|
||||
{
|
||||
key: 'styleRecalcs',
|
||||
label: 'style recalc count',
|
||||
unit: '',
|
||||
minAbsDelta: 5
|
||||
},
|
||||
{ key: 'taskDurationMs', label: 'task duration', unit: 'ms' },
|
||||
{ key: 'domNodes', label: 'DOM nodes', unit: '' },
|
||||
{ key: 'scriptDurationMs', label: 'script duration', unit: 'ms' },
|
||||
{ key: 'eventListeners', label: 'event listeners', unit: '' },
|
||||
{ key: 'totalBlockingTimeMs', label: 'TBT', unit: 'ms' },
|
||||
{ key: 'frameDurationMs', label: 'frame duration', unit: 'ms' }
|
||||
{ key: 'frameDurationMs', label: 'frame duration', unit: 'ms' },
|
||||
{ key: 'heapUsedBytes', label: 'heap used', unit: 'bytes' },
|
||||
{ key: 'domNodes', label: 'DOM nodes', unit: '', minAbsDelta: 5 },
|
||||
{ key: 'eventListeners', label: 'event listeners', unit: '', minAbsDelta: 5 }
|
||||
]
|
||||
|
||||
function groupByName(
|
||||
@@ -134,7 +159,9 @@ function computeCV(stats: MetricStats): number {
|
||||
}
|
||||
|
||||
function formatValue(value: number, unit: string): string {
|
||||
return unit === 'ms' ? `${value.toFixed(0)}ms` : `${value.toFixed(0)}`
|
||||
if (unit === 'ms') return `${value.toFixed(0)}ms`
|
||||
if (unit === 'bytes') return formatBytes(value)
|
||||
return `${value.toFixed(0)}`
|
||||
}
|
||||
|
||||
function formatDelta(pct: number | null): string {
|
||||
@@ -159,6 +186,21 @@ function meanMetric(samples: PerfMeasurement[], key: MetricKey): number | null {
|
||||
return values.reduce((sum, v) => sum + v, 0) / values.length
|
||||
}
|
||||
|
||||
function medianMetric(
|
||||
samples: PerfMeasurement[],
|
||||
key: MetricKey
|
||||
): number | null {
|
||||
const values = samples
|
||||
.map((s) => getMetricValue(s, key))
|
||||
.filter((v): v is number => v !== null)
|
||||
.sort((a, b) => a - b)
|
||||
if (values.length === 0) return null
|
||||
const mid = Math.floor(values.length / 2)
|
||||
return values.length % 2 === 0
|
||||
? (values[mid - 1] + values[mid]) / 2
|
||||
: values[mid]
|
||||
}
|
||||
|
||||
function formatBytes(bytes: number): string {
|
||||
if (Math.abs(bytes) < 1024) return `${bytes} B`
|
||||
if (Math.abs(bytes) < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`
|
||||
@@ -173,7 +215,7 @@ function renderFullReport(
|
||||
const lines: string[] = []
|
||||
const baselineGroups = groupByName(baseline.measurements)
|
||||
const tableHeader = [
|
||||
'| Metric | Baseline | PR (n=3) | Δ | Sig |',
|
||||
'| Metric | Baseline | PR (median) | Δ | Sig |',
|
||||
'|--------|----------|----------|---|-----|'
|
||||
]
|
||||
|
||||
@@ -183,36 +225,38 @@ function renderFullReport(
|
||||
for (const [testName, prSamples] of prGroups) {
|
||||
const baseSamples = baselineGroups.get(testName)
|
||||
|
||||
for (const { key, label, unit } of REPORTED_METRICS) {
|
||||
const prMean = meanMetric(prSamples, key)
|
||||
if (prMean === null) continue
|
||||
for (const { key, label, unit, minAbsDelta } of REPORTED_METRICS) {
|
||||
// Use median for PR values — robust to outlier runs in CI
|
||||
const prVal = medianMetric(prSamples, key)
|
||||
if (prVal === null) continue
|
||||
const histStats = getHistoricalStats(historical, testName, key)
|
||||
const cv = computeCV(histStats)
|
||||
|
||||
if (!baseSamples?.length) {
|
||||
allRows.push(
|
||||
`| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
|
||||
`| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new | — |`
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
const baseVal = meanMetric(baseSamples, key)
|
||||
const baseVal = medianMetric(baseSamples, key)
|
||||
if (baseVal === null) {
|
||||
allRows.push(
|
||||
`| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
|
||||
`| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new | — |`
|
||||
)
|
||||
continue
|
||||
}
|
||||
const absDelta = prVal - baseVal
|
||||
const deltaPct =
|
||||
baseVal === 0
|
||||
? prMean === 0
|
||||
? prVal === 0
|
||||
? 0
|
||||
: null
|
||||
: ((prMean - baseVal) / baseVal) * 100
|
||||
const z = zScore(prMean, histStats)
|
||||
const sig = classifyChange(z, cv)
|
||||
: ((prVal - baseVal) / baseVal) * 100
|
||||
const z = zScore(prVal, histStats)
|
||||
const sig = classifyChange(z, cv, absDelta, minAbsDelta)
|
||||
|
||||
const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
|
||||
const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prVal, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
|
||||
allRows.push(row)
|
||||
if (isNoteworthy(sig)) {
|
||||
flaggedRows.push(row)
|
||||
@@ -299,7 +343,7 @@ function renderColdStartReport(
|
||||
const lines: string[] = []
|
||||
const baselineGroups = groupByName(baseline.measurements)
|
||||
lines.push(
|
||||
`> ℹ️ Collecting baseline variance data (${historicalCount}/5 runs). Significance will appear after 2 main branch runs.`,
|
||||
`> ℹ️ Collecting baseline variance data (${historicalCount}/15 runs). Significance will appear after 2 main branch runs.`,
|
||||
'',
|
||||
'| Metric | Baseline | PR | Δ |',
|
||||
'|--------|----------|-----|---|'
|
||||
@@ -309,31 +353,31 @@ function renderColdStartReport(
|
||||
const baseSamples = baselineGroups.get(testName)
|
||||
|
||||
for (const { key, label, unit } of REPORTED_METRICS) {
|
||||
const prMean = meanMetric(prSamples, key)
|
||||
if (prMean === null) continue
|
||||
const prVal = medianMetric(prSamples, key)
|
||||
if (prVal === null) continue
|
||||
|
||||
if (!baseSamples?.length) {
|
||||
lines.push(
|
||||
`| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
|
||||
`| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new |`
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
const baseVal = meanMetric(baseSamples, key)
|
||||
const baseVal = medianMetric(baseSamples, key)
|
||||
if (baseVal === null) {
|
||||
lines.push(
|
||||
`| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
|
||||
`| ${testName}: ${label} | — | ${formatValue(prVal, unit)} | new |`
|
||||
)
|
||||
continue
|
||||
}
|
||||
const deltaPct =
|
||||
baseVal === 0
|
||||
? prMean === 0
|
||||
? prVal === 0
|
||||
? 0
|
||||
: null
|
||||
: ((prMean - baseVal) / baseVal) * 100
|
||||
: ((prVal - baseVal) / baseVal) * 100
|
||||
lines.push(
|
||||
`| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} |`
|
||||
`| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prVal, unit)} | ${formatDelta(deltaPct)} |`
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -352,14 +396,10 @@ function renderNoBaselineReport(
|
||||
)
|
||||
for (const [testName, prSamples] of prGroups) {
|
||||
for (const { key, label, unit } of REPORTED_METRICS) {
|
||||
const prMean = meanMetric(prSamples, key)
|
||||
if (prMean === null) continue
|
||||
lines.push(`| ${testName}: ${label} | ${formatValue(prMean, unit)} |`)
|
||||
const prVal = medianMetric(prSamples, key)
|
||||
if (prVal === null) continue
|
||||
lines.push(`| ${testName}: ${label} | ${formatValue(prVal, unit)} |`)
|
||||
}
|
||||
const heapMean =
|
||||
prSamples.reduce((sum, s) => sum + (s.heapDeltaBytes ?? 0), 0) /
|
||||
prSamples.length
|
||||
lines.push(`| ${testName}: heap delta | ${formatBytes(heapMean)} |`)
|
||||
}
|
||||
return lines
|
||||
}
|
||||
|
||||
@@ -99,6 +99,21 @@ describe('classifyChange', () => {
|
||||
expect(classifyChange(2, 10)).toBe('neutral')
|
||||
expect(classifyChange(-2, 10)).toBe('neutral')
|
||||
})
|
||||
|
||||
it('returns neutral when absDelta below minAbsDelta despite high z', () => {
|
||||
// z=7.2 but only 1 unit change with minAbsDelta=5
|
||||
expect(classifyChange(7.2, 10, 1, 5)).toBe('neutral')
|
||||
expect(classifyChange(-7.2, 10, -1, 5)).toBe('neutral')
|
||||
})
|
||||
|
||||
it('returns regression when absDelta meets minAbsDelta', () => {
|
||||
expect(classifyChange(3, 10, 10, 5)).toBe('regression')
|
||||
})
|
||||
|
||||
it('ignores effect size gate when minAbsDelta not provided', () => {
|
||||
expect(classifyChange(3, 10)).toBe('regression')
|
||||
expect(classifyChange(3, 10, 1)).toBe('regression')
|
||||
})
|
||||
})
|
||||
|
||||
describe('formatSignificance', () => {
|
||||
|
||||
@@ -31,12 +31,28 @@ export function zScore(value: number, stats: MetricStats): number | null {
|
||||
|
||||
export type Significance = 'regression' | 'improvement' | 'neutral' | 'noisy'
|
||||
|
||||
/**
|
||||
* Classify a metric change as regression/improvement/neutral/noisy.
|
||||
*
|
||||
* Uses both statistical significance (z-score) and practical significance
|
||||
* (effect size gate via minAbsDelta) to reduce false positives from
|
||||
* integer-quantized metrics with near-zero variance.
|
||||
*/
|
||||
export function classifyChange(
|
||||
z: number | null,
|
||||
historicalCV: number
|
||||
historicalCV: number,
|
||||
absDelta?: number,
|
||||
minAbsDelta?: number
|
||||
): Significance {
|
||||
if (historicalCV > 50) return 'noisy'
|
||||
if (z === null) return 'neutral'
|
||||
|
||||
// Effect size gate: require minimum absolute change for count metrics
|
||||
// to avoid flagging e.g. 11→12 style recalcs as z=7.2 regression.
|
||||
if (minAbsDelta !== undefined && absDelta !== undefined) {
|
||||
if (Math.abs(absDelta) < minAbsDelta) return 'neutral'
|
||||
}
|
||||
|
||||
if (z > 2) return 'regression'
|
||||
if (z < -2) return 'improvement'
|
||||
return 'neutral'
|
||||
|
||||
Reference in New Issue
Block a user