mirror of
https://github.com/Comfy-Org/ComfyUI_frontend.git
synced 2026-04-22 07:19:41 +00:00
feat: add statistical significance to perf report with z-score thresholds (#9305)
## Summary Replace fixed 10%/20% perf delta thresholds with dynamic σ-based classification using z-scores, eliminating false alarms from naturally noisy duration metrics (10-17% CV). ## Changes - **What**: - Run each perf test 3× (`--repeat-each=3`) and report the mean, reducing single-run noise - Download last 5 successful main branch perf artifacts to compute historical μ/σ per metric - Replace fixed threshold flags with z-score significance: `⚠️ regression` (z>2), `✅ neutral/improvement`, `🔇 noisy` (CV>50%) - Add collapsible historical variance table (μ, σ, CV) to PR comment - Graceful cold start: falls back to simple delta table until ≥2 historical runs exist - New `scripts/perf-stats.ts` module with `computeStats`, `zScore`, `classifyChange` - 18 unit tests for stats functions - **CI time impact**: ~3 min → ~5-6 min (repeat-each adds ~2 min, historical download <10s) ## Review Focus - The `gh api` call in the new "Download historical perf baselines" step: it queries the last 5 successful push runs on the base branch. The `gh` CLI is available natively on `ubuntu-latest` runners and auto-authenticates with `GITHUB_TOKEN`. - `getHistoricalStats` averages per-run measurements before computing cross-run σ — this is intentional since historical artifacts may also contain repeated measurements after this change lands. - The `noisy` classification (CV>50%) suppresses metrics like `layouts` that hover near 0 and have meaningless percentage swings. ┆Issue is synchronized with this [Notion page](https://www.notion.so/PR-9305-feat-add-statistical-significance-to-perf-report-with-z-score-thresholds-3156d73d3650818d9360eeafd9ae7dc1) by [Unito](https://www.unito.io)
This commit is contained in:
@@ -1,4 +1,14 @@
|
||||
import { existsSync, readFileSync } from 'node:fs'
|
||||
import { existsSync, readFileSync, readdirSync } from 'node:fs'
|
||||
import { join } from 'node:path'
|
||||
|
||||
import type { MetricStats } from './perf-stats'
|
||||
import {
|
||||
classifyChange,
|
||||
computeStats,
|
||||
formatSignificance,
|
||||
isNoteworthy,
|
||||
zScore
|
||||
} from './perf-stats'
|
||||
|
||||
interface PerfMeasurement {
|
||||
name: string
|
||||
@@ -20,12 +30,76 @@ interface PerfReport {
|
||||
|
||||
// Metrics emitted by this PR's perf run (read as `current` in main()).
const CURRENT_PATH = 'test-results/perf-metrics.json'
// Base-branch metrics downloaded for direct comparison; may be absent on cold start.
const BASELINE_PATH = 'temp/perf-baseline/perf-metrics.json'
// One subdirectory per downloaded historical artifact, each holding a perf-metrics.json.
const HISTORY_DIR = 'temp/perf-history'
|
||||
|
||||
function formatDelta(pct: number): string {
|
||||
if (pct >= 20) return `+${pct.toFixed(0)}% 🔴`
|
||||
if (pct >= 10) return `+${pct.toFixed(0)}% 🟠`
|
||||
if (pct > -10) return `${pct >= 0 ? '+' : ''}${pct.toFixed(0)}% ⚪`
|
||||
return `${pct.toFixed(0)}% 🟢`
|
||||
// Numeric per-test metrics shown in the comparison tables.
// (heapDeltaBytes is handled separately — see renderNoBaselineReport.)
type MetricKey = 'styleRecalcs' | 'layouts' | 'taskDurationMs'

// Display metadata per reported metric: table label and unit suffix
// ('' for plain counts, 'ms' for durations — see formatValue).
const REPORTED_METRICS: { key: MetricKey; label: string; unit: string }[] = [
  { key: 'styleRecalcs', label: 'style recalcs', unit: '' },
  { key: 'layouts', label: 'layouts', unit: '' },
  { key: 'taskDurationMs', label: 'task duration', unit: 'ms' }
]
|
||||
|
||||
function groupByName(
|
||||
measurements: PerfMeasurement[]
|
||||
): Map<string, PerfMeasurement[]> {
|
||||
const map = new Map<string, PerfMeasurement[]>()
|
||||
for (const m of measurements) {
|
||||
const list = map.get(m.name) ?? []
|
||||
list.push(m)
|
||||
map.set(m.name, list)
|
||||
}
|
||||
return map
|
||||
}
|
||||
|
||||
function loadHistoricalReports(): PerfReport[] {
|
||||
if (!existsSync(HISTORY_DIR)) return []
|
||||
const reports: PerfReport[] = []
|
||||
for (const dir of readdirSync(HISTORY_DIR)) {
|
||||
const filePath = join(HISTORY_DIR, dir, 'perf-metrics.json')
|
||||
if (!existsSync(filePath)) continue
|
||||
try {
|
||||
reports.push(JSON.parse(readFileSync(filePath, 'utf-8')) as PerfReport)
|
||||
} catch {
|
||||
console.warn(`Skipping malformed perf history: ${filePath}`)
|
||||
}
|
||||
}
|
||||
return reports
|
||||
}
|
||||
|
||||
function getHistoricalStats(
|
||||
reports: PerfReport[],
|
||||
testName: string,
|
||||
metric: MetricKey
|
||||
): MetricStats {
|
||||
const values: number[] = []
|
||||
for (const r of reports) {
|
||||
const group = groupByName(r.measurements)
|
||||
const samples = group.get(testName)
|
||||
if (samples) {
|
||||
const mean =
|
||||
samples.reduce((sum, s) => sum + s[metric], 0) / samples.length
|
||||
values.push(mean)
|
||||
}
|
||||
}
|
||||
return computeStats(values)
|
||||
}
|
||||
|
||||
function computeCV(stats: MetricStats): number {
|
||||
return stats.mean > 0 ? (stats.stddev / stats.mean) * 100 : 0
|
||||
}
|
||||
|
||||
function formatValue(value: number, unit: string): string {
|
||||
return unit === 'ms' ? `${value.toFixed(0)}ms` : `${value.toFixed(0)}`
|
||||
}
|
||||
|
||||
function formatDelta(pct: number | null): string {
|
||||
if (pct === null) return '—'
|
||||
const sign = pct >= 0 ? '+' : ''
|
||||
return `${sign}${pct.toFixed(0)}%`
|
||||
}
|
||||
|
||||
function meanMetric(samples: PerfMeasurement[], key: MetricKey): number {
|
||||
return samples.reduce((sum, s) => sum + s[key], 0) / samples.length
|
||||
}
|
||||
|
||||
function formatBytes(bytes: number): string {
|
||||
@@ -34,18 +108,167 @@ function formatBytes(bytes: number): string {
|
||||
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
|
||||
}
|
||||
|
||||
function calcDelta(
|
||||
baseline: number,
|
||||
current: number
|
||||
): { pct: number; isNew: boolean } {
|
||||
if (baseline > 0) {
|
||||
return { pct: ((current - baseline) / baseline) * 100, isNew: false }
|
||||
/**
 * Renders the full statistical report: a flagged-regressions section, a
 * collapsible table of every metric, and a collapsible historical-variance
 * table. Used only once >= 2 historical main-branch runs exist (see main()).
 *
 * prGroups    — PR measurements grouped by test name (repeated runs per test)
 * baseline    — the base branch's most recent perf report
 * historical  — previously downloaded main-branch reports used for μ/σ
 *
 * Returns markdown lines for the PR comment.
 */
function renderFullReport(
  prGroups: Map<string, PerfMeasurement[]>,
  baseline: PerfReport,
  historical: PerfReport[]
): string[] {
  const lines: string[] = []
  const baselineGroups = groupByName(baseline.measurements)
  const tableHeader = [
    '| Metric | Baseline | PR (n=3) | Δ | Sig |',
    '|--------|----------|----------|---|-----|'
  ]

  // Rows flagged as noteworthy are duplicated into a prominent section.
  const flaggedRows: string[] = []
  const allRows: string[] = []

  for (const [testName, prSamples] of prGroups) {
    const baseSamples = baselineGroups.get(testName)

    for (const { key, label, unit } of REPORTED_METRICS) {
      // Mean over the PR's repeated runs reduces single-run noise.
      const prValues = prSamples.map((s) => s[key])
      const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
      const histStats = getHistoricalStats(historical, testName, key)
      const cv = computeCV(histStats)

      // Test is new in this PR — nothing to diff or score against.
      if (!baseSamples?.length) {
        allRows.push(
          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
        )
        continue
      }

      const baseVal = meanMetric(baseSamples, key)
      // 0 → 0 counts as no change; 0 → non-zero has no meaningful percentage
      // (null renders as an em dash).
      const deltaPct =
        baseVal === 0
          ? prMean === 0
            ? 0
            : null
          : ((prMean - baseVal) / baseVal) * 100
      // Significance comes from the historical distribution, not the delta.
      const z = zScore(prMean, histStats)
      const sig = classifyChange(z, cv)

      const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
      allRows.push(row)
      if (isNoteworthy(sig)) {
        flaggedRows.push(row)
      }
    }
  }

  if (flaggedRows.length > 0) {
    lines.push(
      `⚠️ **${flaggedRows.length} regression${flaggedRows.length > 1 ? 's' : ''} detected**`,
      '',
      ...tableHeader,
      ...flaggedRows,
      ''
    )
  } else {
    lines.push('No regressions detected.', '')
  }

  lines.push(
    `<details><summary>All metrics</summary>`,
    '',
    ...tableHeader,
    ...allRows,
    '',
    '</details>',
    ''
  )

  // Historical variance appendix: μ, σ, CV per metric with >= 2 data points.
  lines.push(
    `<details><summary>Historical variance (last ${historical.length} runs)</summary>`,
    '',
    '| Metric | μ | σ | CV |',
    '|--------|---|---|-----|'
  )
  for (const [testName] of prGroups) {
    for (const { key, label, unit } of REPORTED_METRICS) {
      const stats = getHistoricalStats(historical, testName, key)
      // σ is meaningless with fewer than two samples.
      if (stats.n < 2) continue
      const cv = computeCV(stats)
      lines.push(
        `| ${testName}: ${label} | ${formatValue(stats.mean, unit)} | ${formatValue(stats.stddev, unit)} | ${cv.toFixed(1)}% |`
      )
    }
  }
  lines.push('', '</details>')

  return lines
}
|
||||
|
||||
function formatDeltaCell(delta: { pct: number; isNew: boolean }): string {
|
||||
return delta.isNew ? 'new 🔴' : formatDelta(delta.pct)
|
||||
function renderColdStartReport(
|
||||
prGroups: Map<string, PerfMeasurement[]>,
|
||||
baseline: PerfReport,
|
||||
historicalCount: number
|
||||
): string[] {
|
||||
const lines: string[] = []
|
||||
const baselineGroups = groupByName(baseline.measurements)
|
||||
lines.push(
|
||||
`> ℹ️ Collecting baseline variance data (${historicalCount}/5 runs). Significance will appear after 2 main branch runs.`,
|
||||
'',
|
||||
'| Metric | Baseline | PR | Δ |',
|
||||
'|--------|----------|-----|---|'
|
||||
)
|
||||
|
||||
for (const [testName, prSamples] of prGroups) {
|
||||
const baseSamples = baselineGroups.get(testName)
|
||||
|
||||
for (const { key, label, unit } of REPORTED_METRICS) {
|
||||
const prValues = prSamples.map((s) => s[key])
|
||||
const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
|
||||
|
||||
if (!baseSamples?.length) {
|
||||
lines.push(
|
||||
`| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
const baseVal = meanMetric(baseSamples, key)
|
||||
const deltaPct =
|
||||
baseVal === 0
|
||||
? prMean === 0
|
||||
? 0
|
||||
: null
|
||||
: ((prMean - baseVal) / baseVal) * 100
|
||||
lines.push(
|
||||
`| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} |`
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
return lines
|
||||
}
|
||||
|
||||
function renderNoBaselineReport(
|
||||
prGroups: Map<string, PerfMeasurement[]>
|
||||
): string[] {
|
||||
const lines: string[] = []
|
||||
lines.push(
|
||||
'No baseline found — showing absolute values.\n',
|
||||
'| Metric | Value |',
|
||||
'|--------|-------|'
|
||||
)
|
||||
for (const [testName, prSamples] of prGroups) {
|
||||
const prMean = (key: MetricKey) =>
|
||||
prSamples.reduce((sum, s) => sum + s[key], 0) / prSamples.length
|
||||
|
||||
lines.push(
|
||||
`| ${testName}: style recalcs | ${prMean('styleRecalcs').toFixed(0)} |`
|
||||
)
|
||||
lines.push(`| ${testName}: layouts | ${prMean('layouts').toFixed(0)} |`)
|
||||
lines.push(
|
||||
`| ${testName}: task duration | ${prMean('taskDurationMs').toFixed(0)}ms |`
|
||||
)
|
||||
const heapMean =
|
||||
prSamples.reduce((sum, s) => sum + s.heapDeltaBytes, 0) / prSamples.length
|
||||
lines.push(`| ${testName}: heap delta | ${formatBytes(heapMean)} |`)
|
||||
}
|
||||
return lines
|
||||
}
|
||||
|
||||
function main() {
|
||||
@@ -62,55 +285,18 @@ function main() {
|
||||
? JSON.parse(readFileSync(BASELINE_PATH, 'utf-8'))
|
||||
: null
|
||||
|
||||
const historical = loadHistoricalReports()
|
||||
const prGroups = groupByName(current.measurements)
|
||||
|
||||
const lines: string[] = []
|
||||
lines.push('## ⚡ Performance Report\n')
|
||||
|
||||
if (baseline) {
|
||||
lines.push(
|
||||
'| Metric | Baseline | PR | Δ |',
|
||||
'|--------|----------|-----|---|'
|
||||
)
|
||||
|
||||
for (const m of current.measurements) {
|
||||
const base = baseline.measurements.find((b) => b.name === m.name)
|
||||
if (!base) {
|
||||
lines.push(`| ${m.name}: style recalcs | — | ${m.styleRecalcs} | new |`)
|
||||
lines.push(`| ${m.name}: layouts | — | ${m.layouts} | new |`)
|
||||
lines.push(
|
||||
`| ${m.name}: task duration | — | ${m.taskDurationMs.toFixed(0)}ms | new |`
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
const recalcDelta = calcDelta(base.styleRecalcs, m.styleRecalcs)
|
||||
lines.push(
|
||||
`| ${m.name}: style recalcs | ${base.styleRecalcs} | ${m.styleRecalcs} | ${formatDeltaCell(recalcDelta)} |`
|
||||
)
|
||||
|
||||
const layoutDelta = calcDelta(base.layouts, m.layouts)
|
||||
lines.push(
|
||||
`| ${m.name}: layouts | ${base.layouts} | ${m.layouts} | ${formatDeltaCell(layoutDelta)} |`
|
||||
)
|
||||
|
||||
const taskDelta = calcDelta(base.taskDurationMs, m.taskDurationMs)
|
||||
lines.push(
|
||||
`| ${m.name}: task duration | ${base.taskDurationMs.toFixed(0)}ms | ${m.taskDurationMs.toFixed(0)}ms | ${formatDeltaCell(taskDelta)} |`
|
||||
)
|
||||
}
|
||||
if (baseline && historical.length >= 2) {
|
||||
lines.push(...renderFullReport(prGroups, baseline, historical))
|
||||
} else if (baseline) {
|
||||
lines.push(...renderColdStartReport(prGroups, baseline, historical.length))
|
||||
} else {
|
||||
lines.push(
|
||||
'No baseline found — showing absolute values.\n',
|
||||
'| Metric | Value |',
|
||||
'|--------|-------|'
|
||||
)
|
||||
for (const m of current.measurements) {
|
||||
lines.push(`| ${m.name}: style recalcs | ${m.styleRecalcs} |`)
|
||||
lines.push(`| ${m.name}: layouts | ${m.layouts} |`)
|
||||
lines.push(
|
||||
`| ${m.name}: task duration | ${m.taskDurationMs.toFixed(0)}ms |`
|
||||
)
|
||||
lines.push(`| ${m.name}: heap delta | ${formatBytes(m.heapDeltaBytes)} |`)
|
||||
}
|
||||
lines.push(...renderNoBaselineReport(prGroups))
|
||||
}
|
||||
|
||||
lines.push('\n<details><summary>Raw data</summary>\n')
|
||||
|
||||
Reference in New Issue
Block a user