mirror of
https://github.com/Comfy-Org/ComfyUI_frontend.git
synced 2026-03-08 06:30:04 +00:00
## Summary

Replace fixed 10%/20% perf delta thresholds with dynamic σ-based classification using z-scores, eliminating false alarms from naturally noisy duration metrics (10-17% CV).

## Changes

- **What**:
  - Run each perf test 3× (`--repeat-each=3`) and report the mean, reducing single-run noise
  - Download last 5 successful main branch perf artifacts to compute historical μ/σ per metric
  - Replace fixed threshold flags with z-score significance: `⚠️ regression` (z>2), `✅ neutral/improvement`, `🔇 noisy` (CV>50%)
  - Add collapsible historical variance table (μ, σ, CV) to PR comment
  - Graceful cold start: falls back to simple delta table until ≥2 historical runs exist
  - New `scripts/perf-stats.ts` module with `computeStats`, `zScore`, `classifyChange`
  - 18 unit tests for stats functions
- **CI time impact**: ~3 min → ~5-6 min (repeat-each adds ~2 min, historical download <10s)

## Review Focus

- The `gh api` call in the new "Download historical perf baselines" step: it queries the last 5 successful push runs on the base branch. The `gh` CLI is available natively on `ubuntu-latest` runners and auto-authenticates with `GITHUB_TOKEN`.
- `getHistoricalStats` averages per-run measurements before computing cross-run σ — this is intentional since historical artifacts may also contain repeated measurements after this change lands.
- The `noisy` classification (CV>50%) suppresses metrics like `layouts` that hover near 0 and have meaningless percentage swings.

┆Issue is synchronized with this [Notion page](https://www.notion.so/PR-9305-feat-add-statistical-significance-to-perf-report-with-z-score-thresholds-3156d73d3650818d9360eeafd9ae7dc1) by [Unito](https://www.unito.io)
64 lines
1.5 KiB
TypeScript
64 lines
1.5 KiB
TypeScript
/**
 * Summary statistics describing one sample of numeric measurements.
 */
export interface MetricStats {
  /** Arithmetic mean of the sample. */
  mean: number
  /** Standard deviation of the sample. */
  stddev: number
  /** Smallest value in the sample. */
  min: number
  /** Largest value in the sample. */
  max: number
  /** Number of values the statistics were computed from. */
  n: number
}
|
|
|
|
/**
 * Computes summary statistics for a sample of measurements.
 *
 * The standard deviation is the sample standard deviation (Bessel's
 * correction, `n - 1` denominator).
 *
 * @param values - The measurements; the array is not modified.
 * @returns All-zero stats for an empty sample; for a single value, that
 *   value as mean/min/max with `stddev` 0; otherwise the full statistics.
 */
export function computeStats(values: number[]): MetricStats {
  const n = values.length
  if (n === 0) return { mean: 0, stddev: 0, min: 0, max: 0, n: 0 }
  if (n === 1) {
    const only = values[0]
    return { mean: only, stddev: 0, min: only, max: only, n: 1 }
  }

  // Accumulate sum/min/max in one pass. `Math.min(...values)` would pass
  // every element as a call argument and throws RangeError on large arrays.
  // Math.min/Math.max are still used pairwise so NaN propagates as before.
  let sum = 0
  let min = Infinity
  let max = -Infinity
  for (const v of values) {
    sum += v
    min = Math.min(min, v)
    max = Math.max(max, v)
  }
  const mean = sum / n

  let squaredDeviations = 0
  for (const v of values) {
    squaredDeviations += (v - mean) ** 2
  }
  // n >= 2 here, so the (n - 1) denominator is never zero.
  const variance = squaredDeviations / (n - 1)

  return {
    mean,
    stddev: Math.sqrt(variance),
    min,
    max,
    n
  }
}
|
|
|
|
/**
 * Computes how many standard deviations `value` lies from `stats.mean`.
 *
 * @param value - The new measurement to compare.
 * @param stats - Reference statistics to compare against.
 * @returns The z-score, or `null` when it is undefined: fewer than two
 *   reference samples, a standard deviation that is not strictly positive
 *   (zero, negative or NaN), or a NaN result.
 */
export function zScore(value: number, stats: MetricStats): number | null {
  // `!(stddev > 0)` rejects 0, negatives and NaN in one comparison.
  if (stats.n < 2 || !(stats.stddev > 0)) return null
  const z = (value - stats.mean) / stats.stddev
  // A NaN value or mean would otherwise leak through and render as "z=NaN".
  return Number.isNaN(z) ? null : z
}
|
|
|
|
/**
 * Label describing how a new measurement compares with historical data.
 *
 * - `'regression'`: the z-score is significantly above the historical mean.
 * - `'improvement'`: the z-score is significantly below the historical mean.
 * - `'neutral'`: no significant change, or no z-score is available.
 * - `'noisy'`: historical variation is too high for the comparison to mean much.
 */
export type Significance =
  | 'regression'
  | 'improvement'
  | 'neutral'
  | 'noisy'
|
|
|
|
/**
 * Classifies a measured change from its z-score and the metric's historical
 * coefficient of variation.
 *
 * The noise check runs first, so a noisy metric is reported as `'noisy'`
 * whatever its z-score is.
 *
 * @param z - Z-score of the new measurement, or `null` when unavailable.
 * @param historicalCV - Historical coefficient of variation, compared
 *   against `thresholds.noisyCV` (default 50).
 * @param thresholds - Optional overrides. `noisyCV`: above this value the
 *   metric is `'noisy'` (default 50). `zThreshold`: |z| strictly above this
 *   value is significant (default 2).
 * @returns `'noisy'`, `'regression'` (z above the threshold),
 *   `'improvement'` (z below the negated threshold), or `'neutral'`.
 */
export function classifyChange(
  z: number | null,
  historicalCV: number,
  thresholds: { noisyCV?: number; zThreshold?: number } = {}
): Significance {
  const { noisyCV = 50, zThreshold = 2 } = thresholds

  if (historicalCV > noisyCV) return 'noisy'
  if (z === null) return 'neutral'
  if (z > zThreshold) return 'regression'
  if (z < -zThreshold) return 'improvement'
  return 'neutral'
}
|
|
|
|
/**
 * Renders a significance label and its z-score as a short display string.
 *
 * @param sig - The classification to render.
 * @param z - The z-score behind the classification, or `null` if none.
 * @returns `'variance too high'` for `'noisy'`; otherwise `z=<one decimal>`,
 *   prefixed with a warning sign for `'regression'`. When `z` is `null` the
 *   result is the bare warning sign for `'regression'` and `'—'` for
 *   `'improvement'` and `'neutral'`.
 */
export function formatSignificance(
  sig: Significance,
  z: number | null
): string {
  // Format once and handle null explicitly instead of asserting `z!`, which
  // threw a TypeError when a significant label arrived without a z-score.
  const zText = z === null ? null : `z=${z.toFixed(1)}`

  switch (sig) {
    case 'regression':
      return zText === null ? '⚠️' : `⚠️ ${zText}`
    case 'improvement':
    case 'neutral':
      return zText === null ? '—' : zText
    case 'noisy':
      return 'variance too high'
  }
}
|
|
|
|
/**
 * Tells whether a classification should be called out.
 *
 * @param sig - The classification to check.
 * @returns `true` only for `'regression'`; `false` for every other label.
 */
export function isNoteworthy(sig: Significance): boolean {
  return sig === 'regression'
}
|