mirror of
https://github.com/Comfy-Org/ComfyUI_frontend.git
synced 2026-03-07 22:20:03 +00:00
feat: add statistical significance to perf report with z-score thresholds (#9305)
## Summary Replace fixed 10%/20% perf delta thresholds with dynamic σ-based classification using z-scores, eliminating false alarms from naturally noisy duration metrics (10-17% CV). ## Changes - **What**: - Run each perf test 3× (`--repeat-each=3`) and report the mean, reducing single-run noise - Download last 5 successful main branch perf artifacts to compute historical μ/σ per metric - Replace fixed threshold flags with z-score significance: `⚠️ regression` (z>2), `✅ neutral/improvement`, `🔇 noisy` (CV>50%) - Add collapsible historical variance table (μ, σ, CV) to PR comment - Graceful cold start: falls back to simple delta table until ≥2 historical runs exist - New `scripts/perf-stats.ts` module with `computeStats`, `zScore`, `classifyChange` - 18 unit tests for stats functions - **CI time impact**: ~3 min → ~5-6 min (repeat-each adds ~2 min, historical download <10s) ## Review Focus - The `gh api` call in the new "Download historical perf baselines" step: it queries the last 5 successful push runs on the base branch. The `gh` CLI is available natively on `ubuntu-latest` runners and auto-authenticates with `GITHUB_TOKEN`. - `getHistoricalStats` averages per-run measurements before computing cross-run σ — this is intentional since historical artifacts may also contain repeated measurements after this change lands. - The `noisy` classification (CV>50%) suppresses metrics like `layouts` that hover near 0 and have meaningless percentage swings. ┆Issue is synchronized with this [Notion page](https://www.notion.so/PR-9305-feat-add-statistical-significance-to-perf-report-with-z-score-thresholds-3156d73d3650818d9360eeafd9ae7dc1) by [Unito](https://www.unito.io)
This commit is contained in:
28
.github/workflows/ci-perf-report.yaml
vendored
28
.github/workflows/ci-perf-report.yaml
vendored
@@ -45,7 +45,7 @@ jobs:
|
||||
- name: Run performance tests
|
||||
id: perf
|
||||
continue-on-error: true
|
||||
run: pnpm exec playwright test --project=performance --workers=1
|
||||
run: pnpm exec playwright test --project=performance --workers=1 --repeat-each=3
|
||||
|
||||
- name: Upload perf metrics
|
||||
if: always()
|
||||
@@ -61,6 +61,7 @@ jobs:
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
pull-requests: write
|
||||
|
||||
@@ -90,6 +91,31 @@ jobs:
|
||||
path: temp/perf-baseline/
|
||||
if_no_artifact_found: warn
|
||||
|
||||
- name: Download historical perf baselines
|
||||
continue-on-error: true
|
||||
run: |
|
||||
RUNS=$(gh api \
|
||||
"/repos/${{ github.repository }}/actions/workflows/ci-perf-report.yaml/runs?branch=${{ github.event.pull_request.base.ref }}&event=push&status=success&per_page=5" \
|
||||
--jq '.workflow_runs[].id' || true)
|
||||
|
||||
if [ -z "$RUNS" ]; then
|
||||
echo "No historical runs available"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p temp/perf-history
|
||||
INDEX=0
|
||||
for RUN_ID in $RUNS; do
|
||||
DIR="temp/perf-history/$INDEX"
|
||||
mkdir -p "$DIR"
|
||||
gh run download "$RUN_ID" -n perf-metrics -D "$DIR/" 2>/dev/null || true
|
||||
INDEX=$((INDEX + 1))
|
||||
done
|
||||
|
||||
echo "Downloaded $(ls temp/perf-history/*/perf-metrics.json 2>/dev/null | wc -l) historical baselines"
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Generate perf report
|
||||
run: npx --yes tsx scripts/perf-report.ts > perf-report.md
|
||||
|
||||
|
||||
@@ -1,4 +1,14 @@
|
||||
import { existsSync, readFileSync } from 'node:fs'
|
||||
import { existsSync, readFileSync, readdirSync } from 'node:fs'
|
||||
import { join } from 'node:path'
|
||||
|
||||
import type { MetricStats } from './perf-stats'
|
||||
import {
|
||||
classifyChange,
|
||||
computeStats,
|
||||
formatSignificance,
|
||||
isNoteworthy,
|
||||
zScore
|
||||
} from './perf-stats'
|
||||
|
||||
interface PerfMeasurement {
|
||||
name: string
|
||||
@@ -20,12 +30,76 @@ interface PerfReport {
|
||||
|
||||
// Metrics produced by the current PR's perf run (presumably written by the
// Playwright performance project — confirm against the workflow's test step).
const CURRENT_PATH = 'test-results/perf-metrics.json'
// Latest main-branch baseline artifact downloaded by CI.
const BASELINE_PATH = 'temp/perf-baseline/perf-metrics.json'
// One numbered subdirectory per historical main-branch run, populated by the
// "Download historical perf baselines" workflow step.
const HISTORY_DIR = 'temp/perf-history'
|
||||
|
||||
function formatDelta(pct: number): string {
|
||||
if (pct >= 20) return `+${pct.toFixed(0)}% 🔴`
|
||||
if (pct >= 10) return `+${pct.toFixed(0)}% 🟠`
|
||||
if (pct > -10) return `${pct >= 0 ? '+' : ''}${pct.toFixed(0)}% ⚪`
|
||||
return `${pct.toFixed(0)}% 🟢`
|
||||
// Numeric fields of PerfMeasurement that get their own rows in the report.
type MetricKey = 'styleRecalcs' | 'layouts' | 'taskDurationMs'

// Display metadata per reported metric; unit 'ms' switches formatValue to
// millisecond formatting. heapDeltaBytes is not listed here — it is rendered
// separately via formatBytes in the no-baseline report.
const REPORTED_METRICS: { key: MetricKey; label: string; unit: string }[] = [
  { key: 'styleRecalcs', label: 'style recalcs', unit: '' },
  { key: 'layouts', label: 'layouts', unit: '' },
  { key: 'taskDurationMs', label: 'task duration', unit: 'ms' }
]
|
||||
|
||||
function groupByName(
|
||||
measurements: PerfMeasurement[]
|
||||
): Map<string, PerfMeasurement[]> {
|
||||
const map = new Map<string, PerfMeasurement[]>()
|
||||
for (const m of measurements) {
|
||||
const list = map.get(m.name) ?? []
|
||||
list.push(m)
|
||||
map.set(m.name, list)
|
||||
}
|
||||
return map
|
||||
}
|
||||
|
||||
function loadHistoricalReports(): PerfReport[] {
|
||||
if (!existsSync(HISTORY_DIR)) return []
|
||||
const reports: PerfReport[] = []
|
||||
for (const dir of readdirSync(HISTORY_DIR)) {
|
||||
const filePath = join(HISTORY_DIR, dir, 'perf-metrics.json')
|
||||
if (!existsSync(filePath)) continue
|
||||
try {
|
||||
reports.push(JSON.parse(readFileSync(filePath, 'utf-8')) as PerfReport)
|
||||
} catch {
|
||||
console.warn(`Skipping malformed perf history: ${filePath}`)
|
||||
}
|
||||
}
|
||||
return reports
|
||||
}
|
||||
|
||||
function getHistoricalStats(
|
||||
reports: PerfReport[],
|
||||
testName: string,
|
||||
metric: MetricKey
|
||||
): MetricStats {
|
||||
const values: number[] = []
|
||||
for (const r of reports) {
|
||||
const group = groupByName(r.measurements)
|
||||
const samples = group.get(testName)
|
||||
if (samples) {
|
||||
const mean =
|
||||
samples.reduce((sum, s) => sum + s[metric], 0) / samples.length
|
||||
values.push(mean)
|
||||
}
|
||||
}
|
||||
return computeStats(values)
|
||||
}
|
||||
|
||||
function computeCV(stats: MetricStats): number {
|
||||
return stats.mean > 0 ? (stats.stddev / stats.mean) * 100 : 0
|
||||
}
|
||||
|
||||
function formatValue(value: number, unit: string): string {
|
||||
return unit === 'ms' ? `${value.toFixed(0)}ms` : `${value.toFixed(0)}`
|
||||
}
|
||||
|
||||
function formatDelta(pct: number | null): string {
|
||||
if (pct === null) return '—'
|
||||
const sign = pct >= 0 ? '+' : ''
|
||||
return `${sign}${pct.toFixed(0)}%`
|
||||
}
|
||||
|
||||
function meanMetric(samples: PerfMeasurement[], key: MetricKey): number {
|
||||
return samples.reduce((sum, s) => sum + s[key], 0) / samples.length
|
||||
}
|
||||
|
||||
function formatBytes(bytes: number): string {
|
||||
@@ -34,18 +108,167 @@ function formatBytes(bytes: number): string {
|
||||
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
|
||||
}
|
||||
|
||||
function calcDelta(
|
||||
baseline: number,
|
||||
current: number
|
||||
): { pct: number; isNew: boolean } {
|
||||
if (baseline > 0) {
|
||||
return { pct: ((current - baseline) / baseline) * 100, isNew: false }
|
||||
/**
 * Render the full statistical report, used once a baseline AND at least two
 * historical main-branch runs exist.
 *
 * Emits three sections:
 *  1. A summary: a table of flagged regressions, or "No regressions detected."
 *  2. A collapsible "All metrics" table (baseline mean, PR mean, Δ%, significance).
 *  3. A collapsible historical-variance table (μ, σ, CV per metric).
 *
 * @param prGroups   PR measurements grouped by test name (repeat-each samples).
 * @param baseline   Latest main-branch report.
 * @param historical Previously downloaded main-branch reports.
 * @returns Markdown lines for the PR comment.
 */
function renderFullReport(
  prGroups: Map<string, PerfMeasurement[]>,
  baseline: PerfReport,
  historical: PerfReport[]
): string[] {
  const lines: string[] = []
  const baselineGroups = groupByName(baseline.measurements)
  // NOTE(review): "PR (n=3)" hard-codes the workflow's --repeat-each=3;
  // keep in sync with ci-perf-report.yaml if the repeat count changes.
  const tableHeader = [
    '| Metric | Baseline | PR (n=3) | Δ | Sig |',
    '|--------|----------|----------|---|-----|'
  ]

  const flaggedRows: string[] = []
  const allRows: string[] = []

  for (const [testName, prSamples] of prGroups) {
    const baseSamples = baselineGroups.get(testName)

    for (const { key, label, unit } of REPORTED_METRICS) {
      const prValues = prSamples.map((s) => s[key])
      const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length
      const histStats = getHistoricalStats(historical, testName, key)
      const cv = computeCV(histStats)

      // Test absent from baseline: report the PR value as "new", no delta/sig.
      if (!baseSamples?.length) {
        allRows.push(
          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new | — |`
        )
        continue
      }

      const baseVal = meanMetric(baseSamples, key)
      // Δ% is undefined (null → em dash) when baseline is 0 but PR is not.
      const deltaPct =
        baseVal === 0
          ? prMean === 0
            ? 0
            : null
          : ((prMean - baseVal) / baseVal) * 100
      // Significance comes from historical variance, not from the delta:
      // z > 2 flags a regression; CV > 50% suppresses the metric as noisy.
      const z = zScore(prMean, histStats)
      const sig = classifyChange(z, cv)

      const row = `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} | ${formatSignificance(sig, z)} |`
      allRows.push(row)
      if (isNoteworthy(sig)) {
        flaggedRows.push(row)
      }
    }
  }

  // Section 1: regressions summary (only 'regression' rows are flagged).
  if (flaggedRows.length > 0) {
    lines.push(
      `⚠️ **${flaggedRows.length} regression${flaggedRows.length > 1 ? 's' : ''} detected**`,
      '',
      ...tableHeader,
      ...flaggedRows,
      ''
    )
  } else {
    lines.push('No regressions detected.', '')
  }

  // Section 2: every metric, collapsed by default.
  lines.push(
    `<details><summary>All metrics</summary>`,
    '',
    ...tableHeader,
    ...allRows,
    '',
    '</details>',
    ''
  )

  // Section 3: historical μ/σ/CV; rows with fewer than two historical
  // samples are omitted (no meaningful variance yet).
  lines.push(
    `<details><summary>Historical variance (last ${historical.length} runs)</summary>`,
    '',
    '| Metric | μ | σ | CV |',
    '|--------|---|---|-----|'
  )
  for (const [testName] of prGroups) {
    for (const { key, label, unit } of REPORTED_METRICS) {
      const stats = getHistoricalStats(historical, testName, key)
      if (stats.n < 2) continue
      const cv = computeCV(stats)
      lines.push(
        `| ${testName}: ${label} | ${formatValue(stats.mean, unit)} | ${formatValue(stats.stddev, unit)} | ${cv.toFixed(1)}% |`
      )
    }
  }
  lines.push('', '</details>')

  return lines
}
|
||||
|
||||
function formatDeltaCell(delta: { pct: number; isNew: boolean }): string {
|
||||
return delta.isNew ? 'new 🔴' : formatDelta(delta.pct)
|
||||
/**
 * Render the "cold start" report, used when a baseline exists but fewer than
 * two historical runs are available: a plain delta table with a note that
 * significance will appear once enough main-branch variance data exists.
 *
 * @param prGroups        PR measurements grouped by test name.
 * @param baseline        Latest main-branch report.
 * @param historicalCount How many historical runs were found (shown as X/5).
 * @returns Markdown lines for the PR comment.
 */
function renderColdStartReport(
  prGroups: Map<string, PerfMeasurement[]>,
  baseline: PerfReport,
  historicalCount: number
): string[] {
  const lines: string[] = []
  const baselineGroups = groupByName(baseline.measurements)
  lines.push(
    `> ℹ️ Collecting baseline variance data (${historicalCount}/5 runs). Significance will appear after 2 main branch runs.`,
    '',
    '| Metric | Baseline | PR | Δ |',
    '|--------|----------|-----|---|'
  )

  for (const [testName, prSamples] of prGroups) {
    const baseSamples = baselineGroups.get(testName)

    for (const { key, label, unit } of REPORTED_METRICS) {
      const prValues = prSamples.map((s) => s[key])
      const prMean = prValues.reduce((a, b) => a + b, 0) / prValues.length

      // Test absent from baseline: show the PR value as "new".
      if (!baseSamples?.length) {
        lines.push(
          `| ${testName}: ${label} | — | ${formatValue(prMean, unit)} | new |`
        )
        continue
      }

      const baseVal = meanMetric(baseSamples, key)
      // Δ% is undefined (null → em dash) when baseline is 0 but PR is not.
      const deltaPct =
        baseVal === 0
          ? prMean === 0
            ? 0
            : null
          : ((prMean - baseVal) / baseVal) * 100
      lines.push(
        `| ${testName}: ${label} | ${formatValue(baseVal, unit)} | ${formatValue(prMean, unit)} | ${formatDelta(deltaPct)} |`
      )
    }
  }

  return lines
}
|
||||
|
||||
/**
 * Render the fallback report when no baseline artifact exists at all:
 * absolute PR values only (no deltas, no significance). Unlike the other
 * renderers this also reports the mean heap delta, formatted via formatBytes.
 *
 * @param prGroups PR measurements grouped by test name.
 * @returns Markdown lines for the PR comment.
 */
function renderNoBaselineReport(
  prGroups: Map<string, PerfMeasurement[]>
): string[] {
  const lines: string[] = []
  lines.push(
    'No baseline found — showing absolute values.\n',
    '| Metric | Value |',
    '|--------|-------|'
  )
  for (const [testName, prSamples] of prGroups) {
    // Mean of one metric across this test's repeated samples.
    const prMean = (key: MetricKey) =>
      prSamples.reduce((sum, s) => sum + s[key], 0) / prSamples.length

    lines.push(
      `| ${testName}: style recalcs | ${prMean('styleRecalcs').toFixed(0)} |`
    )
    lines.push(`| ${testName}: layouts | ${prMean('layouts').toFixed(0)} |`)
    lines.push(
      `| ${testName}: task duration | ${prMean('taskDurationMs').toFixed(0)}ms |`
    )
    // heapDeltaBytes is not in REPORTED_METRICS, so it is averaged here.
    const heapMean =
      prSamples.reduce((sum, s) => sum + s.heapDeltaBytes, 0) / prSamples.length
    lines.push(`| ${testName}: heap delta | ${formatBytes(heapMean)} |`)
  }
  return lines
}
|
||||
|
||||
function main() {
|
||||
@@ -62,55 +285,18 @@ function main() {
|
||||
? JSON.parse(readFileSync(BASELINE_PATH, 'utf-8'))
|
||||
: null
|
||||
|
||||
const historical = loadHistoricalReports()
|
||||
const prGroups = groupByName(current.measurements)
|
||||
|
||||
const lines: string[] = []
|
||||
lines.push('## ⚡ Performance Report\n')
|
||||
|
||||
if (baseline) {
|
||||
lines.push(
|
||||
'| Metric | Baseline | PR | Δ |',
|
||||
'|--------|----------|-----|---|'
|
||||
)
|
||||
|
||||
for (const m of current.measurements) {
|
||||
const base = baseline.measurements.find((b) => b.name === m.name)
|
||||
if (!base) {
|
||||
lines.push(`| ${m.name}: style recalcs | — | ${m.styleRecalcs} | new |`)
|
||||
lines.push(`| ${m.name}: layouts | — | ${m.layouts} | new |`)
|
||||
lines.push(
|
||||
`| ${m.name}: task duration | — | ${m.taskDurationMs.toFixed(0)}ms | new |`
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
const recalcDelta = calcDelta(base.styleRecalcs, m.styleRecalcs)
|
||||
lines.push(
|
||||
`| ${m.name}: style recalcs | ${base.styleRecalcs} | ${m.styleRecalcs} | ${formatDeltaCell(recalcDelta)} |`
|
||||
)
|
||||
|
||||
const layoutDelta = calcDelta(base.layouts, m.layouts)
|
||||
lines.push(
|
||||
`| ${m.name}: layouts | ${base.layouts} | ${m.layouts} | ${formatDeltaCell(layoutDelta)} |`
|
||||
)
|
||||
|
||||
const taskDelta = calcDelta(base.taskDurationMs, m.taskDurationMs)
|
||||
lines.push(
|
||||
`| ${m.name}: task duration | ${base.taskDurationMs.toFixed(0)}ms | ${m.taskDurationMs.toFixed(0)}ms | ${formatDeltaCell(taskDelta)} |`
|
||||
)
|
||||
}
|
||||
if (baseline && historical.length >= 2) {
|
||||
lines.push(...renderFullReport(prGroups, baseline, historical))
|
||||
} else if (baseline) {
|
||||
lines.push(...renderColdStartReport(prGroups, baseline, historical.length))
|
||||
} else {
|
||||
lines.push(
|
||||
'No baseline found — showing absolute values.\n',
|
||||
'| Metric | Value |',
|
||||
'|--------|-------|'
|
||||
)
|
||||
for (const m of current.measurements) {
|
||||
lines.push(`| ${m.name}: style recalcs | ${m.styleRecalcs} |`)
|
||||
lines.push(`| ${m.name}: layouts | ${m.layouts} |`)
|
||||
lines.push(
|
||||
`| ${m.name}: task duration | ${m.taskDurationMs.toFixed(0)}ms |`
|
||||
)
|
||||
lines.push(`| ${m.name}: heap delta | ${formatBytes(m.heapDeltaBytes)} |`)
|
||||
}
|
||||
lines.push(...renderNoBaselineReport(prGroups))
|
||||
}
|
||||
|
||||
lines.push('\n<details><summary>Raw data</summary>\n')
|
||||
|
||||
133
scripts/perf-stats.test.ts
Normal file
133
scripts/perf-stats.test.ts
Normal file
@@ -0,0 +1,133 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
|
||||
import {
|
||||
classifyChange,
|
||||
computeStats,
|
||||
formatSignificance,
|
||||
isNoteworthy,
|
||||
zScore
|
||||
} from './perf-stats'
|
||||
|
||||
// Unit tests for the perf-stats helpers used by the CI performance report.
// Coverage: summary statistics, z-score edge cases, significance
// classification thresholds (z=±2, CV=50%), and report-cell formatting.
describe('computeStats', () => {
  it('returns zeros for empty array', () => {
    const stats = computeStats([])
    expect(stats).toEqual({ mean: 0, stddev: 0, min: 0, max: 0, n: 0 })
  })

  it('returns value with zero stddev for single element', () => {
    const stats = computeStats([42])
    expect(stats).toEqual({ mean: 42, stddev: 0, min: 42, max: 42, n: 1 })
  })

  it('computes correct stats for known values', () => {
    // Values: [2, 4, 4, 4, 5, 5, 7, 9]
    // Mean = 5, sample variance ≈ 4.57, sample stddev ≈ 2.14
    const stats = computeStats([2, 4, 4, 4, 5, 5, 7, 9])
    expect(stats.mean).toBe(5)
    expect(stats.stddev).toBeCloseTo(2.138, 2)
    expect(stats.min).toBe(2)
    expect(stats.max).toBe(9)
    expect(stats.n).toBe(8)
  })

  it('uses sample stddev (n-1 denominator)', () => {
    // [10, 20] → mean=15, variance=(25+25)/1=50, stddev≈7.07
    const stats = computeStats([10, 20])
    expect(stats.mean).toBe(15)
    expect(stats.stddev).toBeCloseTo(7.071, 2)
    expect(stats.n).toBe(2)
  })

  it('handles identical values', () => {
    const stats = computeStats([5, 5, 5, 5])
    expect(stats.mean).toBe(5)
    expect(stats.stddev).toBe(0)
  })
})

describe('zScore', () => {
  // A z-score is undefined (null) when there is no spread or too few samples.
  it('returns null when stddev is 0', () => {
    const stats = computeStats([5, 5, 5])
    expect(zScore(10, stats)).toBeNull()
  })

  it('returns null when n < 2', () => {
    const stats = computeStats([5])
    expect(zScore(10, stats)).toBeNull()
  })

  it('computes correct z-score', () => {
    const stats = { mean: 100, stddev: 10, min: 80, max: 120, n: 5 }
    expect(zScore(120, stats)).toBe(2)
    expect(zScore(80, stats)).toBe(-2)
    expect(zScore(100, stats)).toBe(0)
  })
})

describe('classifyChange', () => {
  // The noisy check (CV > 50%) takes precedence over z-score interpretation.
  it('returns noisy when CV > 50%', () => {
    expect(classifyChange(3, 60)).toBe('noisy')
    expect(classifyChange(-3, 51)).toBe('noisy')
  })

  it('does not classify as noisy when CV is exactly 50%', () => {
    expect(classifyChange(3, 50)).toBe('regression')
    expect(classifyChange(-3, 50)).toBe('improvement')
  })

  it('returns neutral when z is null', () => {
    expect(classifyChange(null, 10)).toBe('neutral')
  })

  it('returns regression when z > 2', () => {
    expect(classifyChange(2.1, 10)).toBe('regression')
    expect(classifyChange(5, 10)).toBe('regression')
  })

  it('returns improvement when z < -2', () => {
    expect(classifyChange(-2.1, 10)).toBe('improvement')
    expect(classifyChange(-5, 10)).toBe('improvement')
  })

  // Thresholds are exclusive: exactly ±2 stays neutral.
  it('returns neutral when z is within [-2, 2]', () => {
    expect(classifyChange(0, 10)).toBe('neutral')
    expect(classifyChange(1.9, 10)).toBe('neutral')
    expect(classifyChange(-1.9, 10)).toBe('neutral')
    expect(classifyChange(2, 10)).toBe('neutral')
    expect(classifyChange(-2, 10)).toBe('neutral')
  })
})

describe('formatSignificance', () => {
  it('formats regression with z-score and emoji', () => {
    expect(formatSignificance('regression', 3.2)).toBe('⚠️ z=3.2')
  })

  it('formats improvement with z-score without emoji', () => {
    expect(formatSignificance('improvement', -2.5)).toBe('z=-2.5')
  })

  it('formats noisy as descriptive text', () => {
    expect(formatSignificance('noisy', null)).toBe('variance too high')
  })

  it('formats neutral with z-score without emoji', () => {
    expect(formatSignificance('neutral', 0.5)).toBe('z=0.5')
  })

  it('formats neutral without z-score as dash', () => {
    expect(formatSignificance('neutral', null)).toBe('—')
  })
})

describe('isNoteworthy', () => {
  // Only regressions are surfaced in the report's flagged-rows table.
  it('returns true for regressions', () => {
    expect(isNoteworthy('regression')).toBe(true)
  })

  it('returns false for non-regressions', () => {
    expect(isNoteworthy('improvement')).toBe(false)
    expect(isNoteworthy('neutral')).toBe(false)
    expect(isNoteworthy('noisy')).toBe(false)
  })
})
|
||||
63
scripts/perf-stats.ts
Normal file
63
scripts/perf-stats.ts
Normal file
@@ -0,0 +1,63 @@
|
||||
/**
 * Summary statistics for one metric across a set of runs.
 * Produced by {@link computeStats}; consumed by zScore and the report's
 * coefficient-of-variation calculations.
 */
export interface MetricStats {
  // Arithmetic mean of the samples (0 when there are no samples).
  mean: number
  // Sample standard deviation (n-1 denominator); 0 when n < 2.
  stddev: number
  min: number
  max: number
  // Number of samples the stats were computed from.
  n: number
}
|
||||
|
||||
export function computeStats(values: number[]): MetricStats {
|
||||
const n = values.length
|
||||
if (n === 0) return { mean: 0, stddev: 0, min: 0, max: 0, n: 0 }
|
||||
if (n === 1)
|
||||
return { mean: values[0], stddev: 0, min: values[0], max: values[0], n: 1 }
|
||||
|
||||
const mean = values.reduce((a, b) => a + b, 0) / n
|
||||
const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / (n - 1)
|
||||
|
||||
return {
|
||||
mean,
|
||||
stddev: Math.sqrt(variance),
|
||||
min: Math.min(...values),
|
||||
max: Math.max(...values),
|
||||
n
|
||||
}
|
||||
}
|
||||
|
||||
export function zScore(value: number, stats: MetricStats): number | null {
|
||||
if (stats.stddev === 0 || stats.n < 2) return null
|
||||
return (value - stats.mean) / stats.stddev
|
||||
}
|
||||
|
||||
export type Significance = 'regression' | 'improvement' | 'neutral' | 'noisy'
|
||||
|
||||
export function classifyChange(
|
||||
z: number | null,
|
||||
historicalCV: number
|
||||
): Significance {
|
||||
if (historicalCV > 50) return 'noisy'
|
||||
if (z === null) return 'neutral'
|
||||
if (z > 2) return 'regression'
|
||||
if (z < -2) return 'improvement'
|
||||
return 'neutral'
|
||||
}
|
||||
|
||||
export function formatSignificance(
|
||||
sig: Significance,
|
||||
z: number | null
|
||||
): string {
|
||||
switch (sig) {
|
||||
case 'regression':
|
||||
return `⚠️ z=${z!.toFixed(1)}`
|
||||
case 'improvement':
|
||||
return `z=${z!.toFixed(1)}`
|
||||
case 'noisy':
|
||||
return 'variance too high'
|
||||
case 'neutral':
|
||||
return z !== null ? `z=${z.toFixed(1)}` : '—'
|
||||
}
|
||||
}
|
||||
|
||||
export function isNoteworthy(sig: Significance): boolean {
|
||||
return sig === 'regression'
|
||||
}
|
||||
@@ -618,7 +618,8 @@ export default defineConfig({
|
||||
retry: process.env.CI ? 2 : 0,
|
||||
include: [
|
||||
'src/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}',
|
||||
'packages/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'
|
||||
'packages/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}',
|
||||
'scripts/**/*.{test,spec}.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'
|
||||
],
|
||||
coverage: {
|
||||
reporter: ['text', 'json', 'html']
|
||||
|
||||
Reference in New Issue
Block a user