feat: upgrade QA pipeline to Gemini 3.x models

- qa-record.ts, qa-analyze-pr.ts: gemini-2.5-flash/pro → gemini-3.1-pro-preview
- qa-video-review.ts, qa-generate-test.ts: gemini-2.5-flash → gemini-3-flash-preview
- pr-qa.yaml: update hardcoded model reference
- Add docs/qa/models.md with model comparison and rationale
This commit is contained in:
snomiao
2026-03-24 04:51:42 +00:00
parent 27c64e1092
commit 79df405733
6 changed files with 67 additions and 9 deletions

View File

@@ -672,7 +672,7 @@ jobs:
--artifacts-dir qa-artifacts \
--output-dir video-reviews \
--video-file "$vid" \
--model gemini-2.5-flash $PR_CTX_FLAG $BEFORE_FLAG $TARGET_URL_FLAG || true
--model gemini-3-flash-preview $PR_CTX_FLAG $BEFORE_FLAG $TARGET_URL_FLAG || true
echo "::endgroup::"
done

58
docs/qa/models.md Normal file
View File

@@ -0,0 +1,58 @@
# QA Pipeline Model Selection
## Current Configuration
| Script | Role | Model | Why |
|---|---|---|---|
| `qa-analyze-pr.ts` | PR/issue analysis, QA guide generation | `gemini-3.1-pro-preview` | Needs deep reasoning over PR diffs, screenshots, and issue threads |
| `qa-record.ts` | Playwright step generation | `gemini-3.1-pro-preview` | Step quality is critical — must understand ComfyUI's canvas UI and produce precise action sequences |
| `qa-video-review.ts` | Video comparison review | `gemini-3-flash-preview` | Video analysis with structured output; flash is sufficient and faster |
| `qa-generate-test.ts` | Regression test generation | `gemini-3-flash-preview` | Code generation from QA reports; flash handles this well |
## Model Comparison
### Gemini 3.1 Pro vs GPT-5.4
| | Gemini 3.1 Pro Preview | GPT-5.4 |
|---|---|---|
| Context window | 1M tokens | 1M tokens |
| Max output | 65K tokens | 128K tokens |
| Video input | Yes | No |
| Image input | Yes | Yes |
| Audio input | Yes | No |
| Pricing (input) | $2/1M tokens | $2.50/1M tokens |
| Pricing (output) | $12/1M tokens | $15/1M tokens |
| Function calling | Yes | Yes |
| Code execution | Yes | Yes (interpreter) |
| Structured output | Yes | Yes |

**Why Gemini over GPT for QA:**
- Native video understanding (can review recordings directly)
- Lower cost at comparable quality
- Native multimodal input (screenshots, videos, audio from issue threads)
- Better price/performance for high-volume CI usage
### Gemini 3 Flash vs GPT-5.4 Mini
| | Gemini 3 Flash Preview | GPT-5.4 Mini |
|---|---|---|
| Context window | 1M tokens | 1M tokens |
| Pricing (input) | $0.50/1M tokens | $0.40/1M tokens |
| Pricing (output) | $3/1M tokens | $1.60/1M tokens |
| Video input | Yes | No |

**Why Gemini Flash for video review:**
- Video input support is required — GPT models cannot process video files
- Good enough quality for structured comparison reports
## Upgrade History
| Date | Change | Reason |
|---|---|---|
| 2026-03-24 | `gemini-2.5-flash` → `gemini-3.1-pro-preview` (record) | Shallow step generation; pro model needed for complex ComfyUI interactions |
| 2026-03-24 | `gemini-2.5-pro` → `gemini-3.1-pro-preview` (analyze) | Keep analysis on latest pro |
| 2026-03-24 | `gemini-2.5-flash` → `gemini-3-flash-preview` (review, test-gen) | Latest flash for cost-efficient tasks |
## Override
All scripts accept `--model <name>` to override the default. Pass any Gemini model ID.

View File

@@ -11,7 +11,7 @@
* --pr-number 10270 \
* --repo owner/repo \
* --output-dir qa-guides/ \
* [--model gemini-2.5-pro]
* [--model gemini-3.1-pro-preview]
*
* Env: GEMINI_API_KEY (required)
*/
@@ -67,7 +67,7 @@ interface Options {
function parseArgs(): Options {
const args = process.argv.slice(2)
const opts: Partial<Options> = {
model: 'gemini-2.5-pro',
model: 'gemini-3.1-pro-preview',
apiKey: process.env.GEMINI_API_KEY || '',
mediaBudgetBytes: 20 * 1024 * 1024,
maxVideoBytes: 10 * 1024 * 1024,

View File

@@ -8,7 +8,7 @@
* --qa-report <path> QA video review report (markdown)
* --pr-diff <path> PR diff file
* --output <path> Output .spec.ts file path
* --model <name> Gemini model (default: gemini-2.5-flash)
* --model <name> Gemini model (default: gemini-3-flash-preview)
*/
import { readFile, writeFile } from 'node:fs/promises'
import { basename, resolve } from 'node:path'
@@ -26,7 +26,7 @@ const DEFAULTS: CliOptions = {
qaReport: '',
prDiff: '',
output: '',
model: 'gemini-2.5-flash'
model: 'gemini-3-flash-preview'
}
// ── Fixture API reference for the prompt ────────────────────────────
@@ -171,7 +171,7 @@ Options:
--qa-report <path> QA video review report (markdown) [required]
--pr-diff <path> PR diff file [required]
--output <path> Output .spec.ts path [required]
--model <name> Gemini model (default: gemini-2.5-flash)`)
--model <name> Gemini model (default: gemini-3-flash-preview)`)
process.exit(0)
}
}

View File

@@ -65,7 +65,7 @@ interface Options {
function parseArgs(): Options {
const args = process.argv.slice(2)
const opts: Partial<Options> = {
model: 'gemini-2.5-flash',
model: 'gemini-3.1-pro-preview',
serverUrl: 'http://127.0.0.1:8188',
apiKey: process.env.GEMINI_API_KEY || ''
}

View File

@@ -29,7 +29,7 @@ const DEFAULT_OPTIONS: CliOptions = {
videoFile: '',
beforeVideo: '',
outputDir: './tmp',
model: 'gemini-2.5-flash',
model: 'gemini-3-flash-preview',
requestTimeoutMs: 300_000,
dryRun: false,
prContext: '',
@@ -50,7 +50,7 @@ Options:
--output-dir <path> Output directory for markdown reports
(default: ./tmp)
--model <name> Gemini model
(default: gemini-2.5-flash)
(default: gemini-3-flash-preview)
--request-timeout-ms <n> Request timeout in milliseconds
(default: 300000)
--pr-context <file> File with PR context (title, body, diff)