mirror of
https://github.com/Comfy-Org/ComfyUI_frontend.git
synced 2026-04-20 06:20:11 +00:00
feat: three-phase QA pipeline — Research → Reproduce → Report
Phase 1 (qa-agent.ts): Claude investigates via a11y API only. - No video, no Gemini vision — only page.accessibility.snapshot() - Every action logged with a11y before/after state - done() requires evidence citing inspect() results - Outputs reproduction plan for Phase 2 Phase 2 (qa-reproduce.ts): Deterministic replay of research plan. - Executes each step with a11y assertions - Gemini describes visual changes (narration for humans) - Clean focused video with subtitles Phase 3: Report job reads research-log.json for verdict (ground truth), narration-log.json for descriptions, video for visuals. Gemini formats logs into report — never determines verdict. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,166 +1,57 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Hybrid QA Agent — Claude Sonnet 4.6 brain + Gemini 3.1 Pro eyes
|
||||
* QA Research Phase — Claude Sonnet 4.6 investigates via a11y API
|
||||
*
|
||||
* Claude plans and reasons. Gemini watches the video buffer and describes
|
||||
* what it sees. The agent uses 4 tools:
|
||||
* - observe(seconds, focus) — Gemini reviews last N seconds of video
|
||||
* - inspect(selector) — search accessibility tree for element state
|
||||
* - perform(action, params) — execute Playwright action
|
||||
* - done(verdict, summary) — finish with result
|
||||
* Claude explores the UI using accessibility tree assertions as ground truth.
|
||||
* NO video, NO Gemini vision — only DOM state via page.accessibility.snapshot().
|
||||
*
|
||||
* Tools:
|
||||
* - inspect(selector) — search a11y tree for element state (source of truth)
|
||||
* - perform(action, params) — execute Playwright action + auto-log a11y before/after
|
||||
* - done(verdict, summary, reproductionPlan) — finish with evidence-backed conclusion
|
||||
*/
|
||||
|
||||
import type { Page } from '@playwright/test'
|
||||
import { query, tool, createSdkMcpServer } from '@anthropic-ai/claude-agent-sdk'
|
||||
import { GoogleGenerativeAI } from '@google/generative-ai'
|
||||
import { z } from 'zod'
|
||||
import { execSync } from 'child_process'
|
||||
import { mkdirSync, writeFileSync, readFileSync } from 'fs'
|
||||
import { mkdirSync, writeFileSync } from 'fs'
|
||||
|
||||
// ── Types ──
|
||||
|
||||
interface AgentOptions {
|
||||
interface ResearchOptions {
|
||||
page: Page
|
||||
issueContext: string
|
||||
qaGuide: string
|
||||
outputDir: string
|
||||
geminiApiKey: string
|
||||
anthropicApiKey?: string // Optional — Agent SDK auto-detects Claude Code session
|
||||
anthropicApiKey?: string
|
||||
maxTurns?: number
|
||||
timeBudgetMs?: number
|
||||
}
|
||||
|
||||
interface ScreenshotFrame {
|
||||
export interface ResearchTurn {
|
||||
turn: number
|
||||
timestampMs: number
|
||||
base64: string
|
||||
toolName: string
|
||||
toolInput: unknown
|
||||
toolResult: string
|
||||
a11yBefore?: unknown
|
||||
a11yAfter?: unknown
|
||||
}
|
||||
|
||||
// ── Video buffer ──
|
||||
|
||||
const FRAME_INTERVAL_MS = 2000
|
||||
const MAX_BUFFER_FRAMES = 30 // 60 seconds at 2fps
|
||||
|
||||
class VideoBuffer {
|
||||
private frames: ScreenshotFrame[] = []
|
||||
private startMs = Date.now()
|
||||
private intervalId: ReturnType<typeof setInterval> | null = null
|
||||
private page: Page
|
||||
|
||||
constructor(page: Page) {
|
||||
this.page = page
|
||||
}
|
||||
|
||||
start() {
|
||||
this.startMs = Date.now()
|
||||
this.intervalId = setInterval(async () => {
|
||||
try {
|
||||
const buf = await this.page.screenshot({
|
||||
type: 'jpeg',
|
||||
quality: 60
|
||||
})
|
||||
this.frames.push({
|
||||
timestampMs: Date.now() - this.startMs,
|
||||
base64: buf.toString('base64')
|
||||
})
|
||||
if (this.frames.length > MAX_BUFFER_FRAMES) {
|
||||
this.frames.shift()
|
||||
}
|
||||
} catch {
|
||||
// page may be navigating
|
||||
}
|
||||
}, FRAME_INTERVAL_MS)
|
||||
}
|
||||
|
||||
stop() {
|
||||
if (this.intervalId) clearInterval(this.intervalId)
|
||||
}
|
||||
|
||||
getLastFrames(seconds: number): ScreenshotFrame[] {
|
||||
const cutoffMs = Date.now() - this.startMs - seconds * 1000
|
||||
return this.frames.filter((f) => f.timestampMs >= cutoffMs)
|
||||
}
|
||||
|
||||
async buildVideoClip(
|
||||
seconds: number,
|
||||
outputDir: string
|
||||
): Promise<Buffer | null> {
|
||||
const frames = this.getLastFrames(seconds)
|
||||
if (frames.length < 2) return null
|
||||
|
||||
const clipDir = `${outputDir}/.clip-frames`
|
||||
mkdirSync(clipDir, { recursive: true })
|
||||
|
||||
// Write frames as numbered JPEGs
|
||||
for (let i = 0; i < frames.length; i++) {
|
||||
writeFileSync(
|
||||
`${clipDir}/frame-${String(i).padStart(4, '0')}.jpg`,
|
||||
Buffer.from(frames[i].base64, 'base64')
|
||||
)
|
||||
}
|
||||
|
||||
// Compose into video with ffmpeg
|
||||
const clipPath = `${outputDir}/.observe-clip.mp4`
|
||||
try {
|
||||
const fps = Math.max(1, Math.round(frames.length / seconds))
|
||||
execSync(
|
||||
`ffmpeg -y -framerate ${fps} -i "${clipDir}/frame-%04d.jpg" ` +
|
||||
`-c:v libx264 -preset ultrafast -pix_fmt yuv420p "${clipPath}" 2>/dev/null`,
|
||||
{ timeout: 10000 }
|
||||
)
|
||||
return readFileSync(clipPath)
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
}
|
||||
export interface ReproductionStep {
|
||||
action: Record<string, unknown> & { action: string }
|
||||
expectedAssertion: string
|
||||
}
|
||||
|
||||
// ── Gemini Vision ──
|
||||
|
||||
async function geminiObserve(
|
||||
videoBuffer: VideoBuffer,
|
||||
seconds: number,
|
||||
focus: string,
|
||||
outputDir: string,
|
||||
geminiApiKey: string
|
||||
): Promise<string> {
|
||||
const genAI = new GoogleGenerativeAI(geminiApiKey)
|
||||
const model = genAI.getGenerativeModel({
|
||||
model: 'gemini-3-flash-preview'
|
||||
})
|
||||
|
||||
// Try video clip first, fall back to last frame
|
||||
const clip = await videoBuffer.buildVideoClip(seconds, outputDir)
|
||||
|
||||
const parts: Array<
|
||||
{ text: string } | { inlineData: { mimeType: string; data: string } }
|
||||
> = [
|
||||
{
|
||||
text: `You are observing a ComfyUI frontend session. Focus on: ${focus}\n\nDescribe what happened in the last ${seconds} seconds. Be specific about UI state, actions taken, and results.`
|
||||
}
|
||||
]
|
||||
|
||||
if (clip) {
|
||||
parts.push({
|
||||
inlineData: { mimeType: 'video/mp4', data: clip.toString('base64') }
|
||||
})
|
||||
} else {
|
||||
// Fall back to last frame
|
||||
const frames = videoBuffer.getLastFrames(seconds)
|
||||
if (frames.length > 0) {
|
||||
parts.push({
|
||||
inlineData: {
|
||||
mimeType: 'image/jpeg',
|
||||
data: frames[frames.length - 1].base64
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
const result = await model.generateContent(parts)
|
||||
return result.response.text().trim()
|
||||
export interface ResearchResult {
|
||||
verdict: 'REPRODUCED' | 'NOT_REPRODUCIBLE' | 'INCONCLUSIVE'
|
||||
summary: string
|
||||
evidence: string
|
||||
reproductionPlan: ReproductionStep[]
|
||||
log: ResearchTurn[]
|
||||
}
|
||||
|
||||
// ── Accessibility tree helpers ──
|
||||
// ── A11y helpers ──
|
||||
|
||||
interface A11yNode {
|
||||
role: string
|
||||
@@ -174,16 +65,12 @@ interface A11yNode {
|
||||
function searchA11y(node: A11yNode | null, selector: string): A11yNode | null {
|
||||
if (!node) return null
|
||||
const sel = selector.toLowerCase()
|
||||
|
||||
// Match by name or role
|
||||
if (
|
||||
node.name?.toLowerCase().includes(sel) ||
|
||||
node.role?.toLowerCase().includes(sel)
|
||||
) {
|
||||
return node
|
||||
}
|
||||
|
||||
// Recurse into children
|
||||
if (node.children) {
|
||||
for (const child of node.children) {
|
||||
const found = searchA11y(child, selector)
|
||||
@@ -213,174 +100,77 @@ function flattenA11y(node: A11yNode | null, depth = 0): string {
|
||||
return parts.filter(Boolean).join('\n')
|
||||
}
|
||||
|
||||
// ── Subtitle overlay ──
|
||||
// ── Main research function ──
|
||||
|
||||
async function showSubtitle(page: Page, text: string, turn: number) {
|
||||
const encoded = encodeURIComponent(
|
||||
text.slice(0, 120).replace(/'/g, "\\'").replace(/\n/g, ' ')
|
||||
)
|
||||
await page.addScriptTag({
|
||||
content: `(function(){
|
||||
var id='qa-subtitle';
|
||||
var el=document.getElementById(id);
|
||||
if(!el){
|
||||
el=document.createElement('div');
|
||||
el.id=id;
|
||||
Object.assign(el.style,{position:'fixed',bottom:'32px',left:'50%',transform:'translateX(-50%)',zIndex:'2147483646',maxWidth:'90%',padding:'6px 14px',borderRadius:'6px',background:'rgba(0,0,0,0.8)',color:'rgba(255,255,255,0.95)',fontSize:'12px',fontFamily:'system-ui,sans-serif',fontWeight:'400',lineHeight:'1.4',pointerEvents:'none',textAlign:'center',transition:'opacity 0.3s',whiteSpace:'normal'});
|
||||
document.body.appendChild(el);
|
||||
}
|
||||
var msg=decodeURIComponent('${encoded}');
|
||||
el.textContent='['+${turn}+'] '+msg;
|
||||
el.style.opacity='1';
|
||||
})()`
|
||||
})
|
||||
}
|
||||
export async function runResearchPhase(
|
||||
opts: ResearchOptions
|
||||
): Promise<ResearchResult> {
|
||||
const { page, issueContext, qaGuide, outputDir, anthropicApiKey } = opts
|
||||
const maxTurns = opts.maxTurns ?? 40
|
||||
const timeBudgetMs = opts.timeBudgetMs ?? 180_000
|
||||
|
||||
// ── Main agent ──
|
||||
|
||||
export async function runHybridAgent(opts: AgentOptions): Promise<{
|
||||
verdict: string
|
||||
summary: string
|
||||
}> {
|
||||
const {
|
||||
page,
|
||||
issueContext,
|
||||
qaGuide,
|
||||
outputDir,
|
||||
geminiApiKey,
|
||||
anthropicApiKey
|
||||
} = opts
|
||||
const maxTurns = opts.maxTurns ?? 30
|
||||
const timeBudgetMs = opts.timeBudgetMs ?? 120_000
|
||||
|
||||
// Start video buffer
|
||||
const videoBuffer = new VideoBuffer(page)
|
||||
videoBuffer.start()
|
||||
|
||||
let lastA11ySnapshot: A11yNode | null = null
|
||||
let agentDone = false
|
||||
let finalVerdict = 'INCONCLUSIVE'
|
||||
let finalVerdict: ResearchResult['verdict'] = 'INCONCLUSIVE'
|
||||
let finalSummary = 'Agent did not complete'
|
||||
let finalEvidence = ''
|
||||
let finalPlan: ReproductionStep[] = []
|
||||
let turnCount = 0
|
||||
const startTime = Date.now()
|
||||
const researchLog: ResearchTurn[] = []
|
||||
|
||||
// Import executeAction from qa-record.ts (shared Playwright helpers)
|
||||
// For now, inline the action execution
|
||||
const { executeAction } = await import('./qa-record.js')
|
||||
|
||||
// Define tools
|
||||
const observeTool = tool(
|
||||
'observe',
|
||||
'Watch the last N seconds of screen recording through Gemini vision. Use this to verify visual state, check if actions had visible effect, or inspect visual bugs. Pass a focused question so Gemini knows what to look for.',
|
||||
{
|
||||
seconds: z
|
||||
.number()
|
||||
.min(3)
|
||||
.max(60)
|
||||
.default(10)
|
||||
.describe('How many seconds to look back'),
|
||||
focus: z
|
||||
.string()
|
||||
.describe(
|
||||
'What to look for — be specific, e.g. "Did the Nodes 2.0 toggle switch to ON?"'
|
||||
)
|
||||
},
|
||||
async (args) => {
|
||||
const description = await geminiObserve(
|
||||
videoBuffer,
|
||||
args.seconds,
|
||||
args.focus,
|
||||
outputDir,
|
||||
geminiApiKey
|
||||
)
|
||||
return { content: [{ type: 'text' as const, text: description }] }
|
||||
}
|
||||
)
|
||||
|
||||
// ── Tool: inspect ──
|
||||
const inspectTool = tool(
|
||||
'inspect',
|
||||
'Search the accessibility tree for a specific UI element. Returns its role, name, value, checked state. Fast and precise — use this to verify element state without vision.',
|
||||
'Search the accessibility tree for a UI element. Returns role, name, value, checked state. This is your SOURCE OF TRUTH — use it after every action to verify state.',
|
||||
{
|
||||
selector: z
|
||||
.string()
|
||||
.describe(
|
||||
'Element name or role to search for, e.g. "Nodes 2.0", "KSampler seed", "Run button"'
|
||||
'Element name or role to search for, e.g. "Settings", "Language", "KSampler seed", "tab"'
|
||||
)
|
||||
},
|
||||
async (args) => {
|
||||
try {
|
||||
const snapshot =
|
||||
(await page.accessibility.snapshot()) as A11yNode | null
|
||||
lastA11ySnapshot = snapshot
|
||||
const found = searchA11y(snapshot, args.selector)
|
||||
if (found) {
|
||||
return {
|
||||
content: [
|
||||
{
|
||||
type: 'text' as const,
|
||||
text: JSON.stringify({
|
||||
role: found.role,
|
||||
name: found.name,
|
||||
value: found.value,
|
||||
checked: found.checked,
|
||||
disabled: found.disabled,
|
||||
hasChildren: Boolean(found.children?.length)
|
||||
})
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
// Return nearby elements if exact match not found
|
||||
const tree = flattenA11y(snapshot, 0).slice(0, 2000)
|
||||
return {
|
||||
content: [
|
||||
{
|
||||
type: 'text' as const,
|
||||
text: `Element "${args.selector}" not found. Available elements:\n${tree}`
|
||||
}
|
||||
]
|
||||
}
|
||||
} catch (e) {
|
||||
return {
|
||||
content: [
|
||||
{
|
||||
type: 'text' as const,
|
||||
text: `inspect failed: ${e instanceof Error ? e.message : e}`
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
const snapshot = (await page.accessibility.snapshot()) as A11yNode | null
|
||||
const found = searchA11y(snapshot, args.selector)
|
||||
|
||||
const resultText = found
|
||||
? JSON.stringify({
|
||||
role: found.role,
|
||||
name: found.name,
|
||||
value: found.value,
|
||||
checked: found.checked,
|
||||
disabled: found.disabled,
|
||||
hasChildren: Boolean(found.children?.length)
|
||||
})
|
||||
: `Element "${args.selector}" not found. Available:\n${flattenA11y(snapshot, 0).slice(0, 2000)}`
|
||||
|
||||
researchLog.push({
|
||||
turn: turnCount,
|
||||
timestampMs: Date.now() - startTime,
|
||||
toolName: 'inspect',
|
||||
toolInput: args,
|
||||
toolResult: resultText.slice(0, 500)
|
||||
})
|
||||
|
||||
return { content: [{ type: 'text' as const, text: resultText }] }
|
||||
}
|
||||
)
|
||||
|
||||
// ── Tool: perform ──
|
||||
const performTool = tool(
|
||||
'perform',
|
||||
`Execute a Playwright action on the ComfyUI page. Available actions:
|
||||
- click(text): click element by visible text
|
||||
- clickCanvas(x, y): click at coordinates
|
||||
- rightClickCanvas(x, y): right-click at coordinates
|
||||
- doubleClick(x, y): double-click at coordinates
|
||||
- dragCanvas(fromX, fromY, toX, toY): drag between points
|
||||
- scrollCanvas(x, y, deltaY): scroll wheel (negative=zoom in)
|
||||
- pressKey(key): press keyboard key (Escape, Enter, Delete, Control+c, etc.)
|
||||
- fillDialog(text): fill input and press Enter
|
||||
- openMenu(): open hamburger menu
|
||||
- hoverMenuItem(label): hover menu item
|
||||
- clickMenuItem(label): click submenu item
|
||||
- setSetting(id, value): change a ComfyUI setting
|
||||
- loadDefaultWorkflow(): load the 7-node default workflow
|
||||
- openSettings(): open Settings dialog
|
||||
- reload(): reload the page
|
||||
- addNode(nodeName, x, y): add a node via search
|
||||
- copyPaste(x, y): Ctrl+C then Ctrl+V at coords
|
||||
- holdKeyAndDrag(key, fromX, fromY, toX, toY): hold key while dragging
|
||||
- screenshot(name): take a named screenshot`,
|
||||
`Execute a Playwright action. Auto-captures a11y state before and after.
|
||||
Available: click(text), clickCanvas(x,y), rightClickCanvas(x,y), doubleClick(x,y),
|
||||
dragCanvas(fromX,fromY,toX,toY), scrollCanvas(x,y,deltaY), pressKey(key),
|
||||
fillDialog(text), openMenu(), hoverMenuItem(label), clickMenuItem(label),
|
||||
setSetting(id,value), loadDefaultWorkflow(), openSettings(), reload(),
|
||||
addNode(nodeName,x,y), copyPaste(x,y), holdKeyAndDrag(key,fromX,fromY,toX,toY),
|
||||
screenshot(name)`,
|
||||
{
|
||||
action: z.string().describe('Action name'),
|
||||
params: z
|
||||
.record(z.unknown())
|
||||
.optional()
|
||||
.describe('Action parameters as key-value pairs')
|
||||
params: z.record(z.unknown()).optional().describe('Action parameters')
|
||||
},
|
||||
async (args) => {
|
||||
turnCount++
|
||||
@@ -389,152 +179,175 @@ export async function runHybridAgent(opts: AgentOptions): Promise<{
|
||||
content: [
|
||||
{
|
||||
type: 'text' as const,
|
||||
text: `Budget exceeded (${turnCount}/${maxTurns} turns, ${Math.round((Date.now() - startTime) / 1000)}s). Use done() now.`
|
||||
text: `Budget exceeded (${turnCount}/${maxTurns} turns, ${Math.round((Date.now() - startTime) / 1000)}s). Call done() NOW with your current findings.`
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
// Build TestAction object from args
|
||||
const actionObj = { action: args.action, ...args.params } as Parameters<
|
||||
typeof executeAction
|
||||
>[1]
|
||||
// Capture a11y BEFORE
|
||||
const a11yBefore = await page.accessibility.snapshot().catch(() => null)
|
||||
|
||||
const actionObj = {
|
||||
action: args.action,
|
||||
...args.params
|
||||
} as Parameters<typeof executeAction>[1]
|
||||
|
||||
let resultText: string
|
||||
try {
|
||||
const result = await executeAction(page, actionObj, outputDir)
|
||||
// Show subtitle
|
||||
await showSubtitle(
|
||||
page,
|
||||
`${args.action}: ${result.success ? 'OK' : result.error}`,
|
||||
turnCount
|
||||
)
|
||||
return {
|
||||
content: [
|
||||
{
|
||||
type: 'text' as const,
|
||||
text: result.success
|
||||
? `Action "${args.action}" succeeded.`
|
||||
: `Action "${args.action}" FAILED: ${result.error}`
|
||||
}
|
||||
]
|
||||
}
|
||||
resultText = result.success
|
||||
? `Action "${args.action}" succeeded.`
|
||||
: `Action "${args.action}" FAILED: ${result.error}`
|
||||
} catch (e) {
|
||||
return {
|
||||
content: [
|
||||
{
|
||||
type: 'text' as const,
|
||||
text: `Action "${args.action}" threw: ${e instanceof Error ? e.message : e}`
|
||||
}
|
||||
]
|
||||
}
|
||||
resultText = `Action "${args.action}" threw: ${e instanceof Error ? e.message : e}`
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
const doneTool = tool(
|
||||
'done',
|
||||
'Signal that reproduction is complete. Call this when you have either confirmed the bug or determined it cannot be reproduced.',
|
||||
{
|
||||
verdict: z
|
||||
.enum(['REPRODUCED', 'NOT_REPRODUCIBLE', 'INCONCLUSIVE'])
|
||||
.describe('Final verdict'),
|
||||
summary: z
|
||||
.string()
|
||||
.describe(
|
||||
'One paragraph: what you did, what you observed, and why you reached this verdict'
|
||||
)
|
||||
},
|
||||
async (args) => {
|
||||
agentDone = true
|
||||
finalVerdict = args.verdict
|
||||
finalSummary = args.summary
|
||||
await showSubtitle(page, `DONE: ${args.verdict}`, turnCount)
|
||||
// Capture a11y AFTER
|
||||
const a11yAfter = await page.accessibility.snapshot().catch(() => null)
|
||||
|
||||
researchLog.push({
|
||||
turn: turnCount,
|
||||
timestampMs: Date.now() - startTime,
|
||||
toolName: 'perform',
|
||||
toolInput: args,
|
||||
toolResult: resultText,
|
||||
a11yBefore,
|
||||
a11yAfter
|
||||
})
|
||||
|
||||
return {
|
||||
content: [
|
||||
{
|
||||
type: 'text' as const,
|
||||
text: `Agent finished: ${args.verdict}`
|
||||
text: `${resultText}\n\n(a11y state captured before/after — use inspect() to verify specific elements)`
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
// Create MCP server with our tools
|
||||
// ── Tool: done ──
|
||||
const doneTool = tool(
|
||||
'done',
|
||||
'Finish the research with an evidence-backed verdict and a reproduction plan.',
|
||||
{
|
||||
verdict: z
|
||||
.enum(['REPRODUCED', 'NOT_REPRODUCIBLE', 'INCONCLUSIVE'])
|
||||
.describe('Final verdict — MUST be supported by inspect() evidence'),
|
||||
summary: z
|
||||
.string()
|
||||
.describe(
|
||||
'What you did, what inspect() showed, and why you reached this verdict'
|
||||
),
|
||||
evidence: z
|
||||
.string()
|
||||
.describe(
|
||||
'Cite specific inspect() results: "inspect(X) returned {Y} proving Z"'
|
||||
),
|
||||
reproductionPlan: z
|
||||
.array(
|
||||
z.object({
|
||||
action: z
|
||||
.record(z.unknown())
|
||||
.describe('Action object with "action" field + params'),
|
||||
expectedAssertion: z
|
||||
.string()
|
||||
.describe(
|
||||
'Expected a11y state after this action, e.g. "Settings dialog: visible" or "tab count: 2"'
|
||||
)
|
||||
})
|
||||
)
|
||||
.describe(
|
||||
'Minimal ordered steps to reproduce the bug. Empty if NOT_REPRODUCIBLE/INCONCLUSIVE.'
|
||||
)
|
||||
},
|
||||
async (args) => {
|
||||
agentDone = true
|
||||
finalVerdict = args.verdict
|
||||
finalSummary = args.summary
|
||||
finalEvidence = args.evidence
|
||||
finalPlan = args.reproductionPlan.map((s) => ({
|
||||
action: s.action as ReproductionStep['action'],
|
||||
expectedAssertion: s.expectedAssertion
|
||||
}))
|
||||
|
||||
return {
|
||||
content: [
|
||||
{
|
||||
type: 'text' as const,
|
||||
text: `Research complete: ${args.verdict}`
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
// ── MCP Server ──
|
||||
const server = createSdkMcpServer({
|
||||
name: 'qa-agent',
|
||||
name: 'qa-research',
|
||||
version: '1.0.0',
|
||||
tools: [observeTool, inspectTool, performTool, doneTool]
|
||||
tools: [inspectTool, performTool, doneTool]
|
||||
})
|
||||
|
||||
// Build system prompt
|
||||
const systemPrompt = `You are a senior QA engineer reproducing a reported bug in ComfyUI, a node-based AI image generation tool.
|
||||
// ── System prompt ──
|
||||
const systemPrompt = `You are a senior QA engineer investigating a reported bug in ComfyUI.
|
||||
|
||||
## Your tools
|
||||
- observe(seconds, focus) — Gemini AI watches the last N seconds of screen recording and answers your focused question. Use for visual verification.
|
||||
- inspect(selector) — Search the accessibility tree for a specific element's state. Use for precise state checks (toggle on/off, value, disabled).
|
||||
- perform(action, params) — Execute a Playwright action on the browser.
|
||||
- done(verdict, summary) — Finish with your conclusion.
|
||||
## Your tools (3 only — no vision)
|
||||
- inspect(selector) — Search accessibility tree for element state. THIS IS YOUR SOURCE OF TRUTH.
|
||||
- perform(action, params) — Execute a Playwright action. Auto-captures a11y before/after.
|
||||
- done(verdict, summary, evidence, reproductionPlan) — Finish with evidence-backed conclusion.
|
||||
|
||||
## Rules (CRITICAL)
|
||||
1. After EVERY perform() call, use inspect() to verify the DOM state changed as expected.
|
||||
2. Your verdict MUST cite specific inspect() results as evidence.
|
||||
3. NEVER claim REPRODUCED unless inspect() confirms the broken state.
|
||||
4. NEVER claim NOT_REPRODUCIBLE unless you actually performed all the reproduction steps and inspect() shows normal behavior.
|
||||
5. If you run out of time before completing steps, verdict is INCONCLUSIVE.
|
||||
6. Complete ALL reproduction steps. Setup (loading workflow, opening settings) is NOT reproduction — the actual bug trigger is.
|
||||
|
||||
## Output
|
||||
When you call done(), include:
|
||||
- verdict: based on what inspect() showed, not what you expected
|
||||
- evidence: "inspect('Settings dialog') returned {role: dialog, name: Settings} — dialog still visible after Escape, proving the bug does NOT exist on this build"
|
||||
- reproductionPlan: minimal steps that demonstrate the bug (for the reproduce phase to replay as a clean video)
|
||||
|
||||
## Strategy
|
||||
1. Start by understanding the issue, then plan your FULL reproduction sequence before acting.
|
||||
2. Use perform() to take actions. After EVERY action, use inspect() to verify the DOM state changed as expected.
|
||||
3. If a setting change doesn't seem to take effect, try reload() then verify again.
|
||||
4. Focus on the specific bug — don't explore randomly.
|
||||
5. Take screenshots at key moments for the video evidence.
|
||||
6. When you've confirmed or ruled out the bug, call done().
|
||||
7. You MUST complete ALL reproduction steps. Do NOT stop after setup — the actual bug trigger is the most important part.
|
||||
1. Read the issue carefully. Plan the FULL reproduction sequence.
|
||||
2. Set up prerequisites (load workflow, open settings, etc.)
|
||||
3. Perform the actual bug trigger (the specific interaction described in the issue)
|
||||
4. Verify the result with inspect() — is the state broken or correct?
|
||||
5. If the bug is triggered by a setting/mode, do control/test comparison:
|
||||
- CONTROL: perform action with setting OFF → inspect() → should work
|
||||
- TEST: perform action with setting ON → inspect() → should break
|
||||
|
||||
## Verification Rules (CRITICAL — prevents false results)
|
||||
- NEVER claim REPRODUCED unless inspect() confirms the expected broken state exists
|
||||
- After EVERY action, call inspect() to verify the DOM state. This is your source of truth.
|
||||
- Your done() verdict MUST be supported by inspect() results, not by what you think happened
|
||||
- If you perform an action but inspect() shows no state change, the action FAILED — try again or adapt
|
||||
- Example: if you press Escape and inspect("Settings dialog") still returns visible → the dialog did NOT close → report that honestly
|
||||
- ALWAYS include inspect() evidence in your done() summary: "inspect('X') returned {state} confirming Y"
|
||||
|
||||
## Control/Test Comparison (IMPORTANT)
|
||||
When a bug is triggered by a specific setting, mode, or configuration:
|
||||
1. **CONTROL phase**: First demonstrate the WORKING state. Disable the trigger (e.g., Nodes 2.0 OFF), perform the action, take a screenshot labeled "control-*", verify it works.
|
||||
2. **TEST phase**: Then enable the trigger (e.g., Nodes 2.0 ON), reload if needed, perform the SAME action, take a screenshot labeled "test-*", verify it's broken.
|
||||
3. In your done() summary, explicitly compare: "With X OFF, behavior was Y. With X ON, behavior was Z."
|
||||
|
||||
This contrast is critical evidence — it proves the bug is caused by the specific setting, not a general issue. Always try to show both states when possible.
|
||||
|
||||
Examples of control/test pairs:
|
||||
- Nodes 2.0 OFF → ON (for node rendering, widget, drag bugs)
|
||||
- Default theme → specific theme (for visual bugs)
|
||||
- Single node → multiple overlapping nodes (for z-index bugs)
|
||||
- Empty workflow → loaded workflow (for state bugs)
|
||||
|
||||
## ComfyUI Layout (1280×720 viewport)
|
||||
- Canvas with node graph centered at ~(640, 400)
|
||||
- Hamburger menu top-left (C logo)
|
||||
## ComfyUI Layout (1280×720)
|
||||
- Canvas centered at ~(640, 400)
|
||||
- Hamburger menu (top-left C logo) → File, Edit, View, Theme, Help
|
||||
- Sidebar: Workflows, Node Library, Models
|
||||
- Default workflow nodes: Load Checkpoint (~150,300), CLIP Text Encode (~450,250/450), Empty Latent (~450,600), KSampler (~750,350), VAE Decode (~1000,350), Save Image (~1200,350)
|
||||
- Default workflow: Load Checkpoint (~150,300), CLIP Text Encode (~450,250/450), KSampler (~750,350)
|
||||
|
||||
${qaGuide ? `## QA Guide\n${qaGuide}\n` : ''}
|
||||
## Issue to Reproduce
|
||||
${issueContext}`
|
||||
|
||||
// Run the agent
|
||||
console.warn('Starting hybrid agent (Claude Sonnet 4.6 + Gemini vision)...')
|
||||
// ── Run the agent ──
|
||||
console.warn('Starting research phase (Claude + a11y)...')
|
||||
|
||||
try {
|
||||
for await (const message of query({
|
||||
prompt:
|
||||
'Reproduce the reported bug. Start by reading the issue context in your system prompt, then use your tools to interact with the ComfyUI browser session.',
|
||||
'Investigate the reported bug. Use inspect() after every action to verify state. When done, call done() with evidence from inspect() results and a reproduction plan.',
|
||||
options: {
|
||||
model: 'claude-sonnet-4-6',
|
||||
systemPrompt,
|
||||
...(anthropicApiKey ? { apiKey: anthropicApiKey } : {}),
|
||||
maxTurns,
|
||||
mcpServers: { 'qa-agent': server },
|
||||
mcpServers: { 'qa-research': server },
|
||||
allowedTools: [
|
||||
'mcp__qa-agent__observe',
|
||||
'mcp__qa-agent__inspect',
|
||||
'mcp__qa-agent__perform',
|
||||
'mcp__qa-agent__done'
|
||||
'mcp__qa-research__inspect',
|
||||
'mcp__qa-research__perform',
|
||||
'mcp__qa-research__done'
|
||||
]
|
||||
}
|
||||
})) {
|
||||
@@ -553,10 +366,26 @@ ${issueContext}`
|
||||
if (agentDone) break
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn(`Agent error: ${e instanceof Error ? e.message : e}`)
|
||||
console.warn(`Research error: ${e instanceof Error ? e.message : e}`)
|
||||
}
|
||||
|
||||
videoBuffer.stop()
|
||||
// Save research log
|
||||
const result: ResearchResult = {
|
||||
verdict: finalVerdict,
|
||||
summary: finalSummary,
|
||||
evidence: finalEvidence,
|
||||
reproductionPlan: finalPlan,
|
||||
log: researchLog
|
||||
}
|
||||
|
||||
return { verdict: finalVerdict, summary: finalSummary }
|
||||
mkdirSync(`${outputDir}/research`, { recursive: true })
|
||||
writeFileSync(
|
||||
`${outputDir}/research/research-log.json`,
|
||||
JSON.stringify(result, null, 2)
|
||||
)
|
||||
console.warn(
|
||||
`Research complete: ${finalVerdict} (${researchLog.length} tool calls, ${finalPlan.length} reproduction steps)`
|
||||
)
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -429,7 +429,7 @@ interface NarrationSegment {
|
||||
|
||||
// Collected during recording, used for TTS post-processing
|
||||
const narrationSegments: NarrationSegment[] = []
|
||||
let recordingStartMs = 0
|
||||
const recordingStartMs = 0
|
||||
|
||||
async function showSubtitle(page: Page, text: string, turn: number) {
|
||||
const safeText = text.slice(0, 120).replace(/'/g, "\\'").replace(/\n/g, ' ')
|
||||
@@ -1934,14 +1934,10 @@ async function main() {
|
||||
await page.screenshot({
|
||||
path: `${opts.outputDir}/debug-after-login-reproduce${sessionLabel}.png`
|
||||
})
|
||||
console.warn('Editor ready — starting agentic loop')
|
||||
recordingStartMs = Date.now()
|
||||
narrationSegments.length = 0
|
||||
|
||||
// Hybrid agent (Claude + Gemini)
|
||||
// Uses ANTHROPIC_API_KEY in CI, or Claude Code OAuth session locally
|
||||
// ═══ Phase 1: RESEARCH (Claude + a11y — no video needed) ═══
|
||||
console.warn('Phase 1: Research — Claude investigates via a11y API')
|
||||
const anthropicKey = process.env.ANTHROPIC_API_KEY
|
||||
const { runHybridAgent } = await import('./qa-agent.js')
|
||||
const { runResearchPhase } = await import('./qa-agent.js')
|
||||
const issueCtx = opts.diffFile
|
||||
? readFileSync(opts.diffFile, 'utf-8').slice(0, 6000)
|
||||
: 'No issue context provided'
|
||||
@@ -1953,17 +1949,39 @@ async function main() {
|
||||
// QA guide not available
|
||||
}
|
||||
}
|
||||
const result = await runHybridAgent({
|
||||
const research = await runResearchPhase({
|
||||
page,
|
||||
issueContext: issueCtx,
|
||||
qaGuide: qaGuideText,
|
||||
outputDir: opts.outputDir,
|
||||
geminiApiKey: opts.apiKey,
|
||||
anthropicApiKey: anthropicKey
|
||||
})
|
||||
console.warn(
|
||||
`Hybrid agent finished: ${result.verdict} — ${result.summary.slice(0, 100)}`
|
||||
`Research complete: ${research.verdict} — ${research.summary.slice(0, 100)}`
|
||||
)
|
||||
console.warn(`Evidence: ${research.evidence.slice(0, 200)}`)
|
||||
console.warn(
|
||||
`Reproduction plan: ${research.reproductionPlan.length} steps`
|
||||
)
|
||||
|
||||
// ═══ Phase 2: REPRODUCE (deterministic replay + narration) ═══
|
||||
if (
|
||||
research.verdict === 'REPRODUCED' &&
|
||||
research.reproductionPlan.length > 0
|
||||
) {
|
||||
console.warn('Phase 2: Reproduce — replaying plan with narration')
|
||||
const { runReproducePhase } = await import('./qa-reproduce.js')
|
||||
await runReproducePhase({
|
||||
page,
|
||||
plan: research.reproductionPlan,
|
||||
geminiApiKey: opts.apiKey,
|
||||
outputDir: opts.outputDir
|
||||
})
|
||||
} else {
|
||||
console.warn(
|
||||
`Skipping Phase 2: verdict=${research.verdict}, plan=${research.reproductionPlan.length} steps`
|
||||
)
|
||||
}
|
||||
await sleep(2000)
|
||||
} finally {
|
||||
await context.close()
|
||||
|
||||
247
scripts/qa-reproduce.ts
Normal file
247
scripts/qa-reproduce.ts
Normal file
@@ -0,0 +1,247 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* QA Reproduce Phase — Deterministic replay of research plan with narration
|
||||
*
|
||||
* Takes a reproduction plan from the research phase and replays it:
|
||||
* 1. Execute each action deterministically (no AI decisions)
|
||||
* 2. Capture a11y snapshot before/after each action
|
||||
* 3. Gemini describes what visually changed (narration for humans)
|
||||
* 4. Output: narration-log.json with full evidence chain
|
||||
*/
|
||||
|
||||
import type { Page } from '@playwright/test'
|
||||
import { GoogleGenerativeAI } from '@google/generative-ai'
|
||||
import { mkdirSync, writeFileSync } from 'fs'
|
||||
|
||||
import type { ActionResult } from './qa-record.js'
|
||||
|
||||
// ── Types ──
|
||||
|
||||
interface ReproductionStep {
|
||||
action: Record<string, unknown> & { action: string }
|
||||
expectedAssertion: string
|
||||
}
|
||||
|
||||
interface NarrationEntry {
|
||||
step: number
|
||||
action: string
|
||||
params: Record<string, unknown>
|
||||
result: ActionResult
|
||||
a11yBefore: unknown
|
||||
a11yAfter: unknown
|
||||
assertionExpected: string
|
||||
assertionPassed: boolean
|
||||
assertionActual: string
|
||||
geminiNarration: string
|
||||
timestampMs: number
|
||||
}
|
||||
|
||||
export interface NarrationLog {
|
||||
entries: NarrationEntry[]
|
||||
allAssertionsPassed: boolean
|
||||
}
|
||||
|
||||
interface ReproduceOptions {
|
||||
page: Page
|
||||
plan: ReproductionStep[]
|
||||
geminiApiKey: string
|
||||
outputDir: string
|
||||
}
|
||||
|
||||
// ── A11y helpers ──
|
||||
|
||||
interface A11yNode {
|
||||
role: string
|
||||
name: string
|
||||
value?: string
|
||||
checked?: boolean
|
||||
disabled?: boolean
|
||||
expanded?: boolean
|
||||
children?: A11yNode[]
|
||||
}
|
||||
|
||||
function searchA11y(node: A11yNode | null, selector: string): A11yNode | null {
|
||||
if (!node) return null
|
||||
const sel = selector.toLowerCase()
|
||||
if (
|
||||
node.name?.toLowerCase().includes(sel) ||
|
||||
node.role?.toLowerCase().includes(sel)
|
||||
) {
|
||||
return node
|
||||
}
|
||||
if (node.children) {
|
||||
for (const child of node.children) {
|
||||
const found = searchA11y(child, selector)
|
||||
if (found) return found
|
||||
}
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
function summarizeA11y(node: A11yNode | null): string {
|
||||
if (!node) return 'null'
|
||||
const parts = [`role=${node.role}`, `name="${node.name}"`]
|
||||
if (node.value !== undefined) parts.push(`value="${node.value}"`)
|
||||
if (node.checked !== undefined) parts.push(`checked=${node.checked}`)
|
||||
if (node.disabled) parts.push('disabled')
|
||||
if (node.expanded !== undefined) parts.push(`expanded=${node.expanded}`)
|
||||
return `{${parts.join(', ')}}`
|
||||
}
|
||||
|
||||
// ── Subtitle overlay ──
|
||||
|
||||
async function showSubtitle(page: Page, text: string, step: number) {
|
||||
const encoded = encodeURIComponent(
|
||||
text.slice(0, 120).replace(/'/g, "\\'").replace(/\n/g, ' ')
|
||||
)
|
||||
await page.addScriptTag({
|
||||
content: `(function(){
|
||||
var id='qa-subtitle';
|
||||
var el=document.getElementById(id);
|
||||
if(!el){
|
||||
el=document.createElement('div');
|
||||
el.id=id;
|
||||
Object.assign(el.style,{position:'fixed',bottom:'32px',left:'50%',transform:'translateX(-50%)',zIndex:'2147483646',maxWidth:'90%',padding:'6px 14px',borderRadius:'6px',background:'rgba(0,0,0,0.8)',color:'rgba(255,255,255,0.95)',fontSize:'12px',fontFamily:'system-ui,sans-serif',fontWeight:'400',lineHeight:'1.4',pointerEvents:'none',textAlign:'center',whiteSpace:'normal'});
|
||||
document.body.appendChild(el);
|
||||
}
|
||||
el.textContent='['+${step}+'] '+decodeURIComponent('${encoded}');
|
||||
})()`
|
||||
})
|
||||
}
|
||||
|
||||
// ── Gemini visual narration ──
|
||||
|
||||
async function geminiDescribe(
|
||||
page: Page,
|
||||
geminiApiKey: string,
|
||||
focus: string
|
||||
): Promise<string> {
|
||||
try {
|
||||
const screenshot = await page.screenshot({ type: 'jpeg', quality: 70 })
|
||||
const genAI = new GoogleGenerativeAI(geminiApiKey)
|
||||
const model = genAI.getGenerativeModel({ model: 'gemini-3-flash-preview' })
|
||||
|
||||
const result = await model.generateContent([
|
||||
{
|
||||
text: `Describe in 1-2 sentences what you see on this ComfyUI screen. Focus on: ${focus}. Be factual — only describe what is visible.`
|
||||
},
|
||||
{
|
||||
inlineData: {
|
||||
mimeType: 'image/jpeg',
|
||||
data: screenshot.toString('base64')
|
||||
}
|
||||
}
|
||||
])
|
||||
return result.response.text().trim()
|
||||
} catch (e) {
|
||||
return `(Gemini narration failed: ${e instanceof Error ? e.message.slice(0, 50) : e})`
|
||||
}
|
||||
}
|
||||
|
||||
// ── Main reproduce function ──
|
||||
|
||||
export async function runReproducePhase(
|
||||
opts: ReproduceOptions
|
||||
): Promise<NarrationLog> {
|
||||
const { page, plan, geminiApiKey, outputDir } = opts
|
||||
const { executeAction } = await import('./qa-record.js')
|
||||
|
||||
const narrationDir = `${outputDir}/narration`
|
||||
mkdirSync(narrationDir, { recursive: true })
|
||||
|
||||
const entries: NarrationEntry[] = []
|
||||
const startMs = Date.now()
|
||||
|
||||
console.warn(`Reproduce phase: replaying ${plan.length} steps...`)
|
||||
|
||||
for (let i = 0; i < plan.length; i++) {
|
||||
const step = plan[i]
|
||||
const actionObj = step.action
|
||||
const elapsed = Date.now() - startMs
|
||||
|
||||
// Show subtitle
|
||||
await showSubtitle(page, `Step ${i + 1}: ${actionObj.action}`, i + 1)
|
||||
console.warn(` [${i + 1}/${plan.length}] ${actionObj.action}`)
|
||||
|
||||
// Capture a11y BEFORE
|
||||
const a11yBefore = await page.accessibility.snapshot().catch(() => null)
|
||||
|
||||
// Execute action
|
||||
const result = await executeAction(
|
||||
page,
|
||||
actionObj as Parameters<typeof executeAction>[1],
|
||||
outputDir
|
||||
)
|
||||
await new Promise((r) => setTimeout(r, 500))
|
||||
|
||||
// Capture a11y AFTER
|
||||
const a11yAfter = await page.accessibility.snapshot().catch(() => null)
|
||||
|
||||
// Check assertion
|
||||
let assertionPassed = false
|
||||
let assertionActual = ''
|
||||
if (step.expectedAssertion) {
|
||||
// Parse the expected assertion — e.g. "Settings dialog: visible" or "tab count: 2"
|
||||
const parts = step.expectedAssertion.split(':').map((s) => s.trim())
|
||||
const selectorName = parts[0]
|
||||
const expectedState = parts.slice(1).join(':').trim()
|
||||
|
||||
const found = searchA11y(a11yAfter as A11yNode | null, selectorName)
|
||||
assertionActual = found ? summarizeA11y(found) : 'NOT FOUND'
|
||||
|
||||
if (expectedState === 'visible' || expectedState === 'exists') {
|
||||
assertionPassed = found !== null
|
||||
} else if (expectedState === 'hidden' || expectedState === 'gone') {
|
||||
assertionPassed = found === null
|
||||
} else {
|
||||
// Generic: check if the actual state contains the expected text
|
||||
assertionPassed = assertionActual
|
||||
.toLowerCase()
|
||||
.includes(expectedState.toLowerCase())
|
||||
}
|
||||
|
||||
console.warn(
|
||||
` Assertion: "${step.expectedAssertion}" → ${assertionPassed ? '✓ PASS' : '✗ FAIL'} (actual: ${assertionActual})`
|
||||
)
|
||||
}
|
||||
|
||||
// Gemini narration (visual description for humans)
|
||||
const geminiNarration = await geminiDescribe(
|
||||
page,
|
||||
geminiApiKey,
|
||||
`What changed after ${actionObj.action}?`
|
||||
)
|
||||
|
||||
entries.push({
|
||||
step: i + 1,
|
||||
action: actionObj.action,
|
||||
params: actionObj,
|
||||
result,
|
||||
a11yBefore,
|
||||
a11yAfter,
|
||||
assertionExpected: step.expectedAssertion,
|
||||
assertionPassed,
|
||||
assertionActual,
|
||||
geminiNarration,
|
||||
timestampMs: elapsed
|
||||
})
|
||||
}
|
||||
|
||||
// Final screenshot
|
||||
await page.screenshot({ path: `${outputDir}/reproduce-final.png` })
|
||||
|
||||
const log: NarrationLog = {
|
||||
entries,
|
||||
allAssertionsPassed: entries.every((e) => e.assertionPassed)
|
||||
}
|
||||
|
||||
writeFileSync(
|
||||
`${narrationDir}/narration-log.json`,
|
||||
JSON.stringify(log, null, 2)
|
||||
)
|
||||
console.warn(
|
||||
`Reproduce phase complete: ${entries.filter((e) => e.assertionPassed).length}/${entries.length} assertions passed`
|
||||
)
|
||||
|
||||
return log
|
||||
}
|
||||
Reference in New Issue
Block a user