feat: three-phase QA pipeline — Research → Reproduce → Report

Phase 1 (qa-agent.ts): Claude investigates via a11y API only.
  - No video, no Gemini vision — only page.accessibility.snapshot()
  - Every action logged with a11y before/after state
  - done() requires evidence citing inspect() results
  - Outputs reproduction plan for Phase 2

Phase 2 (qa-reproduce.ts): Deterministic replay of research plan.
  - Executes each step with a11y assertions
  - Gemini describes visual changes (narration for humans)
  - Clean focused video with subtitles

Phase 3: Report job reads research-log.json for verdict (ground truth),
  narration-log.json for descriptions, video for visuals.
  Gemini formats logs into report — never determines verdict.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
snomiao
2026-03-28 18:17:31 +00:00
parent 6044452b8f
commit 3a27263ca6
3 changed files with 493 additions and 399 deletions

View File

@@ -1,166 +1,57 @@
#!/usr/bin/env tsx
/**
* Hybrid QA Agent — Claude Sonnet 4.6 brain + Gemini 3.1 Pro eyes
* QA Research Phase — Claude Sonnet 4.6 investigates via a11y API
*
* Claude plans and reasons. Gemini watches the video buffer and describes
* what it sees. The agent uses 4 tools:
* - observe(seconds, focus) — Gemini reviews last N seconds of video
* - inspect(selector) — search accessibility tree for element state
* - perform(action, params) — execute Playwright action
* - done(verdict, summary) — finish with result
* Claude explores the UI using accessibility tree assertions as ground truth.
* NO video, NO Gemini vision — only DOM state via page.accessibility.snapshot().
*
* Tools:
* - inspect(selector) — search a11y tree for element state (source of truth)
* - perform(action, params) — execute Playwright action + auto-log a11y before/after
* - done(verdict, summary, reproductionPlan) — finish with evidence-backed conclusion
*/
import type { Page } from '@playwright/test'
import { query, tool, createSdkMcpServer } from '@anthropic-ai/claude-agent-sdk'
import { GoogleGenerativeAI } from '@google/generative-ai'
import { z } from 'zod'
import { execSync } from 'child_process'
import { mkdirSync, writeFileSync, readFileSync } from 'fs'
import { mkdirSync, writeFileSync } from 'fs'
// ── Types ──
interface AgentOptions {
interface ResearchOptions {
page: Page
issueContext: string
qaGuide: string
outputDir: string
geminiApiKey: string
anthropicApiKey?: string // Optional — Agent SDK auto-detects Claude Code session
anthropicApiKey?: string
maxTurns?: number
timeBudgetMs?: number
}
interface ScreenshotFrame {
export interface ResearchTurn {
turn: number
timestampMs: number
base64: string
toolName: string
toolInput: unknown
toolResult: string
a11yBefore?: unknown
a11yAfter?: unknown
}
// ── Video buffer ──
const FRAME_INTERVAL_MS = 2000
const MAX_BUFFER_FRAMES = 30 // 60 seconds at 2fps
class VideoBuffer {
private frames: ScreenshotFrame[] = []
private startMs = Date.now()
private intervalId: ReturnType<typeof setInterval> | null = null
private page: Page
constructor(page: Page) {
this.page = page
}
start() {
this.startMs = Date.now()
this.intervalId = setInterval(async () => {
try {
const buf = await this.page.screenshot({
type: 'jpeg',
quality: 60
})
this.frames.push({
timestampMs: Date.now() - this.startMs,
base64: buf.toString('base64')
})
if (this.frames.length > MAX_BUFFER_FRAMES) {
this.frames.shift()
}
} catch {
// page may be navigating
}
}, FRAME_INTERVAL_MS)
}
stop() {
if (this.intervalId) clearInterval(this.intervalId)
}
getLastFrames(seconds: number): ScreenshotFrame[] {
const cutoffMs = Date.now() - this.startMs - seconds * 1000
return this.frames.filter((f) => f.timestampMs >= cutoffMs)
}
async buildVideoClip(
seconds: number,
outputDir: string
): Promise<Buffer | null> {
const frames = this.getLastFrames(seconds)
if (frames.length < 2) return null
const clipDir = `${outputDir}/.clip-frames`
mkdirSync(clipDir, { recursive: true })
// Write frames as numbered JPEGs
for (let i = 0; i < frames.length; i++) {
writeFileSync(
`${clipDir}/frame-${String(i).padStart(4, '0')}.jpg`,
Buffer.from(frames[i].base64, 'base64')
)
}
// Compose into video with ffmpeg
const clipPath = `${outputDir}/.observe-clip.mp4`
try {
const fps = Math.max(1, Math.round(frames.length / seconds))
execSync(
`ffmpeg -y -framerate ${fps} -i "${clipDir}/frame-%04d.jpg" ` +
`-c:v libx264 -preset ultrafast -pix_fmt yuv420p "${clipPath}" 2>/dev/null`,
{ timeout: 10000 }
)
return readFileSync(clipPath)
} catch {
return null
}
}
export interface ReproductionStep {
action: Record<string, unknown> & { action: string }
expectedAssertion: string
}
// ── Gemini Vision ──
async function geminiObserve(
videoBuffer: VideoBuffer,
seconds: number,
focus: string,
outputDir: string,
geminiApiKey: string
): Promise<string> {
const genAI = new GoogleGenerativeAI(geminiApiKey)
const model = genAI.getGenerativeModel({
model: 'gemini-3-flash-preview'
})
// Try video clip first, fall back to last frame
const clip = await videoBuffer.buildVideoClip(seconds, outputDir)
const parts: Array<
{ text: string } | { inlineData: { mimeType: string; data: string } }
> = [
{
text: `You are observing a ComfyUI frontend session. Focus on: ${focus}\n\nDescribe what happened in the last ${seconds} seconds. Be specific about UI state, actions taken, and results.`
}
]
if (clip) {
parts.push({
inlineData: { mimeType: 'video/mp4', data: clip.toString('base64') }
})
} else {
// Fall back to last frame
const frames = videoBuffer.getLastFrames(seconds)
if (frames.length > 0) {
parts.push({
inlineData: {
mimeType: 'image/jpeg',
data: frames[frames.length - 1].base64
}
})
}
}
const result = await model.generateContent(parts)
return result.response.text().trim()
export interface ResearchResult {
verdict: 'REPRODUCED' | 'NOT_REPRODUCIBLE' | 'INCONCLUSIVE'
summary: string
evidence: string
reproductionPlan: ReproductionStep[]
log: ResearchTurn[]
}
// ── Accessibility tree helpers ──
// ── A11y helpers ──
interface A11yNode {
role: string
@@ -174,16 +65,12 @@ interface A11yNode {
function searchA11y(node: A11yNode | null, selector: string): A11yNode | null {
if (!node) return null
const sel = selector.toLowerCase()
// Match by name or role
if (
node.name?.toLowerCase().includes(sel) ||
node.role?.toLowerCase().includes(sel)
) {
return node
}
// Recurse into children
if (node.children) {
for (const child of node.children) {
const found = searchA11y(child, selector)
@@ -213,174 +100,77 @@ function flattenA11y(node: A11yNode | null, depth = 0): string {
return parts.filter(Boolean).join('\n')
}
// ── Subtitle overlay ──
// ── Main research function ──
async function showSubtitle(page: Page, text: string, turn: number) {
const encoded = encodeURIComponent(
text.slice(0, 120).replace(/'/g, "\\'").replace(/\n/g, ' ')
)
await page.addScriptTag({
content: `(function(){
var id='qa-subtitle';
var el=document.getElementById(id);
if(!el){
el=document.createElement('div');
el.id=id;
Object.assign(el.style,{position:'fixed',bottom:'32px',left:'50%',transform:'translateX(-50%)',zIndex:'2147483646',maxWidth:'90%',padding:'6px 14px',borderRadius:'6px',background:'rgba(0,0,0,0.8)',color:'rgba(255,255,255,0.95)',fontSize:'12px',fontFamily:'system-ui,sans-serif',fontWeight:'400',lineHeight:'1.4',pointerEvents:'none',textAlign:'center',transition:'opacity 0.3s',whiteSpace:'normal'});
document.body.appendChild(el);
}
var msg=decodeURIComponent('${encoded}');
el.textContent='['+${turn}+'] '+msg;
el.style.opacity='1';
})()`
})
}
export async function runResearchPhase(
opts: ResearchOptions
): Promise<ResearchResult> {
const { page, issueContext, qaGuide, outputDir, anthropicApiKey } = opts
const maxTurns = opts.maxTurns ?? 40
const timeBudgetMs = opts.timeBudgetMs ?? 180_000
// ── Main agent ──
export async function runHybridAgent(opts: AgentOptions): Promise<{
verdict: string
summary: string
}> {
const {
page,
issueContext,
qaGuide,
outputDir,
geminiApiKey,
anthropicApiKey
} = opts
const maxTurns = opts.maxTurns ?? 30
const timeBudgetMs = opts.timeBudgetMs ?? 120_000
// Start video buffer
const videoBuffer = new VideoBuffer(page)
videoBuffer.start()
let lastA11ySnapshot: A11yNode | null = null
let agentDone = false
let finalVerdict = 'INCONCLUSIVE'
let finalVerdict: ResearchResult['verdict'] = 'INCONCLUSIVE'
let finalSummary = 'Agent did not complete'
let finalEvidence = ''
let finalPlan: ReproductionStep[] = []
let turnCount = 0
const startTime = Date.now()
const researchLog: ResearchTurn[] = []
// Import executeAction from qa-record.ts (shared Playwright helpers)
// For now, inline the action execution
const { executeAction } = await import('./qa-record.js')
// Define tools
const observeTool = tool(
'observe',
'Watch the last N seconds of screen recording through Gemini vision. Use this to verify visual state, check if actions had visible effect, or inspect visual bugs. Pass a focused question so Gemini knows what to look for.',
{
seconds: z
.number()
.min(3)
.max(60)
.default(10)
.describe('How many seconds to look back'),
focus: z
.string()
.describe(
'What to look for — be specific, e.g. "Did the Nodes 2.0 toggle switch to ON?"'
)
},
async (args) => {
const description = await geminiObserve(
videoBuffer,
args.seconds,
args.focus,
outputDir,
geminiApiKey
)
return { content: [{ type: 'text' as const, text: description }] }
}
)
// ── Tool: inspect ──
const inspectTool = tool(
'inspect',
'Search the accessibility tree for a specific UI element. Returns its role, name, value, checked state. Fast and precise — use this to verify element state without vision.',
'Search the accessibility tree for a UI element. Returns role, name, value, checked state. This is your SOURCE OF TRUTH — use it after every action to verify state.',
{
selector: z
.string()
.describe(
'Element name or role to search for, e.g. "Nodes 2.0", "KSampler seed", "Run button"'
'Element name or role to search for, e.g. "Settings", "Language", "KSampler seed", "tab"'
)
},
async (args) => {
try {
const snapshot =
(await page.accessibility.snapshot()) as A11yNode | null
lastA11ySnapshot = snapshot
const found = searchA11y(snapshot, args.selector)
if (found) {
return {
content: [
{
type: 'text' as const,
text: JSON.stringify({
role: found.role,
name: found.name,
value: found.value,
checked: found.checked,
disabled: found.disabled,
hasChildren: Boolean(found.children?.length)
})
}
]
}
}
// Return nearby elements if exact match not found
const tree = flattenA11y(snapshot, 0).slice(0, 2000)
return {
content: [
{
type: 'text' as const,
text: `Element "${args.selector}" not found. Available elements:\n${tree}`
}
]
}
} catch (e) {
return {
content: [
{
type: 'text' as const,
text: `inspect failed: ${e instanceof Error ? e.message : e}`
}
]
}
}
const snapshot = (await page.accessibility.snapshot()) as A11yNode | null
const found = searchA11y(snapshot, args.selector)
const resultText = found
? JSON.stringify({
role: found.role,
name: found.name,
value: found.value,
checked: found.checked,
disabled: found.disabled,
hasChildren: Boolean(found.children?.length)
})
: `Element "${args.selector}" not found. Available:\n${flattenA11y(snapshot, 0).slice(0, 2000)}`
researchLog.push({
turn: turnCount,
timestampMs: Date.now() - startTime,
toolName: 'inspect',
toolInput: args,
toolResult: resultText.slice(0, 500)
})
return { content: [{ type: 'text' as const, text: resultText }] }
}
)
// ── Tool: perform ──
const performTool = tool(
'perform',
`Execute a Playwright action on the ComfyUI page. Available actions:
- click(text): click element by visible text
- clickCanvas(x, y): click at coordinates
- rightClickCanvas(x, y): right-click at coordinates
- doubleClick(x, y): double-click at coordinates
- dragCanvas(fromX, fromY, toX, toY): drag between points
- scrollCanvas(x, y, deltaY): scroll wheel (negative=zoom in)
- pressKey(key): press keyboard key (Escape, Enter, Delete, Control+c, etc.)
- fillDialog(text): fill input and press Enter
- openMenu(): open hamburger menu
- hoverMenuItem(label): hover menu item
- clickMenuItem(label): click submenu item
- setSetting(id, value): change a ComfyUI setting
- loadDefaultWorkflow(): load the 7-node default workflow
- openSettings(): open Settings dialog
- reload(): reload the page
- addNode(nodeName, x, y): add a node via search
- copyPaste(x, y): Ctrl+C then Ctrl+V at coords
- holdKeyAndDrag(key, fromX, fromY, toX, toY): hold key while dragging
- screenshot(name): take a named screenshot`,
`Execute a Playwright action. Auto-captures a11y state before and after.
Available: click(text), clickCanvas(x,y), rightClickCanvas(x,y), doubleClick(x,y),
dragCanvas(fromX,fromY,toX,toY), scrollCanvas(x,y,deltaY), pressKey(key),
fillDialog(text), openMenu(), hoverMenuItem(label), clickMenuItem(label),
setSetting(id,value), loadDefaultWorkflow(), openSettings(), reload(),
addNode(nodeName,x,y), copyPaste(x,y), holdKeyAndDrag(key,fromX,fromY,toX,toY),
screenshot(name)`,
{
action: z.string().describe('Action name'),
params: z
.record(z.unknown())
.optional()
.describe('Action parameters as key-value pairs')
params: z.record(z.unknown()).optional().describe('Action parameters')
},
async (args) => {
turnCount++
@@ -389,152 +179,175 @@ export async function runHybridAgent(opts: AgentOptions): Promise<{
content: [
{
type: 'text' as const,
text: `Budget exceeded (${turnCount}/${maxTurns} turns, ${Math.round((Date.now() - startTime) / 1000)}s). Use done() now.`
text: `Budget exceeded (${turnCount}/${maxTurns} turns, ${Math.round((Date.now() - startTime) / 1000)}s). Call done() NOW with your current findings.`
}
]
}
}
// Build TestAction object from args
const actionObj = { action: args.action, ...args.params } as Parameters<
typeof executeAction
>[1]
// Capture a11y BEFORE
const a11yBefore = await page.accessibility.snapshot().catch(() => null)
const actionObj = {
action: args.action,
...args.params
} as Parameters<typeof executeAction>[1]
let resultText: string
try {
const result = await executeAction(page, actionObj, outputDir)
// Show subtitle
await showSubtitle(
page,
`${args.action}: ${result.success ? 'OK' : result.error}`,
turnCount
)
return {
content: [
{
type: 'text' as const,
text: result.success
? `Action "${args.action}" succeeded.`
: `Action "${args.action}" FAILED: ${result.error}`
}
]
}
resultText = result.success
? `Action "${args.action}" succeeded.`
: `Action "${args.action}" FAILED: ${result.error}`
} catch (e) {
return {
content: [
{
type: 'text' as const,
text: `Action "${args.action}" threw: ${e instanceof Error ? e.message : e}`
}
]
}
resultText = `Action "${args.action}" threw: ${e instanceof Error ? e.message : e}`
}
}
)
const doneTool = tool(
'done',
'Signal that reproduction is complete. Call this when you have either confirmed the bug or determined it cannot be reproduced.',
{
verdict: z
.enum(['REPRODUCED', 'NOT_REPRODUCIBLE', 'INCONCLUSIVE'])
.describe('Final verdict'),
summary: z
.string()
.describe(
'One paragraph: what you did, what you observed, and why you reached this verdict'
)
},
async (args) => {
agentDone = true
finalVerdict = args.verdict
finalSummary = args.summary
await showSubtitle(page, `DONE: ${args.verdict}`, turnCount)
// Capture a11y AFTER
const a11yAfter = await page.accessibility.snapshot().catch(() => null)
researchLog.push({
turn: turnCount,
timestampMs: Date.now() - startTime,
toolName: 'perform',
toolInput: args,
toolResult: resultText,
a11yBefore,
a11yAfter
})
return {
content: [
{
type: 'text' as const,
text: `Agent finished: ${args.verdict}`
text: `${resultText}\n\n(a11y state captured before/after — use inspect() to verify specific elements)`
}
]
}
}
)
// Create MCP server with our tools
// ── Tool: done ──
const doneTool = tool(
'done',
'Finish the research with an evidence-backed verdict and a reproduction plan.',
{
verdict: z
.enum(['REPRODUCED', 'NOT_REPRODUCIBLE', 'INCONCLUSIVE'])
.describe('Final verdict — MUST be supported by inspect() evidence'),
summary: z
.string()
.describe(
'What you did, what inspect() showed, and why you reached this verdict'
),
evidence: z
.string()
.describe(
'Cite specific inspect() results: "inspect(X) returned {Y} proving Z"'
),
reproductionPlan: z
.array(
z.object({
action: z
.record(z.unknown())
.describe('Action object with "action" field + params'),
expectedAssertion: z
.string()
.describe(
'Expected a11y state after this action, e.g. "Settings dialog: visible" or "tab count: 2"'
)
})
)
.describe(
'Minimal ordered steps to reproduce the bug. Empty if NOT_REPRODUCIBLE/INCONCLUSIVE.'
)
},
async (args) => {
agentDone = true
finalVerdict = args.verdict
finalSummary = args.summary
finalEvidence = args.evidence
finalPlan = args.reproductionPlan.map((s) => ({
action: s.action as ReproductionStep['action'],
expectedAssertion: s.expectedAssertion
}))
return {
content: [
{
type: 'text' as const,
text: `Research complete: ${args.verdict}`
}
]
}
}
)
// ── MCP Server ──
const server = createSdkMcpServer({
name: 'qa-agent',
name: 'qa-research',
version: '1.0.0',
tools: [observeTool, inspectTool, performTool, doneTool]
tools: [inspectTool, performTool, doneTool]
})
// Build system prompt
const systemPrompt = `You are a senior QA engineer reproducing a reported bug in ComfyUI, a node-based AI image generation tool.
// ── System prompt ──
const systemPrompt = `You are a senior QA engineer investigating a reported bug in ComfyUI.
## Your tools
- observe(seconds, focus) — Gemini AI watches the last N seconds of screen recording and answers your focused question. Use for visual verification.
- inspect(selector) — Search the accessibility tree for a specific element's state. Use for precise state checks (toggle on/off, value, disabled).
- perform(action, params) — Execute a Playwright action on the browser.
- done(verdict, summary) — Finish with your conclusion.
## Your tools (3 only — no vision)
- inspect(selector) — Search accessibility tree for element state. THIS IS YOUR SOURCE OF TRUTH.
- perform(action, params) — Execute a Playwright action. Auto-captures a11y before/after.
- done(verdict, summary, evidence, reproductionPlan) — Finish with evidence-backed conclusion.
## Rules (CRITICAL)
1. After EVERY perform() call, use inspect() to verify the DOM state changed as expected.
2. Your verdict MUST cite specific inspect() results as evidence.
3. NEVER claim REPRODUCED unless inspect() confirms the broken state.
4. NEVER claim NOT_REPRODUCIBLE unless you actually performed all the reproduction steps and inspect() shows normal behavior.
5. If you run out of time before completing steps, verdict is INCONCLUSIVE.
6. Complete ALL reproduction steps. Setup (loading workflow, opening settings) is NOT reproduction — the actual bug trigger is.
## Output
When you call done(), include:
- verdict: based on what inspect() showed, not what you expected
- evidence: "inspect('Settings dialog') returned {role: dialog, name: Settings} — dialog still visible after Escape, proving the bug does NOT exist on this build"
- reproductionPlan: minimal steps that demonstrate the bug (for the reproduce phase to replay as a clean video)
## Strategy
1. Start by understanding the issue, then plan your FULL reproduction sequence before acting.
2. Use perform() to take actions. After EVERY action, use inspect() to verify the DOM state changed as expected.
3. If a setting change doesn't seem to take effect, try reload() then verify again.
4. Focus on the specific bug — don't explore randomly.
5. Take screenshots at key moments for the video evidence.
6. When you've confirmed or ruled out the bug, call done().
7. You MUST complete ALL reproduction steps. Do NOT stop after setup — the actual bug trigger is the most important part.
1. Read the issue carefully. Plan the FULL reproduction sequence.
2. Set up prerequisites (load workflow, open settings, etc.)
3. Perform the actual bug trigger (the specific interaction described in the issue)
4. Verify the result with inspect() — is the state broken or correct?
5. If the bug is triggered by a setting/mode, do control/test comparison:
- CONTROL: perform action with setting OFF → inspect() → should work
- TEST: perform action with setting ON → inspect() → should break
## Verification Rules (CRITICAL — prevents false results)
- NEVER claim REPRODUCED unless inspect() confirms the expected broken state exists
- After EVERY action, call inspect() to verify the DOM state. This is your source of truth.
- Your done() verdict MUST be supported by inspect() results, not by what you think happened
- If you perform an action but inspect() shows no state change, the action FAILED — try again or adapt
- Example: if you press Escape and inspect("Settings dialog") still returns visible → the dialog did NOT close → report that honestly
- ALWAYS include inspect() evidence in your done() summary: "inspect('X') returned {state} confirming Y"
## Control/Test Comparison (IMPORTANT)
When a bug is triggered by a specific setting, mode, or configuration:
1. **CONTROL phase**: First demonstrate the WORKING state. Disable the trigger (e.g., Nodes 2.0 OFF), perform the action, take a screenshot labeled "control-*", verify it works.
2. **TEST phase**: Then enable the trigger (e.g., Nodes 2.0 ON), reload if needed, perform the SAME action, take a screenshot labeled "test-*", verify it's broken.
3. In your done() summary, explicitly compare: "With X OFF, behavior was Y. With X ON, behavior was Z."
This contrast is critical evidence — it proves the bug is caused by the specific setting, not a general issue. Always try to show both states when possible.
Examples of control/test pairs:
- Nodes 2.0 OFF → ON (for node rendering, widget, drag bugs)
- Default theme → specific theme (for visual bugs)
- Single node → multiple overlapping nodes (for z-index bugs)
- Empty workflow → loaded workflow (for state bugs)
## ComfyUI Layout (1280×720 viewport)
- Canvas with node graph centered at ~(640, 400)
- Hamburger menu top-left (C logo)
## ComfyUI Layout (1280×720)
- Canvas centered at ~(640, 400)
- Hamburger menu (top-left C logo) → File, Edit, View, Theme, Help
- Sidebar: Workflows, Node Library, Models
- Default workflow nodes: Load Checkpoint (~150,300), CLIP Text Encode (~450,250/450), Empty Latent (~450,600), KSampler (~750,350), VAE Decode (~1000,350), Save Image (~1200,350)
- Default workflow: Load Checkpoint (~150,300), CLIP Text Encode (~450,250/450), KSampler (~750,350)
${qaGuide ? `## QA Guide\n${qaGuide}\n` : ''}
## Issue to Reproduce
${issueContext}`
// Run the agent
console.warn('Starting hybrid agent (Claude Sonnet 4.6 + Gemini vision)...')
// ── Run the agent ──
console.warn('Starting research phase (Claude + a11y)...')
try {
for await (const message of query({
prompt:
'Reproduce the reported bug. Start by reading the issue context in your system prompt, then use your tools to interact with the ComfyUI browser session.',
'Investigate the reported bug. Use inspect() after every action to verify state. When done, call done() with evidence from inspect() results and a reproduction plan.',
options: {
model: 'claude-sonnet-4-6',
systemPrompt,
...(anthropicApiKey ? { apiKey: anthropicApiKey } : {}),
maxTurns,
mcpServers: { 'qa-agent': server },
mcpServers: { 'qa-research': server },
allowedTools: [
'mcp__qa-agent__observe',
'mcp__qa-agent__inspect',
'mcp__qa-agent__perform',
'mcp__qa-agent__done'
'mcp__qa-research__inspect',
'mcp__qa-research__perform',
'mcp__qa-research__done'
]
}
})) {
@@ -553,10 +366,26 @@ ${issueContext}`
if (agentDone) break
}
} catch (e) {
console.warn(`Agent error: ${e instanceof Error ? e.message : e}`)
console.warn(`Research error: ${e instanceof Error ? e.message : e}`)
}
videoBuffer.stop()
// Save research log
const result: ResearchResult = {
verdict: finalVerdict,
summary: finalSummary,
evidence: finalEvidence,
reproductionPlan: finalPlan,
log: researchLog
}
return { verdict: finalVerdict, summary: finalSummary }
mkdirSync(`${outputDir}/research`, { recursive: true })
writeFileSync(
`${outputDir}/research/research-log.json`,
JSON.stringify(result, null, 2)
)
console.warn(
`Research complete: ${finalVerdict} (${researchLog.length} tool calls, ${finalPlan.length} reproduction steps)`
)
return result
}

View File

@@ -429,7 +429,7 @@ interface NarrationSegment {
// Collected during recording, used for TTS post-processing
const narrationSegments: NarrationSegment[] = []
let recordingStartMs = 0
const recordingStartMs = 0
async function showSubtitle(page: Page, text: string, turn: number) {
const safeText = text.slice(0, 120).replace(/'/g, "\\'").replace(/\n/g, ' ')
@@ -1934,14 +1934,10 @@ async function main() {
await page.screenshot({
path: `${opts.outputDir}/debug-after-login-reproduce${sessionLabel}.png`
})
console.warn('Editor ready — starting agentic loop')
recordingStartMs = Date.now()
narrationSegments.length = 0
// Hybrid agent (Claude + Gemini)
// Uses ANTHROPIC_API_KEY in CI, or Claude Code OAuth session locally
// ═══ Phase 1: RESEARCH (Claude + a11y — no video needed) ═══
console.warn('Phase 1: Research — Claude investigates via a11y API')
const anthropicKey = process.env.ANTHROPIC_API_KEY
const { runHybridAgent } = await import('./qa-agent.js')
const { runResearchPhase } = await import('./qa-agent.js')
const issueCtx = opts.diffFile
? readFileSync(opts.diffFile, 'utf-8').slice(0, 6000)
: 'No issue context provided'
@@ -1953,17 +1949,39 @@ async function main() {
// QA guide not available
}
}
const result = await runHybridAgent({
const research = await runResearchPhase({
page,
issueContext: issueCtx,
qaGuide: qaGuideText,
outputDir: opts.outputDir,
geminiApiKey: opts.apiKey,
anthropicApiKey: anthropicKey
})
console.warn(
`Hybrid agent finished: ${result.verdict}${result.summary.slice(0, 100)}`
`Research complete: ${research.verdict}${research.summary.slice(0, 100)}`
)
console.warn(`Evidence: ${research.evidence.slice(0, 200)}`)
console.warn(
`Reproduction plan: ${research.reproductionPlan.length} steps`
)
// ═══ Phase 2: REPRODUCE (deterministic replay + narration) ═══
if (
research.verdict === 'REPRODUCED' &&
research.reproductionPlan.length > 0
) {
console.warn('Phase 2: Reproduce — replaying plan with narration')
const { runReproducePhase } = await import('./qa-reproduce.js')
await runReproducePhase({
page,
plan: research.reproductionPlan,
geminiApiKey: opts.apiKey,
outputDir: opts.outputDir
})
} else {
console.warn(
`Skipping Phase 2: verdict=${research.verdict}, plan=${research.reproductionPlan.length} steps`
)
}
await sleep(2000)
} finally {
await context.close()

247
scripts/qa-reproduce.ts Normal file
View File

@@ -0,0 +1,247 @@
#!/usr/bin/env tsx
/**
* QA Reproduce Phase — Deterministic replay of research plan with narration
*
* Takes a reproduction plan from the research phase and replays it:
* 1. Execute each action deterministically (no AI decisions)
* 2. Capture a11y snapshot before/after each action
* 3. Gemini describes what visually changed (narration for humans)
* 4. Output: narration-log.json with full evidence chain
*/
import type { Page } from '@playwright/test'
import { GoogleGenerativeAI } from '@google/generative-ai'
import { mkdirSync, writeFileSync } from 'fs'
import type { ActionResult } from './qa-record.js'
// ── Types ──
/**
 * One step of the reproduction plan produced by the research phase.
 * NOTE(review): duplicated from qa-agent.ts's exported ReproductionStep —
 * consider importing the type instead of redeclaring it here.
 */
interface ReproductionStep {
  // Action descriptor: "action" names a qa-record helper (e.g. "click"),
  // the remaining keys are that helper's parameters.
  action: Record<string, unknown> & { action: string }
  // Expected a11y state after the action, e.g. "Settings dialog: visible".
  expectedAssertion: string
}

/** Full evidence captured for one replayed step. */
interface NarrationEntry {
  step: number // 1-based step index
  action: string // action name that was executed
  params: Record<string, unknown> // full action object (includes "action")
  result: ActionResult // success/error returned by executeAction
  a11yBefore: unknown // accessibility snapshot before the action (null on failure)
  a11yAfter: unknown // accessibility snapshot after the action (null on failure)
  assertionExpected: string // the plan's expectedAssertion string, verbatim
  assertionPassed: boolean // whether the parsed assertion held on a11yAfter
  assertionActual: string // summarized matched node, or 'NOT FOUND'
  geminiNarration: string // Gemini's 1-2 sentence visual description
  timestampMs: number // ms since replay start, sampled when the step began
}

/** Aggregate log written to <outputDir>/narration/narration-log.json. */
export interface NarrationLog {
  entries: NarrationEntry[]
  // True only when every entry's assertion passed.
  allAssertionsPassed: boolean
}

/** Inputs for runReproducePhase. */
interface ReproduceOptions {
  page: Page // live Playwright page with ComfyUI already loaded
  plan: ReproductionStep[] // ordered steps from the research phase
  geminiApiKey: string // used only for human-facing visual narration
  outputDir: string // narration log and screenshots are written here
}

// ── A11y helpers ──

/** Subset of Playwright's accessibility-snapshot node shape used here. */
interface A11yNode {
  role: string
  name: string
  value?: string
  checked?: boolean
  disabled?: boolean
  expanded?: boolean
  children?: A11yNode[]
}
/**
 * Depth-first search of an accessibility tree for the first node whose
 * name or role contains `selector` (case-insensitive substring match).
 * Returns null for an empty tree or when nothing matches.
 */
function searchA11y(node: A11yNode | null, selector: string): A11yNode | null {
  if (node == null) return null
  const needle = selector.toLowerCase()
  const nameHit = node.name?.toLowerCase().includes(needle) === true
  const roleHit = node.role?.toLowerCase().includes(needle) === true
  if (nameHit || roleHit) return node
  // Recurse into children in order; first match wins.
  for (const child of node.children ?? []) {
    const hit = searchA11y(child, needle)
    if (hit !== null) return hit
  }
  return null
}
/**
 * Render a single a11y node as a compact one-line summary, e.g.
 * {role=checkbox, name="Dark", checked=true}. Optional fields appear only
 * when present; `disabled` is shown as a bare flag when truthy.
 */
function summarizeA11y(node: A11yNode | null): string {
  if (node == null) return 'null'
  const fields: string[] = [`role=${node.role}`, `name="${node.name}"`]
  if (node.value !== undefined) {
    fields.push(`value="${node.value}"`)
  }
  if (node.checked !== undefined) {
    fields.push(`checked=${node.checked}`)
  }
  if (node.disabled) {
    fields.push('disabled')
  }
  if (node.expanded !== undefined) {
    fields.push(`expanded=${node.expanded}`)
  }
  return '{' + fields.join(', ') + '}'
}
// ── Subtitle overlay ──
/**
 * Overlay a one-line subtitle banner at the bottom of the page so the
 * recorded video shows which step is executing. Creates the #qa-subtitle
 * element on first use, then reuses it.
 *
 * BUG FIX: encodeURIComponent does NOT escape `'`, so the old pre-encode
 * `.replace(/'/g, "\\'")` produced `%5C'` — the raw apostrophe survived
 * into the single-quoted literal of the injected script and broke it with
 * a syntax error whenever the text contained an apostrophe. Encoding first
 * and then replacing `'` with `%27` keeps the literal intact, and
 * decodeURIComponent('%27') restores the apostrophe for display.
 */
async function showSubtitle(page: Page, text: string, step: number) {
  const encoded = encodeURIComponent(
    text.slice(0, 120).replace(/\n/g, ' ')
  ).replace(/'/g, '%27')
  await page.addScriptTag({
    content: `(function(){
var id='qa-subtitle';
var el=document.getElementById(id);
if(!el){
el=document.createElement('div');
el.id=id;
Object.assign(el.style,{position:'fixed',bottom:'32px',left:'50%',transform:'translateX(-50%)',zIndex:'2147483646',maxWidth:'90%',padding:'6px 14px',borderRadius:'6px',background:'rgba(0,0,0,0.8)',color:'rgba(255,255,255,0.95)',fontSize:'12px',fontFamily:'system-ui,sans-serif',fontWeight:'400',lineHeight:'1.4',pointerEvents:'none',textAlign:'center',whiteSpace:'normal'});
document.body.appendChild(el);
}
el.textContent='['+${step}+'] '+decodeURIComponent('${encoded}');
})()`
  })
}
// ── Gemini visual narration ──
/**
 * Ask Gemini for a short, factual description of the current page state.
 * Takes a JPEG screenshot and sends it alongside a focused prompt. Any
 * failure (screenshot, SDK, network) is swallowed and reported inline in
 * the returned string so narration never aborts the reproduce run.
 */
async function geminiDescribe(
  page: Page,
  geminiApiKey: string,
  focus: string
): Promise<string> {
  try {
    const shot = await page.screenshot({ type: 'jpeg', quality: 70 })
    const model = new GoogleGenerativeAI(geminiApiKey).getGenerativeModel({
      model: 'gemini-3-flash-preview'
    })
    const prompt = `Describe in 1-2 sentences what you see on this ComfyUI screen. Focus on: ${focus}. Be factual — only describe what is visible.`
    const image = {
      inlineData: {
        mimeType: 'image/jpeg',
        data: shot.toString('base64')
      }
    }
    const response = await model.generateContent([{ text: prompt }, image])
    return response.response.text().trim()
  } catch (err) {
    return `(Gemini narration failed: ${err instanceof Error ? err.message.slice(0, 50) : err})`
  }
}
// ── Main reproduce function ──
/**
 * Replay a research-phase reproduction plan deterministically.
 *
 * For each step: show an on-page subtitle, capture the a11y tree before and
 * after executing the action, evaluate the step's expectedAssertion against
 * the post-action tree, and ask Gemini for a human-readable visual
 * narration. The full evidence chain is written to
 * <outputDir>/narration/narration-log.json. No AI decisions are made here —
 * the plan is executed exactly as given.
 *
 * @returns the NarrationLog that was also written to disk.
 */
export async function runReproducePhase(
  opts: ReproduceOptions
): Promise<NarrationLog> {
  const { page, plan, geminiApiKey, outputDir } = opts
  // Lazy import of the shared Playwright action helpers from qa-record.
  const { executeAction } = await import('./qa-record.js')
  const narrationDir = `${outputDir}/narration`
  mkdirSync(narrationDir, { recursive: true })
  const entries: NarrationEntry[] = []
  const startMs = Date.now()
  console.warn(`Reproduce phase: replaying ${plan.length} steps...`)
  for (let i = 0; i < plan.length; i++) {
    const step = plan[i]
    const actionObj = step.action
    // NOTE(review): elapsed is sampled before the action runs, so
    // timestampMs marks when the step started, not when it finished.
    const elapsed = Date.now() - startMs
    // Show subtitle (on-page banner, visible in the recorded video)
    await showSubtitle(page, `Step ${i + 1}: ${actionObj.action}`, i + 1)
    console.warn(` [${i + 1}/${plan.length}] ${actionObj.action}`)
    // Capture a11y BEFORE (evidence of the pre-action DOM state; null if
    // the snapshot fails, e.g. mid-navigation)
    const a11yBefore = await page.accessibility.snapshot().catch(() => null)
    // Execute action
    const result = await executeAction(
      page,
      actionObj as Parameters<typeof executeAction>[1],
      outputDir
    )
    // Let the UI settle before sampling the post-action state.
    await new Promise((r) => setTimeout(r, 500))
    // Capture a11y AFTER
    const a11yAfter = await page.accessibility.snapshot().catch(() => null)
    // Check assertion
    let assertionPassed = false
    let assertionActual = ''
    if (step.expectedAssertion) {
      // Parse the expected assertion — e.g. "Settings dialog: visible" or "tab count: 2"
      // Everything before the first ":" is the selector; the remainder
      // (rejoined, so values containing ":" survive) is the expected state.
      const parts = step.expectedAssertion.split(':').map((s) => s.trim())
      const selectorName = parts[0]
      const expectedState = parts.slice(1).join(':').trim()
      const found = searchA11y(a11yAfter as A11yNode | null, selectorName)
      assertionActual = found ? summarizeA11y(found) : 'NOT FOUND'
      if (expectedState === 'visible' || expectedState === 'exists') {
        // Presence assertion: pass when the selector matched any node.
        assertionPassed = found !== null
      } else if (expectedState === 'hidden' || expectedState === 'gone') {
        // Absence assertion: pass when the selector matched nothing.
        assertionPassed = found === null
      } else {
        // Generic: check if the actual state contains the expected text
        assertionPassed = assertionActual
          .toLowerCase()
          .includes(expectedState.toLowerCase())
      }
      console.warn(
        ` Assertion: "${step.expectedAssertion}" → ${assertionPassed ? '✓ PASS' : '✗ FAIL'} (actual: ${assertionActual})`
      )
    }
    // Gemini narration (visual description for humans; failures are turned
    // into inline text by geminiDescribe and never abort the replay)
    const geminiNarration = await geminiDescribe(
      page,
      geminiApiKey,
      `What changed after ${actionObj.action}?`
    )
    entries.push({
      step: i + 1,
      action: actionObj.action,
      params: actionObj,
      result,
      a11yBefore,
      a11yAfter,
      assertionExpected: step.expectedAssertion,
      assertionPassed,
      assertionActual,
      geminiNarration,
      timestampMs: elapsed
    })
  }
  // Final screenshot
  await page.screenshot({ path: `${outputDir}/reproduce-final.png` })
  // NOTE(review): a step with an empty expectedAssertion keeps
  // assertionPassed=false, which forces allAssertionsPassed to false —
  // confirm that is the intended semantics for assertion-less steps.
  const log: NarrationLog = {
    entries,
    allAssertionsPassed: entries.every((e) => e.assertionPassed)
  }
  writeFileSync(
    `${narrationDir}/narration-log.json`,
    JSON.stringify(log, null, 2)
  )
  console.warn(
    `Reproduce phase complete: ${entries.filter((e) => e.assertionPassed).length}/${entries.length} assertions passed`
  )
  return log
}