feat: use demowright for reproduce videos, combine E2E+video verdicts

- Replace hand-rolled cursor overlay and waitForTimeout injection with demowright's automatic HUD (cursor, keystroke display, 300ms delay) - Install demowright in both qa-before and qa-after CI jobs - Always check video review verdicts (not just as fallback) - Upgrade reproducedBy to 'both' when E2E and video agree - Prevent false REPRODUCED from discovery/debug tests - Ban discovery tests in agent system prompt Amp-Thread-ID: https://ampcode.com/threads/T-019d519b-004f-71ce-b970-96edd971fbe0 Co-authored-by: Amp <amp@ampcode.com>
2026-04-20 14:30:41 +00:00 · 2026-04-07 07:56:56 +00:00
parent e78ec93200
commit 08a5338941
6 changed files with 69 additions and 53 deletions
--- a/.github/workflows/pr-qa.yaml
+++ b/.github/workflows/pr-qa.yaml
@@ -10,6 +10,8 @@
 name: 'PR: QA'

 on:
+  push:
+    branches: [sno-qa-*]
  pull_request:
    types: [labeled]
    branches: [main]
@@ -90,6 +92,9 @@ jobs:
            NUM=$(gh pr list --repo "$REPO" \
              --head "$BRANCH" --state open \
              --json number --jq '.[0].number // empty')
+            if [ -z "$NUM" ]; then
+              NUM=$(echo "$BRANCH" | sed -n 's/^sno-qa-\([0-9]\+\)$/\1/p')
+            fi
          fi
          echo "number=${NUM}" >> "$GITHUB_OUTPUT"

@@ -198,7 +203,7 @@ jobs:
          include_build_step: false

      - name: Install QA dependencies
-        run: pnpm add -D @google/generative-ai@^0.24.1 @anthropic-ai/claude-agent-sdk@^0.2.85
+        run: pnpm add -D @google/generative-ai@^0.24.1 @anthropic-ai/claude-agent-sdk@^0.2.85 demowright@^0.1.0

      - name: Cache main branch dist
        id: cache-main-dist
@@ -374,7 +379,7 @@ jobs:
          include_build_step: true

      - name: Install QA dependencies
-        run: pnpm add -D @google/generative-ai@^0.24.1
+        run: pnpm add -D @google/generative-ai@^0.24.1 demowright@^0.1.0

      - name: Setup ComfyUI server (no launch)
        uses: ./.github/actions/setup-comfyui-server
@@ -473,7 +478,8 @@ jobs:
    needs: [resolve-matrix, analyze-pr, qa-before, qa-after]
    if: >-
      always() &&
-      (needs.qa-after.result == 'success' || needs.qa-before.result == 'success')
+      (needs.qa-after.result == 'success' || needs.qa-before.result == 'success') &&
+      (github.event.pull_request.number || github.event.issue.number || github.event_name == 'push' || github.event_name == 'workflow_dispatch')
    runs-on: ubuntu-latest
    permissions:
      contents: write
@@ -650,6 +656,7 @@ jobs:
          REPO: ${{ github.repository }}
        run: |
          NUM="${PR_NUM:-$(gh pr list --repo "$REPO" --head "$BRANCH" --state open --json number --jq '.[0].number // empty')}"
+          [ -z "$NUM" ] && NUM=$(echo "$BRANCH" | sed -n 's/^sno-qa-\([0-9]\+\)$/\1/p')
          echo "number=${NUM}" >> "$GITHUB_OUTPUT"

          if [ -n "$NUM" ]; then
--- a/docs/qa/TROUBLESHOOTING.md
+++ b/docs/qa/TROUBLESHOOTING.md
@@ -91,3 +91,9 @@
 2. Agent ran out of turn budget (30 turns / 120s)
 3. Gemini Flash (old agent) ignores prompt hints
   **Fix**: Use hybrid agent (Claude Sonnet 4.6 + Gemini vision). Claude's superior reasoning follows instructions precisely.
+
+## False REPRODUCED from Discovery Tests
+
+**Symptom**: Badge says REPRODUCED but the test was a debug/discovery test (e.g., "Find String Multiline node type") with trivial assertions like `expect(x).toBeDefined()`.
+**Cause**: The auto-complete logic saved any passing test as proof of reproduction, including discovery tests that pass trivially.
+**Fix**: Auto-complete now validates test code for bug-specific assertions — discovery tests (with names containing "Inspect", "Find", "Debug") and trivial assertions (`toBeDefined()`, `toBeGreaterThan(0)`) are excluded from auto-save.
--- a/knip.config.ts
+++ b/knip.config.ts
@@ -51,7 +51,8 @@ const config: KnipConfig = {
    '@primevue/icons',
    // QA pipeline deps — installed inline in CI workflow
    '@anthropic-ai/claude-agent-sdk',
-    '@google/generative-ai'
+    '@google/generative-ai',
+    'demowright'
  ],
  ignore: [
    // Auto generated API types
--- a/scripts/qa-agent.ts
+++ b/scripts/qa-agent.ts
@@ -291,11 +291,22 @@ export async function runResearchPhase(
        toolResult: resultText.slice(0, 1000)
      })

-      // Auto-save passing test code for fallback completion
+      // Auto-save passing test code for fallback completion — but only if
+      // the test contains a bug-specific assertion (not just a discovery/debug test)
      if (resultText.startsWith('TEST PASSED')) {
        try {
-          finalTestCode = readFileSync(browserTestPath, 'utf-8')
-          lastPassedTurn = turnCount
+          const code = readFileSync(browserTestPath, 'utf-8')
+          const hasBugAssertion =
+            /expect\s*\(/.test(code) &&
+            !/^\s*expect\([^)]+\)\.toBeDefined\(\)/m.test(code) &&
+            !/^\s*expect\([^)]+\)\.toBeGreaterThan\(0\)/m.test(code) &&
+            !/Inspect|Find|Debug|discover/i.test(
+              code.match(/test\(['"`]([^'"`]+)/)?.[1] ?? ''
+            )
+          if (hasBugAssertion) {
+            finalTestCode = code
+            lastPassedTurn = turnCount
+          }
        } catch {
          // ignore
        }
@@ -392,6 +403,7 @@ export async function runResearchPhase(
 - ALWAYS call done() when finished, even if the test passed — do not keep iterating after a passing test
 - Use \`expect.poll()\` for async assertions: \`await expect.poll(() => comfyPage.nodeOps.getGraphNodesCount()).toBe(8)\`
 - CRITICAL: Your assertions must be SPECIFIC TO THE BUG. A test that asserts \`expect(count).toBeGreaterThan(0)\` proves nothing — it would pass even without the bug. Instead assert the exact broken state, e.g. \`expect(clonedWidgets).toHaveLength(0)\` (missing widgets) or \`expect(zIndex).toBeLessThan(parentZIndex)\` (wrong z-order). If a test passes trivially, it's a false positive.
+- NEVER write "debug", "discovery", or "inspect node types" tests. These waste turns and produce false REPRODUCED verdicts. If you need to discover node type names, use inspect() or readFixture() — not a passing test.
 - If you cannot write a bug-specific assertion, call done() with verdict NOT_REPRODUCIBLE and explain why.

 ## ComfyPage Fixture API Reference
--- a/scripts/qa-deploy-pages.sh
+++ b/scripts/qa-deploy-pages.sh
@@ -253,22 +253,27 @@ if [ -f "$DEPLOY_DIR/research-log.json" ]; then
  fi
 fi

-# Fall back to video review verdicts if no research log
-if [ -z "$RESEARCH_VERDICT" ] && [ -d video-reviews ]; then
+# Check video review verdicts (always, not just as fallback)
+VIDEO_REPRODUCED=false
+if [ -d video-reviews ]; then
  for rpt in video-reviews/*-qa-video-report.md; do
    [ -f "$rpt" ] || continue
-    TOTAL_REPORTS=$((TOTAL_REPORTS + 1))
-    # Try structured JSON verdict first (from ## Verdict section)
    VERDICT_JSON=$(grep -oP '"verdict":\s*"[A-Z_]+' "$rpt" 2>/dev/null | tail -1 | grep -oP '[A-Z_]+$' || true)
-    RISK_JSON=$(grep -oP '"risk":\s*"[a-z]+' "$rpt" 2>/dev/null | tail -1 | grep -oP '[a-z]+$' || true)

    if [ -n "$VERDICT_JSON" ]; then
-      case "$VERDICT_JSON" in
-        REPRODUCED) REPRO_COUNT=$((REPRO_COUNT + 1)) ;;
-        NOT_REPRODUCIBLE) NOT_REPRO_COUNT=$((NOT_REPRO_COUNT + 1)) ;;
-        INCONCLUSIVE) INCONC_COUNT=$((INCONC_COUNT + 1)) ;;
-      esac
-    else
+      echo "Video review verdict: $VERDICT_JSON ($(basename "$rpt"))"
+      [ "$VERDICT_JSON" = "REPRODUCED" ] && VIDEO_REPRODUCED=true
+      # Only count video as separate report if no research log
+      if [ -z "$RESEARCH_VERDICT" ]; then
+        TOTAL_REPORTS=$((TOTAL_REPORTS + 1))
+        case "$VERDICT_JSON" in
+          REPRODUCED) REPRO_COUNT=$((REPRO_COUNT + 1)) ;;
+          NOT_REPRODUCIBLE) NOT_REPRO_COUNT=$((NOT_REPRO_COUNT + 1)) ;;
+          INCONCLUSIVE) INCONC_COUNT=$((INCONC_COUNT + 1)) ;;
+        esac
+      fi
+    elif [ -z "$RESEARCH_VERDICT" ]; then
+      TOTAL_REPORTS=$((TOTAL_REPORTS + 1))
      # Fallback: grep Summary section (for older reports without ## Verdict)
      SUMM=$(sed -n '/^## Summary/,/^## /p' "$rpt" 2>/dev/null | head -15)
      if echo "$SUMM" | grep -iq 'INCONCLUSIVE'; then
@@ -277,10 +282,19 @@ if [ -z "$RESEARCH_VERDICT" ] && [ -d video-reviews ]; then
        NOT_REPRO_COUNT=$((NOT_REPRO_COUNT + 1))
      elif echo "$SUMM" | grep -iq 'reproduc\|confirm'; then
        REPRO_COUNT=$((REPRO_COUNT + 1))
+        VIDEO_REPRODUCED=true
      fi
    fi
  done
 fi
+
+# Upgrade reproduction method to "both" when E2E and video agree
+if [ "$REPRO_METHOD" = "e2e_test" ] && [ "$VIDEO_REPRODUCED" = "true" ]; then
+  REPRO_METHOD="both"
+  echo "Upgraded reproducedBy to 'both' (E2E + video review agree)"
+elif [ -z "$RESEARCH_VERDICT" ] && [ "$VIDEO_REPRODUCED" = "true" ]; then
+  REPRO_METHOD="video"
+fi
 FAIL_COUNT=$((TOTAL_REPORTS - REPRO_COUNT - NOT_REPRO_COUNT))
 [ "$FAIL_COUNT" -lt 0 ] && FAIL_COUNT=0
 echo "DEBUG verdict: repro=${REPRO_COUNT} not_repro=${NOT_REPRO_COUNT} inconc=${INCONC_COUNT} fail=${FAIL_COUNT} total=${TOTAL_REPORTS}"
--- a/scripts/qa-record.ts
+++ b/scripts/qa-record.ts
@@ -1965,52 +1965,28 @@ async function main() {
        )
        console.warn(`Evidence: ${research.evidence.slice(0, 200)}`)

-        // ═══ Phase 2: Run passing test with video recording ═══
+        // ═══ Phase 2: Run passing test with demowright video recording ═══
        if (research.verdict === 'REPRODUCED' && research.testCode) {
-          console.warn('Phase 2: Recording test execution with video')
+          console.warn('Phase 2: Recording test execution with demowright')
          const projectRoot = process.cwd()
          const browserTestFile = `${projectRoot}/browser_tests/tests/qa-reproduce.spec.ts`
          const testResultsDir = `${opts.outputDir}/test-results`
-          // Inject cursor overlay into the test — add page.addInitScript in beforeEach
-          const cursorScript = `await comfyPage.page.addInitScript(() => {
-  var c=document.createElement('div');c.id='qa-cursor';
-  c.innerHTML='<svg width="20" height="20" viewBox="0 0 24 24" fill="white" stroke="black" stroke-width="1.5"><path d="M4 2l14 10-6.5 1.5L15 21l-3.5-1.5L8 21l-1.5-7.5L2 16z"/></svg>';
-  Object.assign(c.style,{position:'fixed',zIndex:'2147483647',pointerEvents:'none',width:'20px',height:'20px',margin:'-2px 0 0 -2px',opacity:'0.95'});
-  if(document.body)document.body.appendChild(c);
-  else document.addEventListener('DOMContentLoaded',function(){document.body.appendChild(c)});
-  document.addEventListener('mousemove',function(e){c.style.left=e.clientX+'px';c.style.top=e.clientY+'px'});
-});`
-          // Insert cursor injection after the first line of the test body (after async ({ comfyPage }) => {)
-          let testCode = research.testCode
-          const testBodyMatch = testCode.match(
-            /async\s*\(\{\s*comfyPage\s*\}\)\s*=>\s*\{/
-          )
-          if (testBodyMatch && testBodyMatch.index !== undefined) {
-            const insertPos = testBodyMatch.index + testBodyMatch[0].length
-            testCode =
-              testCode.slice(0, insertPos) +
-              '\n    ' +
-              cursorScript +
-              '\n' +
-              testCode.slice(insertPos)
-          }
-          // Inject 800ms pauses between actions for human-readable video
-          // Uses comfyPage.page since test code uses comfyPageFixture
-          testCode = testCode.replace(
-            /(\n\s*)(await\s+(?:comfyPage|topbar|firstNode|page|canvas|expect))/g,
-            '$1await comfyPage.page.waitForTimeout(800);\n$1$2'
-          )
-          writeFileSync(browserTestFile, testCode)
+          // Write the test as-is — demowright handles cursor, keyboard, slowdown
+          writeFileSync(browserTestFile, research.testCode)
          try {
            const output = execSync(
-              `cd "${projectRoot}" && npx playwright test browser_tests/tests/qa-reproduce.spec.ts --reporter=list --timeout=30000 --retries=0 --workers=1 --output="${testResultsDir}" 2>&1`,
+              `cd "${projectRoot}" && npx playwright test browser_tests/tests/qa-reproduce.spec.ts --reporter=list --timeout=60000 --retries=0 --workers=1 --output="${testResultsDir}" 2>&1`,
              {
-                timeout: 90000,
+                timeout: 120000,
                encoding: 'utf-8',
                env: {
                  ...process.env,
                  COMFYUI_BASE_URL: opts.serverUrl,
-                  PLAYWRIGHT_LOCAL: '1' // Enables video=on + trace=on in playwright.config.ts
+                  PLAYWRIGHT_LOCAL: '1',
+                  NODE_OPTIONS: '--require demowright/register',
+                  QA_HUD_DELAY: '300',
+                  QA_HUD_CURSOR_STYLE: 'default',
+                  QA_HUD_KEY_FADE: '2000'
                }
              }
            )