Compare commits
29 Commits
dev/remote
...
worktree-m
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d92acd81b6 | ||
|
|
bbd0a6b201 | ||
|
|
6c1bf7a3cf | ||
|
|
b61e15293c | ||
|
|
899660b135 | ||
|
|
aeafff1ead | ||
|
|
f4fb7a458e | ||
|
|
71a3bd92b4 | ||
|
|
17d2870ef4 | ||
|
|
7a68943839 | ||
|
|
8912f4159a | ||
|
|
794b986954 | ||
|
|
a7b3515692 | ||
|
|
26f3f11a3e | ||
|
|
d9466947b2 | ||
|
|
bb96e3c95c | ||
|
|
df42b7a2a8 | ||
|
|
4f3a5ae184 | ||
|
|
c77c8a9476 | ||
|
|
380fae9a0d | ||
|
|
515f234143 | ||
|
|
61049425a3 | ||
|
|
661e3d7949 | ||
|
|
1624750a02 | ||
|
|
4cbf4994e9 | ||
|
|
86a3938d11 | ||
|
|
e11a1776ed | ||
|
|
161522b138 | ||
|
|
61144ea1d5 |
182
.github/workflows/hub-ci.yaml
vendored
Normal file
@@ -0,0 +1,182 @@
|
||||
name: Hub CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'apps/hub/**'
|
||||
- '.github/workflows/hub-ci.yaml'
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'apps/hub/**'
|
||||
- '.github/workflows/hub-ci.yaml'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
name: Lint & Check
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
working-directory: apps/hub
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup frontend
|
||||
uses: ./.github/actions/setup-frontend
|
||||
|
||||
- name: Astro Check
|
||||
run: pnpm run check
|
||||
|
||||
- name: Unit Tests
|
||||
run: pnpm test
|
||||
|
||||
- name: Validate Templates
|
||||
run: pnpm run validate:templates
|
||||
continue-on-error: true
|
||||
|
||||
build:
|
||||
name: Build Hub
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
working-directory: apps/hub
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup frontend
|
||||
uses: ./.github/actions/setup-frontend
|
||||
|
||||
- name: Build site
|
||||
run: pnpm run build
|
||||
env:
|
||||
HUB_SKIP_SYNC: 'true'
|
||||
SKIP_AI_GENERATION: 'true'
|
||||
|
||||
- name: Upload build artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: hub-build
|
||||
path: apps/hub/dist
|
||||
retention-days: 1
|
||||
|
||||
seo-audit:
|
||||
name: SEO Audit
|
||||
needs: build
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
working-directory: apps/hub
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup frontend
|
||||
uses: ./.github/actions/setup-frontend
|
||||
|
||||
- name: Download build artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: hub-build
|
||||
path: apps/hub/dist
|
||||
|
||||
- name: Validate sitemap
|
||||
id: sitemap
|
||||
continue-on-error: true
|
||||
run: |
|
||||
echo "## Sitemap Validation" >> $GITHUB_STEP_SUMMARY
|
||||
if pnpm run validate:sitemap 2>&1 | tee sitemap-output.txt; then
|
||||
echo "✅ Sitemap validation passed" >> $GITHUB_STEP_SUMMARY
|
||||
echo "status=passed" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "❌ Sitemap validation failed" >> $GITHUB_STEP_SUMMARY
|
||||
echo "status=failed" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Run SEO audit
|
||||
id: seo
|
||||
continue-on-error: true
|
||||
run: |
|
||||
echo "## SEO Audit" >> $GITHUB_STEP_SUMMARY
|
||||
if pnpm run audit:seo 2>&1 | tee seo-output.txt; then
|
||||
echo "✅ SEO audit passed" >> $GITHUB_STEP_SUMMARY
|
||||
echo "status=passed" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "⚠️ SEO audit found issues" >> $GITHUB_STEP_SUMMARY
|
||||
echo "status=issues" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Check internal links
|
||||
id: links
|
||||
continue-on-error: true
|
||||
run: |
|
||||
echo "## Link Check" >> $GITHUB_STEP_SUMMARY
|
||||
DIST_DIR="dist"
|
||||
if [ ! -d "$DIST_DIR" ]; then
|
||||
echo "⚠️ No build output found at $DIST_DIR" >> $GITHUB_STEP_SUMMARY
|
||||
echo "status=skipped" >> $GITHUB_OUTPUT
|
||||
exit 0
|
||||
fi
|
||||
|
||||
BROKEN_FILE="broken-links.txt"
|
||||
: > "$BROKEN_FILE"
|
||||
BROKEN_COUNT=0
|
||||
TOTAL_COUNT=0
|
||||
|
||||
for htmlfile in $(find "$DIST_DIR" -name '*.html' \
|
||||
-not -path "$DIST_DIR/ar/*" -not -path "$DIST_DIR/es/*" -not -path "$DIST_DIR/fr/*" \
|
||||
-not -path "$DIST_DIR/ja/*" -not -path "$DIST_DIR/ko/*" -not -path "$DIST_DIR/pt-BR/*" \
|
||||
-not -path "$DIST_DIR/ru/*" -not -path "$DIST_DIR/tr/*" -not -path "$DIST_DIR/zh/*" \
|
||||
-not -path "$DIST_DIR/zh-TW/*" | head -500); do
|
||||
hrefs=$(grep -oP 'href="(/[^"]*)"' "$htmlfile" | sed 's/href="//;s/"$//' || true)
|
||||
for href in $hrefs; do
|
||||
TOTAL_COUNT=$((TOTAL_COUNT + 1))
|
||||
clean="${href%%#*}"
|
||||
clean="${clean%%\?*}"
|
||||
if [ -z "$clean" ] || [ "$clean" = "/" ]; then continue; fi
|
||||
found=false
|
||||
if [[ "$clean" =~ \.[a-zA-Z0-9]+$ ]]; then
|
||||
[ -f "${DIST_DIR}${clean}" ] && found=true
|
||||
else
|
||||
base="${clean%/}"
|
||||
[ -f "${DIST_DIR}${base}/index.html" ] && found=true
|
||||
[ "$found" = false ] && [ -f "${DIST_DIR}${base}.html" ] && found=true
|
||||
[ "$found" = false ] && [ -f "${DIST_DIR}${clean}" ] && found=true
|
||||
[ "$found" = false ] && [ -d "${DIST_DIR}${base}" ] && found=true
|
||||
fi
|
||||
if [ "$found" = false ]; then
|
||||
BROKEN_COUNT=$((BROKEN_COUNT + 1))
|
||||
echo "- \`${href}\` (in ${htmlfile#${DIST_DIR}/})" >> "$BROKEN_FILE"
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
if [ "$BROKEN_COUNT" -eq 0 ]; then
|
||||
echo "✅ All internal links valid ($TOTAL_COUNT checked)" >> $GITHUB_STEP_SUMMARY
|
||||
echo "status=passed" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "❌ Found $BROKEN_COUNT broken internal links out of $TOTAL_COUNT" >> $GITHUB_STEP_SUMMARY
|
||||
head -n 50 "$BROKEN_FILE" >> $GITHUB_STEP_SUMMARY
|
||||
echo "status=failed" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Upload SEO reports
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: hub-seo-reports
|
||||
path: |
|
||||
apps/hub/seo-output.txt
|
||||
apps/hub/seo-summary.json
|
||||
apps/hub/broken-links.txt
|
||||
if-no-files-found: ignore
|
||||
68
.github/workflows/hub-cron-rebuild.yaml
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
name: Hub Cron Rebuild
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Every 15 minutes — rebuilds the site to pick up new UGC workflows
|
||||
# for search index, sitemap, filter pages, and pre-rendered detail pages.
|
||||
- cron: '*/15 * * * *'
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: hub-deploy-prod
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
rebuild:
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
SKIP_AI_GENERATION: 'true'
|
||||
PUBLIC_POSTHOG_KEY: ${{ secrets.HUB_POSTHOG_KEY }}
|
||||
PUBLIC_GA_MEASUREMENT_ID: ${{ secrets.HUB_GA_MEASUREMENT_ID }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup frontend
|
||||
uses: ./.github/actions/setup-frontend
|
||||
|
||||
- name: Checkout templates data
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
repository: Comfy-Org/workflow_templates
|
||||
path: _workflow_templates
|
||||
sparse-checkout: templates
|
||||
token: ${{ secrets.GH_TOKEN }}
|
||||
|
||||
- name: Restore content cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: apps/hub/.content-cache
|
||||
key: hub-content-cache-cron-prod-${{ hashFiles('_workflow_templates/templates/**', 'apps/hub/src/**') }}
|
||||
restore-keys: |
|
||||
hub-content-cache-cron-prod-
|
||||
|
||||
- name: Sync templates
|
||||
run: pnpm run sync
|
||||
working-directory: apps/hub
|
||||
env:
|
||||
HUB_TEMPLATES_DIR: ${{ github.workspace }}/_workflow_templates/templates
|
||||
|
||||
- name: Build Astro site
|
||||
run: pnpm run build
|
||||
working-directory: apps/hub
|
||||
env:
|
||||
PUBLIC_HUB_API_URL: ${{ secrets.HUB_API_URL_PRODUCTION }}
|
||||
PUBLIC_COMFY_CLOUD_URL: ${{ secrets.COMFY_CLOUD_URL_PRODUCTION }}
|
||||
PUBLIC_APPROVED_ONLY: 'true'
|
||||
|
||||
- name: Deploy to Vercel
|
||||
uses: amondnet/vercel-action@v25
|
||||
with:
|
||||
vercel-token: ${{ secrets.VERCEL_TOKEN }}
|
||||
vercel-org-id: ${{ secrets.VERCEL_ORG_ID }}
|
||||
vercel-project-id: ${{ secrets.HUB_VERCEL_PROJECT_ID }}
|
||||
working-directory: apps/hub
|
||||
vercel-args: '--prebuilt --prod'
|
||||
80
.github/workflows/hub-deploy.yaml
vendored
Normal file
@@ -0,0 +1,80 @@
|
||||
name: Deploy Hub
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
skip_ai:
|
||||
description: 'Skip AI content generation'
|
||||
type: boolean
|
||||
default: false
|
||||
force_regenerate:
|
||||
description: 'Force regenerate all content (ignore cache)'
|
||||
type: boolean
|
||||
default: false
|
||||
template_filter:
|
||||
description: 'Regenerate specific template only (e.g. "flux_schnell")'
|
||||
type: string
|
||||
default: ''
|
||||
|
||||
concurrency:
|
||||
group: hub-deploy-prod
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
PUBLIC_POSTHOG_KEY: ${{ secrets.HUB_POSTHOG_KEY }}
|
||||
PUBLIC_GA_MEASUREMENT_ID: ${{ secrets.HUB_GA_MEASUREMENT_ID }}
|
||||
SKIP_AI_GENERATION: ${{ inputs.skip_ai && 'true' || '' }}
|
||||
FORCE_AI_REGENERATE: ${{ inputs.force_regenerate && 'true' || '' }}
|
||||
AI_TEMPLATE_FILTER: ${{ inputs.template_filter }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup frontend
|
||||
uses: ./.github/actions/setup-frontend
|
||||
|
||||
- name: Checkout templates data
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
repository: Comfy-Org/workflow_templates
|
||||
path: _workflow_templates
|
||||
sparse-checkout: templates
|
||||
token: ${{ secrets.GH_TOKEN }}
|
||||
|
||||
- name: Restore content cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: apps/hub/.content-cache
|
||||
key: hub-content-cache-${{ hashFiles('_workflow_templates/templates/**', 'apps/hub/src/**') }}
|
||||
restore-keys: |
|
||||
hub-content-cache-
|
||||
|
||||
- name: Sync templates
|
||||
run: pnpm run sync
|
||||
working-directory: apps/hub
|
||||
env:
|
||||
HUB_TEMPLATES_DIR: ${{ github.workspace }}/_workflow_templates/templates
|
||||
|
||||
- name: Build Astro site
|
||||
run: pnpm run build
|
||||
working-directory: apps/hub
|
||||
env:
|
||||
PUBLIC_HUB_API_URL: ${{ secrets.HUB_API_URL_PRODUCTION }}
|
||||
PUBLIC_COMFY_CLOUD_URL: ${{ secrets.COMFY_CLOUD_URL_PRODUCTION }}
|
||||
PUBLIC_APPROVED_ONLY: 'true'
|
||||
|
||||
- name: Deploy to Vercel
|
||||
uses: amondnet/vercel-action@v25
|
||||
with:
|
||||
vercel-token: ${{ secrets.VERCEL_TOKEN }}
|
||||
vercel-org-id: ${{ secrets.VERCEL_ORG_ID }}
|
||||
vercel-project-id: ${{ secrets.HUB_VERCEL_PROJECT_ID }}
|
||||
working-directory: apps/hub
|
||||
vercel-args: '--prebuilt --prod'
|
||||
134
.github/workflows/hub-preview-cron.yaml
vendored
Normal file
@@ -0,0 +1,134 @@
|
||||
name: Hub Preview Cron
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '*/15 * * * *'
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
discover:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
matrix: ${{ steps.targets.outputs.matrix }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Build rebuild targets
|
||||
id: targets
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
targets='[]'
|
||||
|
||||
# Main with production API (all workflows, no approved filter)
|
||||
targets=$(echo "$targets" | jq -c '. + [{"ref": "main", "is_main": true, "pr": 0, "api_env": "production"}]')
|
||||
|
||||
# Main with test API
|
||||
targets=$(echo "$targets" | jq -c '. + [{"ref": "main", "is_main": true, "pr": 0, "api_env": "test"}]')
|
||||
|
||||
# Find open PRs with the "preview-cron" label
|
||||
prs=$(gh pr list --label "preview-cron" --state open --json number,headRefName)
|
||||
for row in $(echo "$prs" | jq -c '.[]'); do
|
||||
ref=$(echo "$row" | jq -r '.headRefName')
|
||||
num=$(echo "$row" | jq -r '.number')
|
||||
targets=$(echo "$targets" | jq -c \
|
||||
--arg ref "$ref" --argjson num "$num" \
|
||||
'. + [{"ref": $ref, "is_main": false, "pr": $num, "api_env": "test"}]')
|
||||
done
|
||||
|
||||
echo "matrix={\"include\":$targets}" >> "$GITHUB_OUTPUT"
|
||||
echo "### Rebuild targets" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "$targets" | jq '.' >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
rebuild:
|
||||
needs: discover
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix: ${{ fromJson(needs.discover.outputs.matrix) }}
|
||||
concurrency:
|
||||
group: hub-preview-cron-${{ matrix.ref }}-${{ matrix.api_env }}
|
||||
cancel-in-progress: true
|
||||
env:
|
||||
SKIP_AI_GENERATION: 'true'
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
ref: ${{ matrix.ref }}
|
||||
|
||||
- name: Setup frontend
|
||||
uses: ./.github/actions/setup-frontend
|
||||
|
||||
- name: Checkout templates data
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
repository: Comfy-Org/workflow_templates
|
||||
path: _workflow_templates
|
||||
sparse-checkout: templates
|
||||
token: ${{ secrets.GH_TOKEN }}
|
||||
|
||||
- name: Restore content cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: apps/hub/.content-cache
|
||||
key: hub-content-cache-cron-${{ matrix.ref }}-${{ matrix.api_env }}-${{ hashFiles('_workflow_templates/templates/**', 'apps/hub/src/**') }}
|
||||
restore-keys: |
|
||||
hub-content-cache-cron-${{ matrix.ref }}-${{ matrix.api_env }}-
|
||||
|
||||
- name: Sync templates
|
||||
run: pnpm run sync:en-only
|
||||
working-directory: apps/hub
|
||||
env:
|
||||
HUB_TEMPLATES_DIR: ${{ github.workspace }}/_workflow_templates/templates
|
||||
|
||||
- name: Build Astro site
|
||||
run: pnpm run build
|
||||
working-directory: apps/hub
|
||||
env:
|
||||
PUBLIC_HUB_API_URL: ${{ matrix.api_env == 'test' && secrets.HUB_API_URL_PREVIEW || secrets.HUB_API_URL_PRODUCTION }}
|
||||
PUBLIC_COMFY_CLOUD_URL: ${{ matrix.api_env == 'test' && secrets.COMFY_CLOUD_URL_PREVIEW || secrets.COMFY_CLOUD_URL_PRODUCTION }}
|
||||
|
||||
- name: Deploy to Vercel
|
||||
id: deploy
|
||||
uses: amondnet/vercel-action@v25
|
||||
with:
|
||||
vercel-token: ${{ secrets.VERCEL_TOKEN }}
|
||||
vercel-org-id: ${{ secrets.VERCEL_ORG_ID }}
|
||||
vercel-project-id: ${{ secrets.HUB_VERCEL_PROJECT_ID }}
|
||||
working-directory: apps/hub
|
||||
vercel-args: '--prebuilt'
|
||||
|
||||
- name: Alias main preview (prod API)
|
||||
if: matrix.is_main && matrix.api_env == 'production' && secrets.HUB_PREVIEW_ALIAS
|
||||
env:
|
||||
PREVIEW_URL: ${{ steps.deploy.outputs.preview-url }}
|
||||
ALIAS: ${{ secrets.HUB_PREVIEW_ALIAS }}
|
||||
VERCEL_TOKEN_VAL: ${{ secrets.VERCEL_TOKEN }}
|
||||
VERCEL_SCOPE: ${{ secrets.VERCEL_ORG_ID }}
|
||||
run: |
|
||||
npx vercel alias "$PREVIEW_URL" "$ALIAS" --token="$VERCEL_TOKEN_VAL" --scope="$VERCEL_SCOPE"
|
||||
|
||||
- name: Alias main preview (test API)
|
||||
if: matrix.is_main && matrix.api_env == 'test' && secrets.HUB_PREVIEW_TEST_ALIAS
|
||||
env:
|
||||
PREVIEW_URL: ${{ steps.deploy.outputs.preview-url }}
|
||||
ALIAS: ${{ secrets.HUB_PREVIEW_TEST_ALIAS }}
|
||||
VERCEL_TOKEN_VAL: ${{ secrets.VERCEL_TOKEN }}
|
||||
VERCEL_SCOPE: ${{ secrets.VERCEL_ORG_ID }}
|
||||
run: |
|
||||
npx vercel alias "$PREVIEW_URL" "$ALIAS" --token="$VERCEL_TOKEN_VAL" --scope="$VERCEL_SCOPE"
|
||||
|
||||
- name: Comment preview URL on PR
|
||||
if: matrix.pr > 0
|
||||
uses: marocchino/sticky-pull-request-comment@v2
|
||||
with:
|
||||
number: ${{ matrix.pr }}
|
||||
header: hub-preview-cron
|
||||
message: |
|
||||
🔄 **Hub preview cron rebuilt:** ${{ steps.deploy.outputs.preview-url }}
|
||||
_Last rebuild: ${{ github.event.head_commit.timestamp || 'manual trigger' }}_
|
||||
74
.github/workflows/hub-preview.yaml
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
name: Hub Preview
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- 'apps/hub/**'
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: hub-preview-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
preview:
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
SKIP_AI_GENERATION: 'true'
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup frontend
|
||||
uses: ./.github/actions/setup-frontend
|
||||
|
||||
- name: Checkout templates data
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
repository: Comfy-Org/workflow_templates
|
||||
path: _workflow_templates
|
||||
sparse-checkout: templates
|
||||
token: ${{ secrets.GH_TOKEN }}
|
||||
|
||||
- name: Restore content cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: apps/hub/.content-cache
|
||||
key: hub-content-cache-preview-${{ hashFiles('_workflow_templates/templates/**', 'apps/hub/src/**') }}
|
||||
restore-keys: |
|
||||
hub-content-cache-preview-
|
||||
|
||||
- name: Sync templates
|
||||
run: pnpm run sync:en-only
|
||||
working-directory: apps/hub
|
||||
env:
|
||||
HUB_TEMPLATES_DIR: ${{ github.workspace }}/_workflow_templates/templates
|
||||
|
||||
- name: Build Astro site
|
||||
run: pnpm run build
|
||||
working-directory: apps/hub
|
||||
env:
|
||||
PUBLIC_HUB_API_URL: ${{ secrets.HUB_API_URL_PREVIEW }}
|
||||
PUBLIC_COMFY_CLOUD_URL: ${{ secrets.COMFY_CLOUD_URL_PREVIEW }}
|
||||
|
||||
- name: Deploy preview to Vercel
|
||||
id: deploy
|
||||
uses: amondnet/vercel-action@v25
|
||||
with:
|
||||
vercel-token: ${{ secrets.VERCEL_TOKEN }}
|
||||
vercel-org-id: ${{ secrets.VERCEL_ORG_ID }}
|
||||
vercel-project-id: ${{ secrets.HUB_VERCEL_PROJECT_ID }}
|
||||
working-directory: apps/hub
|
||||
vercel-args: '--prebuilt'
|
||||
|
||||
- name: Comment preview URL
|
||||
if: github.event_name == 'pull_request'
|
||||
uses: marocchino/sticky-pull-request-comment@v2
|
||||
with:
|
||||
header: hub-vercel-preview
|
||||
message: |
|
||||
🚀 **Hub preview deployed:** ${{ steps.deploy.outputs.preview-url }}
|
||||
@@ -12,6 +12,8 @@
|
||||
"playwright-report/*",
|
||||
"src/extensions/core/*",
|
||||
"src/scripts/*",
|
||||
"apps/hub/scripts/**/*",
|
||||
"apps/hub/src/scripts/*",
|
||||
"src/types/generatedManagerTypes.ts",
|
||||
"src/types/vue-shim.d.ts",
|
||||
"test-results/*",
|
||||
|
||||
@@ -69,6 +69,9 @@
|
||||
/src/renderer/extensions/vueNodes/widgets/composables/usePainterWidget.ts @jtydhr88
|
||||
/src/lib/litegraph/src/widgets/PainterWidget.ts @jtydhr88
|
||||
|
||||
# GLSL
|
||||
/src/renderer/glsl/ @jtydhr88 @pythongosssss @christian-byrne
|
||||
|
||||
# 3D
|
||||
/src/extensions/core/load3d.ts @jtydhr88
|
||||
/src/extensions/core/load3dLazy.ts @jtydhr88
|
||||
|
||||
9
apps/hub/.gitignore
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
dist/
|
||||
.astro/
|
||||
.content-cache/
|
||||
src/content/templates/
|
||||
public/workflows/thumbnails/
|
||||
public/workflows/avatars/
|
||||
public/previews/
|
||||
public/search-index.json
|
||||
knowledge/tutorials/
|
||||
213
apps/hub/astro.config.mjs
Normal file
@@ -0,0 +1,213 @@
|
||||
import { defineConfig } from 'astro/config';
|
||||
import sitemap from '@astrojs/sitemap';
|
||||
import vercel from '@astrojs/vercel';
|
||||
import tailwindcss from '@tailwindcss/vite';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import os from 'node:os';
|
||||
|
||||
import vue from '@astrojs/vue';
|
||||
|
||||
// Build template date lookup at config time
|
||||
const templatesDir = path.join(process.cwd(), 'src/content/templates');
|
||||
const templateDates = new Map();
|
||||
|
||||
if (fs.existsSync(templatesDir)) {
|
||||
const files = fs.readdirSync(templatesDir).filter((f) => f.endsWith('.json'));
|
||||
for (const file of files) {
|
||||
try {
|
||||
const content = JSON.parse(fs.readFileSync(path.join(templatesDir, file), 'utf-8'));
|
||||
if (content.name && content.date) {
|
||||
templateDates.set(content.name, content.date);
|
||||
}
|
||||
} catch {
|
||||
// Skip invalid JSON files
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Build timestamp used as lastmod fallback for pages without a specific date
|
||||
const buildDate = new Date().toISOString();
|
||||
|
||||
// Supported locales (matches src/i18n/config.ts)
|
||||
const locales = ['en', 'zh', 'zh-TW', 'ja', 'ko', 'es', 'fr', 'ru', 'tr', 'ar', 'pt-BR'];
|
||||
const nonDefaultLocales = locales.filter((l) => l !== 'en');
|
||||
|
||||
// Custom sitemap pages for ISR routes not discovered at build time
|
||||
const siteOrigin = (process.env.PUBLIC_SITE_ORIGIN || 'https://www.comfy.org').replace(/\/$/, '');
|
||||
|
||||
// Creator profile pages — extract unique usernames from synced templates
|
||||
const creatorUsernames = new Set();
|
||||
if (fs.existsSync(templatesDir)) {
|
||||
const files = fs.readdirSync(templatesDir).filter((f) => f.endsWith('.json'));
|
||||
for (const file of files) {
|
||||
try {
|
||||
const content = JSON.parse(fs.readFileSync(path.join(templatesDir, file), 'utf-8'));
|
||||
if (content.username) creatorUsernames.add(content.username);
|
||||
} catch {
|
||||
// Skip invalid JSON
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const creatorPages = [...creatorUsernames].map((u) => `${siteOrigin}/workflows/${u}/`);
|
||||
const localeCustomPages = nonDefaultLocales.map((locale) =>
|
||||
`${siteOrigin}/${locale}/workflows/`
|
||||
);
|
||||
const customPages = [...creatorPages, ...localeCustomPages];
|
||||
|
||||
// https://astro.build/config
|
||||
export default defineConfig({
|
||||
site: (process.env.PUBLIC_SITE_ORIGIN || 'https://www.comfy.org').replace(/\/$/, ''),
|
||||
prefetch: {
|
||||
prefetchAll: false,
|
||||
defaultStrategy: 'hover',
|
||||
},
|
||||
i18n: {
|
||||
defaultLocale: 'en',
|
||||
locales: locales,
|
||||
routing: {
|
||||
prefixDefaultLocale: false, // English at root, others prefixed (/zh/, /ja/, etc.)
|
||||
},
|
||||
},
|
||||
integrations: [
|
||||
sitemap({
|
||||
// Use custom filename to avoid collision with Framer's /sitemap.xml
|
||||
filenameBase: 'sitemap-workflows',
|
||||
// Include Framer's marketing sitemap in the index
|
||||
customSitemaps: ['https://www.comfy.org/sitemap.xml'],
|
||||
// Include on-demand locale pages that aren't discovered at build time
|
||||
customPages: customPages,
|
||||
serialize(item) {
|
||||
const url = new URL(item.url);
|
||||
const pathname = url.pathname;
|
||||
|
||||
// Template detail pages: /workflows/{slug}/ or /{locale}/workflows/{slug}/
|
||||
const templateMatch = pathname.match(
|
||||
/^(?:\/([a-z]{2}(?:-[A-Z]{2})?))?\/workflows\/([^/]+)\/?$/
|
||||
);
|
||||
if (templateMatch) {
|
||||
const slug = templateMatch[2];
|
||||
const date = templateDates.get(slug);
|
||||
item.lastmod = date ? new Date(date).toISOString() : buildDate;
|
||||
// @ts-expect-error - sitemap types are stricter than actual API
|
||||
item.changefreq = 'monthly';
|
||||
item.priority = 0.8;
|
||||
return item;
|
||||
}
|
||||
|
||||
// Homepage
|
||||
if (pathname === '/' || pathname === '') {
|
||||
item.lastmod = buildDate;
|
||||
// @ts-expect-error - sitemap types are stricter than actual API
|
||||
item.changefreq = 'daily';
|
||||
item.priority = 1.0;
|
||||
return item;
|
||||
}
|
||||
|
||||
// Workflows index (including localized versions)
|
||||
if (pathname.match(/^(?:\/[a-z]{2}(?:-[A-Z]{2})?)?\/workflows\/?$/)) {
|
||||
item.lastmod = buildDate;
|
||||
// @ts-expect-error - sitemap types are stricter than actual API
|
||||
item.changefreq = 'daily';
|
||||
item.priority = 0.9;
|
||||
return item;
|
||||
}
|
||||
|
||||
// Category pages: /workflows/category/{type}/ or /{locale}/workflows/category/{type}/
|
||||
if (pathname.match(/^(?:\/[a-z]{2}(?:-[A-Z]{2})?)?\/workflows\/category\//)) {
|
||||
// @ts-expect-error - sitemap types are stricter than actual API
|
||||
item.changefreq = 'weekly';
|
||||
item.priority = 0.7;
|
||||
return item;
|
||||
}
|
||||
|
||||
// Model pages: /workflows/model/{model}/ or /{locale}/workflows/model/{model}/
|
||||
if (pathname.match(/^(?:\/[a-z]{2}(?:-[A-Z]{2})?)?\/workflows\/model\//)) {
|
||||
// @ts-expect-error - sitemap types are stricter than actual API
|
||||
item.changefreq = 'weekly';
|
||||
item.priority = 0.6;
|
||||
return item;
|
||||
}
|
||||
|
||||
// Tag pages: /workflows/tag/{tag}/ or /{locale}/workflows/tag/{tag}/
|
||||
if (pathname.match(/^(?:\/[a-z]{2}(?:-[A-Z]{2})?)?\/workflows\/tag\//)) {
|
||||
// @ts-expect-error - sitemap types are stricter than actual API
|
||||
item.changefreq = 'weekly';
|
||||
item.priority = 0.6;
|
||||
return item;
|
||||
}
|
||||
|
||||
// Default for other pages
|
||||
// @ts-expect-error - sitemap types are stricter than actual API
|
||||
item.changefreq = 'weekly';
|
||||
item.priority = 0.5;
|
||||
return item;
|
||||
},
|
||||
// Exclude OG image routes and legacy redirect pages from sitemap.
|
||||
// Legacy redirects are /workflows/{slug}/ without a 12-char hex share_id suffix.
|
||||
// Canonical detail pages are /workflows/{slug}-{shareId}/ (shareId = 12 hex chars).
|
||||
filter: (page) => {
|
||||
if (page.includes('/workflows/og/') || page.includes('/workflows/og.png')) return false;
|
||||
// Check if this is a workflow detail path (not category/tag/model/creators)
|
||||
const match = page.match(/\/workflows\/([^/]+)\/$/);
|
||||
if (match) {
|
||||
const segment = match[1];
|
||||
// Skip known sub-paths
|
||||
if (['category', 'tag', 'model', 'creators'].some((p) => page.includes(`/workflows/${p}/`))) return true;
|
||||
// Include if it has a share_id suffix (12 hex chars after last hyphen)
|
||||
const lastHyphen = segment.lastIndexOf('-');
|
||||
if (lastHyphen === -1) return false; // No hyphen = legacy redirect
|
||||
const candidate = segment.slice(lastHyphen + 1);
|
||||
if (candidate.length === 12 && /^[0-9a-f]+$/.test(candidate)) return true;
|
||||
return false; // Has hyphen but not a valid share_id = legacy redirect
|
||||
}
|
||||
return true;
|
||||
},
|
||||
}),
|
||||
vue(),
|
||||
],
|
||||
output: 'static',
|
||||
adapter: vercel({
|
||||
webAnalytics: { enabled: true },
|
||||
skewProtection: true,
|
||||
}),
|
||||
|
||||
// Build performance optimizations
|
||||
build: {
|
||||
// Increase concurrency for faster builds on multi-core systems
|
||||
concurrency: Math.max(1, os.cpus().length),
|
||||
// Inline small stylesheets automatically
|
||||
inlineStylesheets: 'auto',
|
||||
},
|
||||
|
||||
// HTML compression
|
||||
compressHTML: true,
|
||||
|
||||
// Image optimization settings
|
||||
image: {
|
||||
service: {
|
||||
entrypoint: 'astro/assets/services/sharp',
|
||||
config: {
|
||||
// Limit input pixels to prevent memory issues with large images
|
||||
limitInputPixels: 268402689, // ~16384x16384
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
// Responsive images for automatic srcset generation (now stable in Astro 5)
|
||||
// Note: responsiveImages was moved from experimental to stable in Astro 5.x
|
||||
|
||||
vite: {
|
||||
plugins: [tailwindcss()],
|
||||
build: {
|
||||
chunkSizeWarningLimit: 1000,
|
||||
},
|
||||
optimizeDeps: {
|
||||
include: ['web-vitals'],
|
||||
},
|
||||
css: {
|
||||
devSourcemap: false,
|
||||
},
|
||||
},
|
||||
});
|
||||
22
apps/hub/knowledge/concepts/3d-generation.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# 3D Generation
|
||||
|
||||
3D generation creates three-dimensional models — meshes, point clouds, or multi-view images — from text or image inputs. This enables rapid prototyping of 3D assets without manual modeling. In ComfyUI, several approaches exist: image-to-3D (lifting a single photo into a mesh), text-to-3D (generating a 3D object from a description), and multi-view generation (producing consistent views of an object that can be reconstructed into 3D).
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes involved: Model-specific loaders (`TripoSR`, `InstantMesh`, `StableZero123`), `LoadImage`, `Save3D` / `Preview3D`, `CRM` nodes
|
||||
- Typical workflow pattern: Load image → Load 3D model → Run inference → Preview 3D result → Export mesh
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **Inference steps**: Number of denoising/reconstruction steps. More steps generally improve quality but increase generation time.
|
||||
- **Elevation angle**: Camera elevation for multi-view generation, controlling the vertical viewing angle of the generated views.
|
||||
- **Guidance scale**: How closely the model follows the input image or text. Higher values increase fidelity to the input but may reduce diversity.
|
||||
- **Output format**: Export format for the 3D mesh — OBJ, GLB, and PLY are common options, each suited to different downstream tools.
|
||||
|
||||
## Tips
|
||||
|
||||
- Clean single-object images on white or simple backgrounds work best for image-to-3D conversion.
|
||||
- Multi-view approaches (like Zero123) often produce better geometry than single-view methods.
|
||||
- Post-process generated meshes in Blender for cleanup, retopology, or texturing before production use.
|
||||
- Start with TripoSR for quick results — it generates meshes in seconds and is a good baseline to compare against other methods.
|
||||
374
apps/hub/knowledge/concepts/_template-index.json
Normal file
@@ -0,0 +1,374 @@
|
||||
{
|
||||
"text-to-image": [
|
||||
"01_get_started_text_to_image",
|
||||
"api_bfl_flux2_max_sofa_swap",
|
||||
"api_bfl_flux_1_kontext_max_image",
|
||||
"api_bfl_flux_1_kontext_multiple_images_input",
|
||||
"api_bfl_flux_1_kontext_pro_image",
|
||||
"api_bfl_flux_pro_t2i",
|
||||
"api_bytedance_seedream4",
|
||||
"api_flux2",
|
||||
"api_from_photo_2_miniature",
|
||||
"api_google_gemini_image",
|
||||
"api_grok_text_to_image",
|
||||
"api_ideogram_v3_t2i",
|
||||
"api_kling_omni_image",
|
||||
"api_luma_photon_i2i",
|
||||
"api_luma_photon_style_ref",
|
||||
"api_nano_banana_pro",
|
||||
"api_openai_dall_e_2_inpaint",
|
||||
"api_openai_dall_e_2_t2i",
|
||||
"api_openai_dall_e_3_t2i",
|
||||
"api_openai_fashion_billboard_generator",
|
||||
"api_openai_image_1_i2i",
|
||||
"api_openai_image_1_inpaint",
|
||||
"api_openai_image_1_multi_inputs",
|
||||
"api_openai_image_1_t2i",
|
||||
"api_recraft_image_gen_with_color_control",
|
||||
"api_recraft_image_gen_with_style_control",
|
||||
"api_recraft_style_reference",
|
||||
"api_recraft_vector_gen",
|
||||
"api_runway_reference_to_image",
|
||||
"api_runway_text_to_image",
|
||||
"api_stability_ai_i2i",
|
||||
"api_stability_ai_sd3.5_i2i",
|
||||
"api_stability_ai_sd3.5_t2i",
|
||||
"api_stability_ai_stable_image_ultra_t2i",
|
||||
"api_wan_text_to_image",
|
||||
"default",
|
||||
"flux1_dev_uso_reference_image_gen",
|
||||
"flux1_krea_dev",
|
||||
"flux_canny_model_example",
|
||||
"flux_depth_lora_example",
|
||||
"flux_dev_checkpoint_example",
|
||||
"flux_dev_full_text_to_image",
|
||||
"flux_fill_inpaint_example",
|
||||
"flux_fill_outpaint_example",
|
||||
"flux_redux_model_example",
|
||||
"flux_schnell",
|
||||
"flux_schnell_full_text_to_image",
|
||||
"hidream_e1_1",
|
||||
"hidream_e1_full",
|
||||
"hidream_i1_dev",
|
||||
"hidream_i1_fast",
|
||||
"hidream_i1_full",
|
||||
"image-qwen_image_edit_2511_lora_inflation",
|
||||
"image_anima_preview",
|
||||
"image_chroma1_radiance_text_to_image",
|
||||
"image_chroma_text_to_image",
|
||||
"image_flux2",
|
||||
"image_flux2_fp8",
|
||||
"image_flux2_klein_image_edit_4b_base",
|
||||
"image_flux2_klein_image_edit_4b_distilled",
|
||||
"image_flux2_klein_image_edit_9b_base",
|
||||
"image_flux2_klein_image_edit_9b_distilled",
|
||||
"image_flux2_klein_text_to_image",
|
||||
"image_flux2_text_to_image",
|
||||
"image_flux2_text_to_image_9b",
|
||||
"image_kandinsky5_t2i",
|
||||
"image_lotus_depth_v1_1",
|
||||
"image_netayume_lumina_t2i",
|
||||
"image_newbieimage_exp0_1-t2i",
|
||||
"image_omnigen2_image_edit",
|
||||
"image_omnigen2_t2i",
|
||||
"image_ovis_text_to_image",
|
||||
"image_qwen_Image_2512",
|
||||
"image_qwen_image",
|
||||
"image_qwen_image_2512_with_2steps_lora",
|
||||
"image_qwen_image_controlnet_patch",
|
||||
"image_qwen_image_instantx_controlnet",
|
||||
"image_qwen_image_instantx_inpainting_controlnet",
|
||||
"image_qwen_image_union_control_lora",
|
||||
"image_z_image",
|
||||
"image_z_image_turbo",
|
||||
"image_z_image_turbo_fun_union_controlnet",
|
||||
"sd3.5_large_blur",
|
||||
"sd3.5_large_canny_controlnet_example",
|
||||
"sd3.5_large_depth",
|
||||
"sd3.5_simple_example",
|
||||
"sdxl_refiner_prompt_example",
|
||||
"sdxl_revision_text_prompts",
|
||||
"sdxl_simple_example",
|
||||
"sdxlturbo_example",
|
||||
"templates-9grid_social_media-v2.0"
|
||||
],
|
||||
"img2img": [
|
||||
"02_qwen_Image_edit_subgraphed",
|
||||
"api_luma_photon_i2i",
|
||||
"api_meshy_multi_image_to_model",
|
||||
"api_openai_image_1_i2i",
|
||||
"api_runway_reference_to_image",
|
||||
"api_stability_ai_i2i",
|
||||
"api_stability_ai_sd3.5_i2i",
|
||||
"flux1_dev_uso_reference_image_gen",
|
||||
"flux_canny_model_example",
|
||||
"flux_depth_lora_example",
|
||||
"flux_fill_inpaint_example",
|
||||
"flux_fill_outpaint_example",
|
||||
"flux_kontext_dev_basic",
|
||||
"flux_redux_model_example",
|
||||
"image_chrono_edit_14B",
|
||||
"image_qwen_image_edit",
|
||||
"image_qwen_image_edit_2509",
|
||||
"image_qwen_image_instantx_controlnet",
|
||||
"image_qwen_image_instantx_inpainting_controlnet",
|
||||
"sd3.5_large_blur",
|
||||
"sd3.5_large_canny_controlnet_example",
|
||||
"sd3.5_large_depth"
|
||||
],
|
||||
"inpainting": [
|
||||
"api_openai_dall_e_2_inpaint",
|
||||
"api_openai_image_1_inpaint",
|
||||
"api_stability_ai_audio_inpaint",
|
||||
"flux_fill_inpaint_example",
|
||||
"flux_fill_outpaint_example",
|
||||
"image_flux.1_fill_dev_OneReward",
|
||||
"image_qwen_image_instantx_inpainting_controlnet",
|
||||
"video_wan2_2_14B_fun_inpaint",
|
||||
"video_wan2_2_5B_fun_inpaint",
|
||||
"video_wan_vace_inpainting",
|
||||
"wan2.1_fun_inp"
|
||||
],
|
||||
"outpainting": [
|
||||
"api_bria_image_outpainting",
|
||||
"flux_fill_outpaint_example",
|
||||
"image_flux.1_fill_dev_OneReward",
|
||||
"video_wan_vace_outpainting"
|
||||
],
|
||||
"controlnet": [
|
||||
"02_qwen_Image_edit_subgraphed",
|
||||
"flux_canny_model_example",
|
||||
"flux_depth_lora_example",
|
||||
"flux_redux_model_example",
|
||||
"image_lotus_depth_v1_1",
|
||||
"image_qwen_image_controlnet_patch",
|
||||
"image_qwen_image_edit_2509",
|
||||
"image_qwen_image_instantx_controlnet",
|
||||
"image_qwen_image_instantx_inpainting_controlnet",
|
||||
"image_qwen_image_union_control_lora",
|
||||
"image_z_image_turbo_fun_union_controlnet",
|
||||
"sd3.5_large_canny_controlnet_example",
|
||||
"sd3.5_large_depth",
|
||||
"utility-depthAnything-v2-relative-video",
|
||||
"utility-frame_interpolation-film",
|
||||
"utility-lineart-video",
|
||||
"utility-normal_crafter-video",
|
||||
"utility-openpose-video",
|
||||
"video_ltx2_canny_to_video",
|
||||
"video_ltx2_depth_to_video",
|
||||
"video_ltx2_pose_to_video",
|
||||
"wan2.1_fun_control"
|
||||
],
|
||||
"upscaling": [
|
||||
"api_topaz_image_enhance",
|
||||
"api_topaz_video_enhance",
|
||||
"api_wavespeed_flshvsr_video_upscale",
|
||||
"api_wavespped_image_upscale",
|
||||
"api_wavespped_seedvr2_ai_image_fix",
|
||||
"ultility_hitpaw_general_image_enhance",
|
||||
"ultility_hitpaw_video_enhance",
|
||||
"utility-gan_upscaler",
|
||||
"utility-topaz_landscape_upscaler",
|
||||
"utility_interpolation_image_upscale",
|
||||
"utility_nanobanana_pro_ai_image_fix",
|
||||
"utility_nanobanana_pro_illustration_upscale",
|
||||
"utility_nanobanana_pro_product_upscale",
|
||||
"utility_recraft_creative_image_upscale",
|
||||
"utility_recraft_crisp_image_upscale",
|
||||
"utility_seedvr2_image_upscale",
|
||||
"utility_seedvr2_video_upscale",
|
||||
"utility_topaz_illustration_upscale",
|
||||
"utility_video_upscale"
|
||||
],
|
||||
"video-generation": [
|
||||
"03_video_wan2_2_14B_i2v_subgraphed",
|
||||
"api_bytedace_seedance1_5_flf2v",
|
||||
"api_bytedace_seedance1_5_image_to_video",
|
||||
"api_bytedace_seedance1_5_text_to_video",
|
||||
"api_bytedance_flf2v",
|
||||
"api_bytedance_image_to_video",
|
||||
"api_bytedance_text_to_video",
|
||||
"api_grok_video",
|
||||
"api_grok_video_edit",
|
||||
"api_hailuo_minimax_i2v",
|
||||
"api_hailuo_minimax_t2v",
|
||||
"api_hailuo_minimax_video",
|
||||
"api_kling2_6_i2v",
|
||||
"api_kling2_6_t2v",
|
||||
"api_kling_effects",
|
||||
"api_kling_flf",
|
||||
"api_kling_i2v",
|
||||
"api_kling_motion_control",
|
||||
"api_kling_omni_edit_video",
|
||||
"api_kling_omni_i2v",
|
||||
"api_kling_omni_t2v",
|
||||
"api_kling_omni_v2v",
|
||||
"api_ltxv_image_to_video",
|
||||
"api_ltxv_text_to_video",
|
||||
"api_luma_i2v",
|
||||
"api_luma_t2v",
|
||||
"api_moonvalley_image_to_video",
|
||||
"api_moonvalley_text_to_video",
|
||||
"api_moonvalley_video_to_video_motion_transfer",
|
||||
"api_moonvalley_video_to_video_pose_control",
|
||||
"api_openai_sora_video",
|
||||
"api_pixverse_i2v",
|
||||
"api_pixverse_t2v",
|
||||
"api_pixverse_template_i2v",
|
||||
"api_runway_first_last_frame",
|
||||
"api_runway_gen3a_turbo_image_to_video",
|
||||
"api_runway_gen4_turo_image_to_video",
|
||||
"api_topaz_video_enhance",
|
||||
"api_veo2_i2v",
|
||||
"api_veo3",
|
||||
"api_vidu_image_to_video",
|
||||
"api_vidu_q2_flf2v",
|
||||
"api_vidu_q2_i2v",
|
||||
"api_vidu_q2_r2v",
|
||||
"api_vidu_q2_t2v",
|
||||
"api_vidu_q3_image_to_video",
|
||||
"api_vidu_q3_text_to_video",
|
||||
"api_vidu_reference_to_video",
|
||||
"api_vidu_start_end_to_video",
|
||||
"api_vidu_text_to_video",
|
||||
"api_vidu_video_extension",
|
||||
"api_wan2_6_i2v",
|
||||
"api_wan2_6_t2v",
|
||||
"api_wan_image_to_video",
|
||||
"api_wan_r2v",
|
||||
"api_wan_text_to_video",
|
||||
"api_wavespeed_flshvsr_video_upscale",
|
||||
"gsc_starter_2",
|
||||
"hunyuan_video_text_to_video",
|
||||
"image_to_video_wan",
|
||||
"ltxv_image_to_video",
|
||||
"ltxv_text_to_video",
|
||||
"template-Animation_Trajectory_Control_Wan_ATI",
|
||||
"templates-3D_logo_texture_animation",
|
||||
"templates-6-key-frames",
|
||||
"templates-car_product",
|
||||
"templates-photo_to_product_vid",
|
||||
"templates-sprite_sheet",
|
||||
"templates-stitched_vid_contact_sheet",
|
||||
"templates-textured_logo_elements",
|
||||
"templates-textured_logotype-v2.1",
|
||||
"text_to_video_wan",
|
||||
"txt_to_image_to_video",
|
||||
"ultility_hitpaw_video_enhance",
|
||||
"utility-depthAnything-v2-relative-video",
|
||||
"utility-frame_interpolation-film",
|
||||
"utility-gan_upscaler",
|
||||
"utility-lineart-video",
|
||||
"utility-normal_crafter-video",
|
||||
"utility-openpose-video",
|
||||
"utility_seedvr2_video_upscale",
|
||||
"utility_video_upscale",
|
||||
"video-wan21_scail",
|
||||
"video_humo",
|
||||
"video_hunyuan_video_1.5_720p_i2v",
|
||||
"video_hunyuan_video_1.5_720p_t2v",
|
||||
"video_kandinsky5_i2v",
|
||||
"video_kandinsky5_t2v",
|
||||
"video_ltx2_canny_to_video",
|
||||
"video_ltx2_depth_to_video",
|
||||
"video_ltx2_i2v",
|
||||
"video_ltx2_i2v_distilled",
|
||||
"video_ltx2_pose_to_video",
|
||||
"video_ltx2_t2v",
|
||||
"video_ltx2_t2v_distilled",
|
||||
"video_wan2.1_alpha_t2v_14B",
|
||||
"video_wan2.1_fun_camera_v1.1_1.3B",
|
||||
"video_wan2.1_fun_camera_v1.1_14B",
|
||||
"video_wan2_1_infinitetalk",
|
||||
"video_wan2_2_14B_animate",
|
||||
"video_wan2_2_14B_flf2v",
|
||||
"video_wan2_2_14B_fun_camera",
|
||||
"video_wan2_2_14B_fun_control",
|
||||
"video_wan2_2_14B_fun_inpaint",
|
||||
"video_wan2_2_14B_i2v",
|
||||
"video_wan2_2_14B_s2v",
|
||||
"video_wan2_2_14B_t2v",
|
||||
"video_wan2_2_5B_fun_control",
|
||||
"video_wan2_2_5B_fun_inpaint",
|
||||
"video_wan2_2_5B_ti2v",
|
||||
"video_wan_ati",
|
||||
"video_wan_vace_14B_ref2v",
|
||||
"video_wan_vace_14B_t2v",
|
||||
"video_wan_vace_14B_v2v",
|
||||
"video_wan_vace_flf2v",
|
||||
"video_wan_vace_inpainting",
|
||||
"video_wan_vace_outpainting",
|
||||
"video_wanmove_480p",
|
||||
"video_wanmove_480p_hallucination",
|
||||
"wan2.1_flf2v_720_f16",
|
||||
"wan2.1_fun_control",
|
||||
"wan2.1_fun_inp"
|
||||
],
|
||||
"audio-generation": [
|
||||
"05_audio_ace_step_1_t2a_song_subgraphed",
|
||||
"api_kling2_6_i2v",
|
||||
"api_kling2_6_t2v",
|
||||
"api_stability_ai_audio_inpaint",
|
||||
"api_stability_ai_audio_to_audio",
|
||||
"api_stability_ai_text_to_audio",
|
||||
"api_vidu_q3_image_to_video",
|
||||
"api_vidu_q3_text_to_video",
|
||||
"audio-chatterbox_tts",
|
||||
"audio-chatterbox_tts_dialog",
|
||||
"audio-chatterbox_tts_multilingual",
|
||||
"audio-chatterbox_vc",
|
||||
"audio_ace_step_1_5_checkpoint",
|
||||
"audio_ace_step_1_5_split",
|
||||
"audio_ace_step_1_5_split_4b",
|
||||
"audio_ace_step_1_m2m_editing",
|
||||
"audio_ace_step_1_t2a_instrumentals",
|
||||
"audio_ace_step_1_t2a_song",
|
||||
"audio_stable_audio_example",
|
||||
"utility-audioseparation",
|
||||
"video_wan2_1_infinitetalk",
|
||||
"video_wan2_2_14B_s2v"
|
||||
],
|
||||
"3d-generation": [
|
||||
"04_hunyuan_3d_2.1_subgraphed",
|
||||
"3d_hunyuan3d-v2.1",
|
||||
"3d_hunyuan3d_image_to_model",
|
||||
"3d_hunyuan3d_multiview_to_model",
|
||||
"3d_hunyuan3d_multiview_to_model_turbo",
|
||||
"api_from_photo_2_miniature",
|
||||
"api_hunyuan3d_image_to_model",
|
||||
"api_hunyuan3d_text_to_model",
|
||||
"api_meshy_image_to_model",
|
||||
"api_meshy_multi_image_to_model",
|
||||
"api_meshy_text_to_model",
|
||||
"api_rodin_gen2",
|
||||
"api_rodin_image_to_model",
|
||||
"api_rodin_multiview_to_model",
|
||||
"api_tripo3_0_image_to_model",
|
||||
"api_tripo3_0_text_to_model",
|
||||
"api_tripo_image_to_model",
|
||||
"api_tripo_multiview_to_model",
|
||||
"api_tripo_text_to_model",
|
||||
"templates-3D_logo_texture_animation",
|
||||
"templates-qwen_multiangle"
|
||||
],
|
||||
"lora": [
|
||||
"flux_depth_lora_example",
|
||||
"image-qwen_image_edit_2511_lora_inflation",
|
||||
"image_qwen_image_2512_with_2steps_lora",
|
||||
"image_qwen_image_union_control_lora"
|
||||
],
|
||||
"embeddings": [],
|
||||
"ip-adapter": [
|
||||
"api_kling_omni_i2v",
|
||||
"api_kling_omni_image",
|
||||
"api_kling_omni_v2v",
|
||||
"api_magnific_image_style_transfer",
|
||||
"api_recraft_style_reference",
|
||||
"api_vidu_q2_r2v",
|
||||
"api_wan_r2v",
|
||||
"templates-product_ad-v2.0"
|
||||
],
|
||||
"samplers": [],
|
||||
"cfg": [],
|
||||
"vae": []
|
||||
}
|
||||
22
apps/hub/knowledge/concepts/audio-generation.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# Audio Generation
|
||||
|
||||
Audio generation in ComfyUI covers creating speech (text-to-speech), music, and sound effects from text prompts or reference audio. Dedicated audio models run within ComfyUI's node graph, letting you integrate audio creation into larger multimedia workflows — for example, generating a video and its soundtrack in a single pipeline.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes involved: Model-specific nodes (`CosyVoice` nodes for TTS, `StableAudio` nodes for music/SFX), audio preview and save nodes, `AudioScheduler`
|
||||
- Typical workflow pattern: Load audio model → Provide text/reference input → Generate audio → Preview/save audio
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **Sample rate**: Output audio quality, typically 24000–48000 Hz. Higher rates capture more detail but produce larger files.
|
||||
- **Duration**: Length of generated audio in seconds. Longer durations may reduce quality or coherence depending on the model.
|
||||
- **Voice reference**: For voice cloning, a short audio clip of the target voice (3–10 seconds of clean speech works best).
|
||||
- **Text input**: The text to be spoken (TTS) or the description of the desired audio (music/SFX generation).
|
||||
|
||||
## Tips
|
||||
|
||||
- CosyVoice and F5-TTS are popular choices for text-to-speech in ComfyUI, each with dedicated custom nodes.
|
||||
- Stable Audio Open handles music and sound effect generation from text descriptions.
|
||||
- Use clean, noise-free reference audio clips for voice cloning to get the best results.
|
||||
- Keep text inputs short and well-punctuated for the highest quality speech output — long paragraphs may degrade in naturalness.
|
||||
23
apps/hub/knowledge/concepts/cfg.md
Normal file
@@ -0,0 +1,23 @@
|
||||
# CFG / Guidance Scale
|
||||
|
||||
Classifier-Free Guidance (CFG) controls how strongly the model follows your text prompt versus generating freely. Higher CFG values produce outputs that adhere more closely to the prompt but can cause oversaturation and artifacts, while lower values yield more natural-looking images at the cost of reduced prompt control. Finding the right balance is essential for every workflow.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes: `KSampler` (the `cfg` parameter), `ModelSamplingDiscrete` (for advanced noise schedule configurations)
|
||||
- During each sampling step, the model generates both a conditioned prediction (with your prompt) and an unconditioned prediction (without it). CFG scales the difference between the two — higher values push the output further toward the conditioned prediction, amplifying prompt influence.
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **cfg** (1.0–30.0): The guidance scale value. Recommended ranges vary by model architecture:
|
||||
- SD 1.5 / SDXL: 7–8 is the standard starting point
|
||||
- Flux: 1.0–4.0 (Flux uses much lower guidance)
|
||||
- Video models (e.g., Wan, HunyuanVideo): 3.5–5.0
|
||||
|
||||
## Tips
|
||||
|
||||
- Start at 7 for SD-based models and 3.5 for Flux, then adjust based on results
|
||||
- Values above ~12 for SD models typically cause color oversaturation, harsh contrast, and visible artifacts
|
||||
- Values below ~3 for SD models tend to produce blurry or incoherent results
|
||||
- Guidance-distilled models handle this differently: Flux Dev takes a guidance value through a dedicated embedding (the `FluxGuidance` node) rather than traditional CFG, and Flux Schnell largely ignores guidance entirely — for both, leave the sampler's `cfg` parameter at 1.0
|
||||
- When experimenting, change CFG in increments of 0.5–1.0 to see its impact clearly
|
||||
28
apps/hub/knowledge/concepts/controlnet.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# ControlNet
|
||||
|
||||
ControlNet guides image generation using structural conditions extracted from reference images — such as edge maps, depth information, or human poses. Instead of relying solely on text prompts for composition, ControlNet lets you specify the spatial layout precisely. This bridges the gap between text-to-image flexibility and the structural precision needed for professional workflows.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes involved: `ControlNetLoader`, `ControlNetApplyAdvanced`, preprocessor nodes (`CannyEdgePreprocessor`, `DepthAnythingPreprocessor`, `DWPosePreprocessor`, `LineartPreprocessor`)
|
||||
- Typical workflow pattern: Load reference image → preprocess to extract condition (edges/depth/pose) → load ControlNet model → apply condition to sampling → generate image with structural guidance
|
||||
|
||||
## ControlNet Types
|
||||
|
||||
- **Canny**: Detects edges to preserve outlines and shapes
|
||||
- **Depth**: Captures spatial depth for accurate foreground/background placement
|
||||
- **OpenPose**: Extracts human body and hand poses for character positioning
|
||||
- **Normal Map**: Encodes surface orientation for consistent lighting and geometry
|
||||
- **Lineart**: Follows line drawings and illustrations as generation guides
|
||||
- **Scribble**: Uses rough sketches as loose compositional guides
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **Strength**: Controls how strongly the condition guides generation (0.0–1.0). Values of 0.5–1.0 are typical. Higher values enforce the structure more rigidly; lower values allow the model more creative freedom.
|
||||
- **start_percent / end_percent**: Controls when the ControlNet activates during the sampling process. Starting at 0.0 and ending at 1.0 applies guidance throughout. Ending earlier (e.g., 0.8) lets the model refine fine details freely in final steps.
|
||||
|
||||
## Tips
|
||||
|
||||
- Always preprocess your input image with the appropriate preprocessor node before feeding it to ControlNet. Raw images will not produce correct conditioning.
|
||||
- Combine multiple ControlNets for precise control — for example, Depth for spatial layout plus OpenPose for character positioning. Stack them by chaining `ControlNetApplyAdvanced` nodes.
|
||||
- If your generation looks distorted or overcooked, lower the ControlNet strength. Values above 0.8 can fight with the text prompt and produce artifacts.
|
||||
19
apps/hub/knowledge/concepts/embeddings.md
Normal file
@@ -0,0 +1,19 @@
|
||||
# Textual Embeddings
|
||||
|
||||
Textual embeddings are learned text representations that encode specific concepts, styles, or objects into the CLIP text encoder's vocabulary. These tiny files (~10–100 KB) effectively add new "words" to your prompt vocabulary, letting you reference complex visual concepts — a particular art style, a specific character, or a set of undesirable artifacts — with a single token. Because they operate at the text-encoding level, embeddings integrate seamlessly with your existing prompts and require no changes to the model itself.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes: `CLIPTextEncode` — reference embeddings directly in your prompt text using the syntax `embedding:name_of_embedding`
|
||||
- Typical workflow pattern: Place embedding files in `ComfyUI/models/embeddings/` → type `embedding:name_of_embedding` inside your positive or negative prompt in a `CLIPTextEncode` node → connect to sampler as usual
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **Prompt weighting**: Embeddings have no dedicated strength slider, but you can adjust their influence with prompt weighting syntax, e.g., `(embedding:name_of_embedding:1.2)` to increase strength or `(embedding:name_of_embedding:0.6)` to soften it
|
||||
- **Placement**: Add embeddings to the negative prompt to suppress unwanted features, or to the positive prompt to invoke a learned concept
|
||||
|
||||
## Tips
|
||||
|
||||
- Embeddings are commonly used in negative prompts (e.g., `embedding:EasyNegative`, `embedding:bad-hands-5`) to reduce common artifacts like malformed hands or distorted faces
|
||||
- Make sure the embedding matches your base model version — an SD 1.5 embedding will not work correctly with an SDXL checkpoint
|
||||
- You can combine multiple embeddings with regular text in the same prompt for fine-grained control
|
||||
20
apps/hub/knowledge/concepts/img2img.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# Image-to-Image
|
||||
|
||||
Image-to-image (img2img) transforms an existing image using a text prompt while preserving the original structure and composition. Instead of starting from pure noise, the source image is encoded into latent space and partially noised, then the sampler denoises it guided by your prompt. This lets you restyle photos, refine AI-generated images, or apply creative modifications while keeping the overall layout intact.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes involved: `LoadImage`, `VAEEncode`, `CLIPTextEncode` (positive + negative), `KSampler`, `VAEDecode`, `SaveImage`
|
||||
- Typical workflow pattern: Load source image → encode to latent with VAE → encode text prompts → sample with partial denoise → decode latent to image → save
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **Denoise Strength**: The most important setting, ranging from 0.0 to 1.0. Lower values (0.2–0.4) preserve more of the original image with subtle changes. Higher values (0.6–0.8) allow more creative freedom but deviate further from the source. A value of 1.0 effectively ignores the input image entirely.
|
||||
- **Steps**: Number of sampling steps. 20–30 is typical. Fewer steps may be sufficient at low denoise values since less transformation is needed.
|
||||
- **CFG Scale**: Controls prompt adherence, same as text-to-image. 7–8 is a standard starting point.
|
||||
|
||||
## Tips
|
||||
|
||||
- Start with a denoise strength of 0.5 and adjust up or down based on how much change you want. This gives a balanced mix of original structure and new content.
|
||||
- Your input image resolution should match the model's training resolution. Resize or crop your source image to 512×512 (SD 1.5) or 1024×1024 (SDXL) before loading to avoid quality issues.
|
||||
- Use img2img iteratively: generate an initial text-to-image result, then refine it with img2img at low denoise to fix details without losing the overall composition.
|
||||
21
apps/hub/knowledge/concepts/inpainting.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# Inpainting
|
||||
|
||||
Inpainting selectively regenerates parts of an image using a mask while leaving the rest untouched. You paint a mask over the area you want to change, provide a text prompt describing the desired replacement, and the model fills in only the masked region. This is essential for fixing defects, replacing objects, or refining specific details in an otherwise finished image.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes involved: `LoadImage`, `VAEEncodeForInpainting`, `CLIPTextEncode` (positive + negative), `KSampler`, `VAEDecode`, `SaveImage`
|
||||
- Typical workflow pattern: Load image + mask → encode with inpainting-aware VAE node → encode text prompts → sample → decode → save
|
||||
- The mask can be created using ComfyUI's built-in mask editor or loaded from an external image
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **grow_mask_by**: Expands the mask by a number of pixels, helping the regenerated area blend smoothly with the surrounding image. 6–8 pixels is typical. Too little causes visible seams; too much affects areas you wanted to keep.
|
||||
- **Denoise Strength**: For inpainting, higher values (0.7–1.0) generally work best since you want the masked region to be fully regenerated. Lower values may produce inconsistent blending.
|
||||
- **Checkpoint**: Dedicated inpainting models like `512-inpainting-ema` produce significantly better edge blending than standard checkpoints.
|
||||
|
||||
## Tips
|
||||
|
||||
- Always expand your mask slightly beyond the target area. Tight masks create hard edges that look unnatural against the surrounding image.
|
||||
- Describe what you want to appear in the masked region, not what you want to remove. For example, prompt "a clear blue sky" rather than "remove the bird."
|
||||
- Use inpainting-specific checkpoints whenever possible. Standard models can inpaint but often struggle with seamless blending at mask boundaries.
|
||||
21
apps/hub/knowledge/concepts/ip-adapter.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# IP-Adapter
|
||||
|
||||
IP-Adapter (Image Prompt Adapter) uses reference images to guide generation style, composition, or subject instead of — or alongside — text prompts. Rather than describing what you want in words, you show the model an image, enabling "image prompting." This is especially powerful for transferring artistic style, maintaining character consistency across generations, or conveying visual concepts that are difficult to express in text.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes: `IPAdapterModelLoader`, `IPAdapterApply` (or `IPAdapterAdvanced`), `CLIPVisionLoader`, `CLIPVisionEncode`, `PrepImageForClipVision`
|
||||
- Typical workflow pattern: Load IP-Adapter model + CLIP Vision model → prepare and encode reference image → apply adapter to the main model → connect to sampler → decode
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **weight** (0.0–1.0): Controls the influence of the reference image on the output. A range of 0.5–0.8 is typical; higher values make the output closer to the reference
|
||||
- **weight_type**: Determines how the reference is interpreted — `standard` for general use, `style transfer` for artistic style without copying content, `composition` for layout guidance
|
||||
- **start_at / end_at** (0.0–1.0): Controls when the adapter is active during sampling. Limiting the range (e.g., 0.0–0.8) can improve prompt responsiveness while retaining reference influence
|
||||
|
||||
## Tips
|
||||
|
||||
- Use the `style_transfer` weight type when you want to borrow an artistic style without reproducing the reference image's content
|
||||
- Combine IP-Adapter with a text prompt for the best results — the text adds detail and specificity on top of the visual guidance
|
||||
- Face-specific IP-Adapter models (e.g., `ip-adapter-faceid`) exist for portrait consistency across multiple generations
|
||||
- Lower the weight if your output looks too similar to the reference image
|
||||
20
apps/hub/knowledge/concepts/lora.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# LoRA
|
||||
|
||||
LoRA (Low-Rank Adaptation) is a technique for fine-tuning a base model's behavior using a small add-on file rather than retraining the entire model. LoRAs adjust a model's style, teach it specific subjects, or introduce new concepts — all in a file typically just 10–200 MB, compared to multi-gigabyte full checkpoints. This makes them easy to share, swap, and combine. In ComfyUI, you load LoRAs on top of a checkpoint and control how strongly they influence the output.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes involved: `LoraLoader` (loads one LoRA and applies it to both MODEL and CLIP), `LoraLoaderModelOnly` (applies to MODEL only, skipping CLIP for faster loading)
|
||||
- Typical workflow pattern: Load checkpoint → LoraLoader (attach LoRA) → CLIP Text Encode → KSampler → VAE Decode. Chain multiple `LoraLoader` nodes to stack LoRAs.
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **strength_model**: Controls how much the LoRA affects the diffusion model. Range 0.0–1.0; typical values are 0.6–1.0. Higher values apply the LoRA effect more strongly.
|
||||
- **strength_clip**: Controls how much the LoRA affects text encoding. Usually set to the same value as strength_model, but can be adjusted independently for fine control.
|
||||
|
||||
## Tips
|
||||
|
||||
- Start with strength 0.7 and adjust up or down based on results — too high can cause oversaturation or artifacts.
|
||||
- Stacking too many LoRAs simultaneously can cause visual artifacts or conflicting styles; two or three is usually a safe limit.
|
||||
- Ensure the LoRA matches your base model architecture — SD 1.5 LoRAs will not work with SDXL checkpoints, and vice versa.
|
||||
- Many LoRAs require specific trigger words in your prompt to activate; always check the LoRA's documentation or model card.
|
||||
20
apps/hub/knowledge/concepts/outpainting.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# Outpainting
|
||||
|
||||
Outpainting extends an image beyond its original borders, generating new content that seamlessly continues the existing scene. Unlike inpainting which replaces content within an image, outpainting adds content outside the frame — expanding the canvas in any direction. This is useful for changing aspect ratios, adding environmental context, or creating panoramic compositions from a single image.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes involved: `LoadImage`, `ImagePadForOutpaint`, `VAEEncodeForInpainting`, `CLIPTextEncode` (positive + negative), `KSampler`, `VAEDecode`, `SaveImage`
|
||||
- Typical workflow pattern: Load image → pad image with transparent/noised borders → encode with inpainting VAE node (padded area becomes the mask) → encode text prompts → sample → decode → save
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **Padding Pixels**: The number of pixels to extend on each side, typically 64–256. Smaller increments produce more coherent results since the model has more context relative to the new area.
|
||||
- **Denoise Strength**: Use high values (0.8–1.0) for outpainted regions since the padded area is essentially blank and needs full generation.
|
||||
- **Feathering**: Controls the gradient blend between the original image and the new content. Higher feathering values create smoother transitions and reduce visible seams.
|
||||
|
||||
## Tips
|
||||
|
||||
- Outpaint in stages rather than all at once. Extending by 128 pixels at a time and iterating produces far more coherent results than trying to add 512 pixels in a single pass.
|
||||
- Use a lower CFG scale (5–6) for outpainting. This allows the model to generate more natural, context-aware extensions rather than forcing strict prompt adherence that may clash with the existing image.
|
||||
- Include scene context in your prompt that matches the original image. If the source shows an indoor room, describe the room's style and lighting so the extension feels continuous.
|
||||
21
apps/hub/knowledge/concepts/samplers.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# Samplers & Schedulers
|
||||
|
||||
Samplers are the algorithms that iteratively denoise a random latent into a coherent image, while schedulers control the noise schedule — how much noise is removed at each step. Together they determine the image's quality, speed, and visual character. Choosing the right combination is one of the most impactful decisions in any generation workflow.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes: `KSampler` (main sampling node), `KSamplerAdvanced` (provides control over start/end steps for multi-pass workflows)
|
||||
- Typical workflow pattern: Load model → connect conditioning → configure sampler/scheduler/steps → sample → decode
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **sampler_name**: The denoising algorithm. Common choices include `euler` (fast, good baseline), `euler_ancestral` (more creative variation), `dpmpp_2m` (balanced quality and speed), `dpmpp_2m_sde` (high quality, slightly slower), `dpmpp_3m_sde` (very high quality), and `uni_pc` (fast convergence)
|
||||
- **scheduler**: Controls the noise reduction curve. `normal` is linear, `karras` front-loads noise reduction for better detail, `exponential` and `sgm_uniform` (recommended for SDXL) are also available
|
||||
- **steps** (1–100): Number of denoising iterations. 20–30 is typical; more steps give diminishing returns. Flux and LCM models need far fewer (4–8 steps)
|
||||
|
||||
## Tips
|
||||
|
||||
- `euler` + `normal` is the safest starting combination for any model
|
||||
- `dpmpp_2m` + `karras` is a popular choice when you want higher quality with minimal speed cost
|
||||
- Ancestral samplers (`euler_ancestral`, any `_sde` variant) produce different results each run even with the same seed — useful for exploration, but not for reproducibility
|
||||
- Flux and LCM models converge much faster; using 20+ steps with them wastes time without improving quality
|
||||
21
apps/hub/knowledge/concepts/text-to-image.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# Text-to-Image Generation
|
||||
|
||||
Text-to-image is the foundational workflow in ComfyUI: you provide a text description (prompt) and the system generates an image from scratch. This is the starting point for most generative AI art. A diffusion model iteratively denoises a random latent image, guided by your text prompt encoded through CLIP, to produce a coherent image matching your description.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes involved: `CheckpointLoaderSimple`, `CLIPTextEncode` (positive + negative), `EmptyLatentImage`, `KSampler`, `VAEDecode`, `SaveImage`
|
||||
- Typical workflow pattern: Load checkpoint → encode text prompts → create empty latent → sample → decode latent to image → save
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **Resolution**: Must match the model's training resolution. Use 512×512 for SD 1.5, 1024×1024 for SDXL and Flux models. Mismatched resolutions produce artifacts like duplicated limbs or distorted compositions.
|
||||
- **Steps**: Number of denoising iterations. 20–30 steps is a good balance between quality and speed. More steps refine details but with diminishing returns beyond 30.
|
||||
- **CFG Scale**: Controls how strongly the sampler follows your prompt. 7–8 is the typical range. Higher values increase prompt adherence but can introduce oversaturation or artifacts.
|
||||
- **Seed**: Determines the initial random noise. A fixed seed produces reproducible results, which is useful for iterating on prompts while keeping composition consistent.
|
||||
|
||||
## Tips
|
||||
|
||||
- Start with simple, descriptive prompts before adding stylistic modifiers. Complex prompts can conflict and produce muddy results.
|
||||
- Use the negative prompt `CLIPTextEncode` to specify what you want to avoid (e.g., "blurry, low quality, deformed hands") — this significantly improves output quality.
|
||||
- Always match your `EmptyLatentImage` resolution to the model you loaded. A 768×768 image on an SD 1.5 checkpoint will produce noticeably worse results than 512×512.
|
||||
21
apps/hub/knowledge/concepts/upscaling.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# Upscaling
|
||||
|
||||
Upscaling increases image resolution while adding detail, turning a small generated image into a large, sharp result. In ComfyUI, there are two main approaches: model-based upscaling, which uses trained AI models (like RealESRGAN or 4x-UltraSharp) to intelligently enlarge an image in one pass, and latent-based upscaling, which works in latent space with a KSampler to add new detail during the enlargement process. Model-based is faster, while latent-based offers more creative control.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes involved: `UpscaleModelLoader`, `ImageUpscaleWithModel`, `ImageScaleBy`, `LatentUpscale`, `VAEDecodeTiled`
|
||||
- Typical workflow pattern: Generate image → Upscale model loader → ImageUpscaleWithModel → Save image (model-based), or Generate latent → LatentUpscale → KSampler (lower denoise) → VAEDecode → Save image (latent-based)
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **Upscale model**: The AI model used for model-based upscaling. `RealESRGAN_x4plus` is a reliable general-purpose choice; `4x-UltraSharp` excels at photo-realistic detail.
|
||||
- **Scale factor**: How much to enlarge — 2x and 4x are typical. Higher factors increase VRAM usage significantly.
|
||||
- **tile_size**: For tiled decoding/encoding of very large images. Range 512–1024; smaller tiles use less VRAM but take longer.
|
||||
|
||||
## Tips
|
||||
|
||||
- Model-based upscaling is faster but less creative; latent upscaling paired with a KSampler adds genuinely new detail.
|
||||
- Use `VAEDecodeTiled` for very large images to avoid out-of-memory errors.
|
||||
- Chain two 2x upscales instead of one 4x for better overall quality.
|
||||
- When using latent upscaling, set KSampler denoise to 0.3–0.5 to add detail without changing the composition.
|
||||
20
apps/hub/knowledge/concepts/vae.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# VAE (Variational Autoencoder)
|
||||
|
||||
The VAE encodes pixel images into a compact latent representation and decodes latents back into pixel images. All diffusion in Stable Diffusion and Flux happens in latent space — the VAE is the bridge between the images you see and the mathematical space where the model actually works. Every generation workflow ends with a VAE decode step to produce a viewable image.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes: `VAEDecode` (latent → image), `VAEEncode` (image → latent), `VAEDecodeTiled` (for large images to avoid out-of-memory errors), `VAELoader` (load a standalone VAE file)
|
||||
- Typical workflow pattern: Most checkpoints include a built-in VAE, so the `VAEDecode` node can pull directly from the loaded checkpoint. To use a different VAE, add a `VAELoader` node and connect it to `VAEDecode` instead.
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **tile_size** (for `VAEDecodeTiled`): Size of each tile when decoding in chunks. Default is 512; reduce if you still encounter memory issues
|
||||
- **VAE choice**: VAE files are model-specific. Use `sdxl_vae.safetensors` for SDXL, `ae.safetensors` for Flux. Place files in `ComfyUI/models/vae/`
|
||||
|
||||
## Tips
|
||||
|
||||
- If colors look washed out or slightly off, try loading an external VAE — the VAE baked into a checkpoint is not always optimal, especially for community fine-tunes
|
||||
- Use `VAEDecodeTiled` for images larger than ~2048 px on either side to prevent out-of-memory crashes
|
||||
- SDXL and Flux each have their own VAE architecture — using the wrong one will produce corrupted output
|
||||
- When doing img2img or inpainting, the `VAEEncode` node converts your input image into the latent space the model expects
|
||||
22
apps/hub/knowledge/concepts/video-generation.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# Video Generation
|
||||
|
||||
Video generation creates video content from text prompts (T2V), reference images (I2V), or existing video (V2V) using specialized video diffusion models. Unlike image generation, video models must maintain temporal coherence across frames, ensuring smooth motion and consistent subjects. ComfyUI supports several leading open-source video models including WAN 2.1 and HunyuanVideo, each with its own loader and latent nodes.
|
||||
|
||||
## How It Works in ComfyUI
|
||||
|
||||
- Key nodes involved: Model-specific loaders (e.g. `WAN` video nodes, `HunyuanVideo` nodes, `LTXVLoader`), `EmptyHunyuanLatentVideo` / `EmptyLTXVLatentVideo`, `KSampler`, `VHS_VideoCombine` (from Video Helper Suite)
|
||||
- Typical workflow pattern: Load video model → Create empty video latent → KSampler (with video-aware scheduling) → VAE decode → VHS_VideoCombine → Save video
|
||||
|
||||
## Key Settings
|
||||
|
||||
- **Frame count**: Number of frames to generate. Typically 16–81 frames depending on the model; more frames require more VRAM and time.
|
||||
- **Resolution**: Often 512×320 or 848×480 for T2V. Higher resolutions need significantly more resources.
|
||||
- **FPS**: Frames per second for output, typically 8–24. Higher FPS gives smoother motion but requires more frames for the same duration.
|
||||
- **Motion scale/strength**: Controls the amount of movement in the generated video. Lower values produce subtle motion; higher values produce more dynamic scenes.
|
||||
|
||||
## Tips
|
||||
|
||||
- Start with fewer frames and lower resolution to test your prompt and settings before committing to a full-quality render.
|
||||
- Image-to-video (I2V) typically gives better coherence than text-to-video (T2V) because the model has a visual anchor.
|
||||
- Video Helper Suite (VHS) nodes are essential for loading, previewing, and saving video — install this custom node pack first.
|
||||
- WAN 2.1 and HunyuanVideo are currently the leading open models for quality video generation in ComfyUI.
|
||||
88
apps/hub/knowledge/models/_aliases.json
Normal file
@@ -0,0 +1,88 @@
|
||||
{
|
||||
"Wan": "wan",
|
||||
"Wan2.1": "wan",
|
||||
"Wan2.2": "wan",
|
||||
"Wan2.5": "wan",
|
||||
"Wan2.6": "wan",
|
||||
"Wan-Move": "wan",
|
||||
"Motion Control": "wan",
|
||||
"Flux": "flux",
|
||||
"Flux.2": "flux",
|
||||
"Flux.2 Dev": "flux",
|
||||
"Flux.2 Klein": "flux",
|
||||
"Kontext": "flux",
|
||||
"BFL": "flux",
|
||||
"SDXL": "sdxl",
|
||||
"SD1.5": "sdxl",
|
||||
"Stability": "sdxl",
|
||||
"Reimagine": "sdxl",
|
||||
"SD3.5": "sd3-5",
|
||||
"SVD": "svd",
|
||||
"Stable Audio": "stable-audio",
|
||||
"Google": "gemini",
|
||||
"Google Gemini": "gemini",
|
||||
"Google Gemini Image": "gemini",
|
||||
"Gemini3 Pro Image Preview": "gemini",
|
||||
"Gemini-2.5-Flash": "gemini",
|
||||
"Veo": "veo",
|
||||
"Nano Banana Pro": "nano-banana-pro",
|
||||
"nano-banana": "nano-banana-pro",
|
||||
"OpenAI": "gpt-image-1",
|
||||
"GPT-Image-1": "gpt-image-1",
|
||||
"GPT-Image-1.5": "gpt-image-1",
|
||||
"Qwen": "qwen",
|
||||
"Qwen-Image": "qwen",
|
||||
"Qwen-Image-Edit": "qwen",
|
||||
"Qwen-Image-Layered": "qwen",
|
||||
"Qwen-Image 2512": "qwen",
|
||||
"Hunyuan Video": "hunyuan",
|
||||
"Hunyuan3D": "hunyuan",
|
||||
"Tencent": "hunyuan",
|
||||
"LTX-2": "ltx-video",
|
||||
"LTXV": "ltx-video",
|
||||
"Lightricks": "ltx-video",
|
||||
"ByteDance": "seedance",
|
||||
"Seedance": "seedance",
|
||||
"Seedream": "seedream",
|
||||
"Seedream 4.0": "seedream",
|
||||
"SeedVR2": "seedvr2",
|
||||
"Vidu": "vidu",
|
||||
"Vidu Q2": "vidu",
|
||||
"Vidu Q3": "vidu",
|
||||
"Kling": "kling",
|
||||
"Kling O1": "kling",
|
||||
"Kling2.6": "kling",
|
||||
"ACE-Step": "ace-step",
|
||||
"Chatter Box": "chatterbox",
|
||||
"Recraft": "recraft",
|
||||
"Runway": "runway",
|
||||
"Luma": "luma",
|
||||
"HiDream": "hidream",
|
||||
"Tripo": "tripo",
|
||||
"MiniMax": "minimax",
|
||||
"Z-Image-Turbo": "z-image",
|
||||
"Z-Image": "z-image",
|
||||
"Grok": "grok",
|
||||
"Moonvalley": "moonvalley",
|
||||
"Topaz": "topaz",
|
||||
"Kandinsky": "kandinsky",
|
||||
"OmniGen": "omnigen",
|
||||
"Magnific": "magnific",
|
||||
"PixVerse": "pixverse",
|
||||
"Meshy": "meshy",
|
||||
"Rodin": "rodin",
|
||||
"WaveSpeed": "wavespeed",
|
||||
"Chroma": "chroma",
|
||||
"BRIA": "bria",
|
||||
"HitPaw": "hitpaw",
|
||||
"NewBie": "newbie",
|
||||
"Ovis-Image": "ovis-image",
|
||||
"Ideogram": "ideogram",
|
||||
"Anima": "anima",
|
||||
"ChronoEdit": "chronoedit",
|
||||
"Nvidia": "chronoedit",
|
||||
"HuMo": "humo",
|
||||
"FlashVSR": "flashvsr",
|
||||
"Real-ESRGAN": "real-esrgan",
|
||||
"Depth Anything\u00a0v2": "depth-anything-v2"
|
||||
}
|
||||
47
apps/hub/knowledge/models/ace-step.md
Normal file
@@ -0,0 +1,47 @@
|
||||
# ACE-Step
|
||||
|
||||
ACE-Step is a foundation model for music generation developed by ACE Studio and StepFun. It uses diffusion-based generation with a Deep Compression AutoEncoder (DCAE) and a lightweight linear transformer to achieve state-of-the-art speed and musical coherence.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### ACE-Step (3.5B)
|
||||
|
||||
- 3.5B parameter diffusion model
|
||||
- DCAE encoder with linear transformer conditioning
|
||||
- 27 or 60 inference steps recommended
|
||||
- Apache 2.0 license
|
||||
|
||||
## Key Features
|
||||
|
||||
- 15x faster than LLM-based baselines (20 seconds for a 4-minute song on A100)
|
||||
- Full-song generation with lyrics and structure
|
||||
- Duration control for variable-length output
|
||||
- Music remixing and style transfer
|
||||
- Lyrics editing and vocal synthesis
|
||||
- Supports 16+ languages including English, Chinese, Japanese, Korean, French, German, Spanish, and more
|
||||
- Text-to-music from natural language descriptions
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- RTX 3090: 12.76x real-time factor at 27 steps
|
||||
- RTX 4090: 34.48x real-time factor at 27 steps
|
||||
- NVIDIA A100: 27.27x real-time factor at 27 steps
|
||||
- Apple M2 Max: 2.27x real-time factor at 27 steps
|
||||
- Higher step counts (60) reduce speed by roughly half
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Original music generation from text descriptions
|
||||
- Song remixing and style transfer
|
||||
- Lyrics-based music creation
|
||||
- Multi-language vocal music generation
|
||||
- Rapid music prototyping for content creators
|
||||
- Background music and soundtrack generation
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **steps**: Inference steps (27 for speed, 60 for quality)
|
||||
- **duration**: Target audio length in seconds (up to ~5 minutes)
|
||||
- **lyrics**: Song lyrics text input for vocal generation
|
||||
- **prompt**: Natural language description of desired music style and mood
|
||||
- **seed**: Random seed for reproducible generation (results are seed-sensitive)
|
||||
46
apps/hub/knowledge/models/anima.md
Normal file
@@ -0,0 +1,46 @@
|
||||
# Anima
|
||||
|
||||
Anima is an API-based AI video generation platform that creates animated video content from text prompts, supporting character consistency and storyboard-driven workflows.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Anima Video Generation
|
||||
|
||||
- Cloud-based video generation service
|
||||
- Supports multiple underlying AI models (Runway, Kling, MiniMax, Luma)
|
||||
- Integrated text, image, and audio generation pipeline
|
||||
|
||||
## Key Features
|
||||
|
||||
- AI character generation with persistent identity across scenes
|
||||
- Storyboard-based workflow: script to visual scenes with narration
|
||||
- Multi-model integration (GPT-4, Claude, Gemini for text; FLUX, Midjourney for images)
|
||||
- Voice generation via ElevenLabs integration
|
||||
- Music composition via Suno integration
|
||||
- Autopilot mode for fully automated video creation
|
||||
- Prompt enhancement for optimized output quality
|
||||
- Template library for rapid content creation
|
||||
- Scene-by-scene generation with character consistency
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- No local hardware required (cloud-based service)
|
||||
- Runs entirely through web API
|
||||
- Browser-based interface for interactive use
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Animated story series production
|
||||
- Movie trailer and concept video creation
|
||||
- Kids bedtime story animation
|
||||
- Lofi music video generation
|
||||
- Marketing and explainer video content
|
||||
- Storyboard visualization
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of the scene or story
|
||||
- **character**: Selected or generated character for identity consistency
|
||||
- **style**: Visual style preset (animation, cinematic, etc.)
|
||||
- **duration**: Target video length
|
||||
- **resolution**: Output video resolution
|
||||
48
apps/hub/knowledge/models/bria.md
Normal file
@@ -0,0 +1,48 @@
|
||||
# BRIA AI
|
||||
|
||||
BRIA AI is an enterprise-focused visual generative AI platform that trains its models exclusively on licensed, ethically sourced data, ensuring commercially safe outputs with full IP indemnification.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### BRIA Fibo
|
||||
|
||||
- Flagship hyper-controllable text-to-image model
|
||||
- JSON-based control framework with 100+ disentangled visual attributes
|
||||
- Supports lighting, depth, color, composition, and camera control
|
||||
- Ideal for agentic workflows and enterprise-scale creative automation
|
||||
|
||||
### BRIA Text-to-Image Lite
|
||||
|
||||
- Fully private, self-hosted deployment of the Fibo pipeline
|
||||
- Designed for regulated industries requiring total data control
|
||||
- Runs on-premises with no external data transfer
|
||||
|
||||
## Key Features
|
||||
|
||||
- Trained on 100% licensed data from 20+ partners including Getty Images
|
||||
- Full IP indemnification for commercial use
|
||||
- Tri-layer content moderation for brand-safe outputs
|
||||
- Patented attribution engine compensating data owners by usage
|
||||
- ControlNet support for canny, depth, recoloring, and IP Adapter
|
||||
- Multilingual prompt support
|
||||
- Fine-tuning API for brand-specific customization
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud-hosted API available (no local GPU required)
|
||||
- Self-hosted Lite version supports deployment on AWS and Azure
|
||||
- Open-source weights available on Hugging Face for local inference
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Enterprise marketing and advertising content
|
||||
- E-commerce product photography
|
||||
- Brand-consistent visual asset generation
|
||||
- Storyboarding and concept art for media production
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of desired image
|
||||
- **style**: Photorealistic, illustrative, or custom styles
|
||||
- **guidance_methods**: ControlNet canny, depth, recoloring, IP Adapter
|
||||
- **resolution**: Multiple aspect ratios supported
|
||||
52
apps/hub/knowledge/models/chatterbox.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# Chatterbox
|
||||
|
||||
Chatterbox is a family of state-of-the-art open-source text-to-speech models developed by Resemble AI, featuring zero-shot voice cloning and emotion control.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Chatterbox Turbo
|
||||
|
||||
- 350M parameters, single-step mel decoding for low latency
|
||||
- Paralinguistic tags for non-speech sounds ([laugh], [cough], [chuckle])
|
||||
- English only, optimized for voice agents and production use
|
||||
|
||||
### Chatterbox (Original)
|
||||
|
||||
- 500M parameter Llama backbone, English only
|
||||
- CFG and exaggeration control for emotion intensity
|
||||
|
||||
### Chatterbox Multilingual
|
||||
|
||||
- 500M parameters, 23 languages (Arabic, Chinese, French, German, Hindi, Japanese, Korean, Spanish, and more)
|
||||
- Zero-shot voice cloning across languages
|
||||
|
||||
## Key Features
|
||||
|
||||
- Zero-shot voice cloning from a few seconds of reference audio
|
||||
- Emotion exaggeration control (first open-source model with this feature)
|
||||
- Built-in PerTh neural watermarking for responsible AI
|
||||
- Sub-200ms latency for real-time applications
|
||||
- Trained on 500K hours of cleaned speech data
|
||||
- MIT license (free for commercial use)
|
||||
- Outperforms ElevenLabs in subjective evaluations
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: NVIDIA GPU with CUDA support
|
||||
- Turbo model requires less VRAM than original due to smaller architecture
|
||||
- Runs on consumer GPUs (RTX 3060 and above)
|
||||
- CPU inference possible but significantly slower
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Voice cloning for content creation
|
||||
- AI voice agents and assistants
|
||||
- Audiobook narration
|
||||
- Game and media dialogue generation
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **exaggeration**: Emotion intensity control (0.0 to 1.0, default 0.5)
|
||||
- **cfg_weight**: Classifier-free guidance weight (0.0 to 1.0, default 0.5)
|
||||
- **audio_prompt_path**: Path to reference audio clip for voice cloning
|
||||
- **language_id**: Language code for multilingual model (e.g., "fr", "zh", "ja")
|
||||
50
apps/hub/knowledge/models/chroma.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# Chroma
|
||||
|
||||
Chroma is an open-source 8.9 billion parameter text-to-image model based on the FLUX.1-schnell architecture, developed by Lodestone Rock and the community. It is fully Apache 2.0 licensed.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Chroma
|
||||
|
||||
- 8.9B parameter model based on FLUX.1-schnell
|
||||
- Trained on a curated 5M sample dataset (from 20M candidates)
|
||||
- Apache 2.0 license for unrestricted use
|
||||
- Supports both tag-based and natural language prompting
|
||||
|
||||
### Chroma XL
|
||||
|
||||
- Experimental merge and fine-tune based on NoobAI-XL (SDXL architecture)
|
||||
- Low CFG (2.5-3.0) and low step count (8-12 steps)
|
||||
- Optimized for fast generation on consumer hardware
|
||||
|
||||
## Key Features
|
||||
|
||||
- Fully open-source with Apache 2.0 licensing
|
||||
- Diverse training data spanning anime, artistic, and photographic styles
|
||||
- Community-driven development with public training logs
|
||||
- Compatible with FLUX ecosystem (VAE, T5 text encoder)
|
||||
- ComfyUI workflow support
|
||||
- LoRA and fine-tuning compatible
|
||||
- GGUF quantized versions available for lower VRAM
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Base model: 24GB VRAM recommended (BF16)
|
||||
- Q8_0 quantized: ~13GB VRAM
|
||||
- Q4_0 quantized: ~7GB VRAM
|
||||
- Requires FLUX.1 VAE and T5 text encoder
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Open-source text-to-image generation
|
||||
- Artistic and stylized image creation
|
||||
- Community model fine-tuning and experimentation
|
||||
- LoRA training for custom styles and characters
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description or tag-based prompt
|
||||
- **steps**: Inference steps (15-30 recommended)
|
||||
- **cfg_scale**: Guidance scale (1-4, model uses low CFG)
|
||||
- **resolution**: Output resolution (1024x1024 default)
|
||||
- **guidance**: Flux-style guidance parameter (around 4)
|
||||
58
apps/hub/knowledge/models/chronoedit.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# ChronoEdit
|
||||
|
||||
ChronoEdit is an image editing framework by NVIDIA that reframes editing as a video generation task, using temporal reasoning to ensure physically plausible and consistent edits.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### ChronoEdit-14B
|
||||
|
||||
- Full 14 billion parameter model for maximum quality
|
||||
- Built on pretrained video diffusion model architecture
|
||||
- Requires ~34GB VRAM (38GB with temporal reasoning enabled)
|
||||
|
||||
### ChronoEdit-2B
|
||||
|
||||
- Compact 2 billion parameter variant for efficiency
|
||||
- Maintains core temporal reasoning capabilities
|
||||
- Lower VRAM requirements for broader hardware compatibility
|
||||
|
||||
### ChronoEdit-14B 8-Step Distilled LoRA
|
||||
|
||||
- Distilled variant requiring only 8 inference steps
|
||||
- Faster generation with minimal quality loss
|
||||
- Uses flow-shift 2.0 and guidance-scale 1.0
|
||||
|
||||
## Key Features
|
||||
|
||||
- Treats image editing as a video generation task for temporal consistency
|
||||
- Temporal reasoning tokens simulate intermediate editing trajectories
|
||||
- Ensures physically plausible edits (object interactions, lighting, shadows)
|
||||
- Two-stage pipeline: temporal reasoning stage followed by editing frame generation
|
||||
- Prompt enhancer integration for improved editing instructions
|
||||
- LoRA fine-tuning support via DiffSynth-Studio
|
||||
- Upscaler LoRA available for super-resolution editing
|
||||
- PaintBrush LoRA for sketch-to-object editing
|
||||
- Apache-2.0 license
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- 14B model: 34GB VRAM minimum (38GB with temporal reasoning)
|
||||
- 2B model: 12GB+ VRAM estimated
|
||||
- Supports model offloading to reduce peak VRAM
|
||||
- Linux only (not supported on Windows/macOS)
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Physically consistent image editing (add/remove/modify objects)
|
||||
- World simulation for autonomous driving and robotics
|
||||
- Visualizing editing trajectories and reasoning
|
||||
- Image super-resolution via upscaler LoRA
|
||||
- Sketch-to-object conversion via PaintBrush LoRA
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of the desired edit
|
||||
- **num_inference_steps**: Denoising steps (default ~50, or 8 with distilled LoRA)
|
||||
- **guidance_scale**: Prompt adherence strength (default ~7.5, or 1.0 with distilled LoRA)
|
||||
- **flow_shift**: Flow matching shift parameter (2.0 for distilled LoRA)
|
||||
- **enable_temporal_reasoning**: Toggle temporal reasoning stage for better consistency
|
||||
60
apps/hub/knowledge/models/depth-anything-v2.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# Depth Anything V2
|
||||
|
||||
Depth Anything V2 is a monocular depth estimation model trained on 595K synthetic labeled images and 62M+ real unlabeled images, providing robust relative depth maps from single images.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Depth-Anything-V2-Small
|
||||
|
||||
- Lightweight variant for fast inference
|
||||
- ViT-S (Small) encoder backbone
|
||||
- Suitable for real-time applications
|
||||
|
||||
### Depth-Anything-V2-Base
|
||||
|
||||
- Mid-range variant balancing speed and accuracy
|
||||
- ViT-B (Base) encoder backbone
|
||||
|
||||
### Depth-Anything-V2-Large
|
||||
|
||||
- High-accuracy variant for detailed depth maps
|
||||
- ViT-L (Large) encoder backbone with 256 output features
|
||||
- Recommended for most production use cases
|
||||
|
||||
### Depth-Anything-V2-Giant
|
||||
|
||||
- Maximum accuracy variant
|
||||
- ViT-G (Giant) encoder backbone
|
||||
- Highest computational requirements
|
||||
|
||||
## Key Features
|
||||
|
||||
- More fine-grained depth detail than Depth Anything V1
|
||||
- More robust than V1 and Stable Diffusion-based alternatives (Marigold, Geowizard)
|
||||
- 10× faster than SD-based depth estimation models
|
||||
- Trained on large-scale synthetic + real data mixture
|
||||
- Produces relative (not metric) depth maps by default
|
||||
- DPT (Dense Prediction Transformer) decoder architecture
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Small: 2GB VRAM minimum
|
||||
- Base: 4GB VRAM minimum
|
||||
- Large: 6GB VRAM recommended
|
||||
- Giant: 12GB+ VRAM recommended
|
||||
- CPU inference supported for smaller variants
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Depth map generation for compositing and VFX
|
||||
- ControlNet depth conditioning for image generation
|
||||
- 3D scene understanding and reconstruction
|
||||
- Foreground/background separation
|
||||
- Augmented reality occlusion
|
||||
- Video depth estimation for parallax effects
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **encoder**: Model size variant (vits, vitb, vitl, vitg)
|
||||
- **input_size**: Processing resolution (higher = more detail, more VRAM)
|
||||
- **output_type**: Raw depth array or normalized visualization
|
||||
50
apps/hub/knowledge/models/flashvsr.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# FlashVSR
|
||||
|
||||
FlashVSR is a diffusion-based streaming video super-resolution framework that achieves near real-time 4× upscaling through one-step inference with locality-constrained sparse attention.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### FlashVSR v1
|
||||
|
||||
- Initial release of the one-step streaming VSR model
|
||||
- Built on Wan2.1 1.3B video diffusion backbone
|
||||
- 4× super-resolution optimized
|
||||
|
||||
### FlashVSR v1.1
|
||||
|
||||
- Enhanced stability and fidelity over v1
|
||||
- Improved artifact handling across different aspect ratios
|
||||
- Recommended for production use
|
||||
|
||||
## Key Features
|
||||
|
||||
- One-step diffusion inference (no multi-step denoising required)
|
||||
- Streaming architecture with KV cache for sequential frame processing
|
||||
- Locality-Constrained Sparse Attention (LCSA) prevents artifacts at high resolutions
|
||||
- Tiny Conditional Decoder (TC Decoder) achieves 7× faster decoding than standard WanVAE
|
||||
- Three-stage distillation pipeline from multi-step to single-step inference
|
||||
- Runs at ~17 FPS for 768×1408 videos on a single A100 GPU
|
||||
- Up to 12× speedup over prior one-step diffusion VSR models
|
||||
- Scales reliably to ultra-high resolutions
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: 24GB VRAM (A100 or similar recommended)
|
||||
- Optimized for NVIDIA A100 GPUs
|
||||
- Significant VRAM required for high-resolution video processing
|
||||
- Multi-GPU inference not required but beneficial for throughput
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Real-world video upscaling to 4K
|
||||
- AI-generated video enhancement and artifact removal
|
||||
- Long video super-resolution with temporal consistency
|
||||
- Streaming video quality improvement
|
||||
- Restoring compressed or low-resolution video footage
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **scale**: Upscaling factor (4× recommended for best results)
|
||||
- **tile_size**: Spatial tiling for memory management (0 = auto)
|
||||
- **input_resolution**: Source video resolution (outputs 4× larger)
|
||||
- **model_version**: v1 or v1.1 checkpoint selection
|
||||
98
apps/hub/knowledge/models/flux.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# Flux
|
||||
|
||||
Flux is a family of state-of-the-art text-to-image and image editing models developed by Black Forest Labs (BFL).
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Flux.1 Schnell
|
||||
|
||||
- Ultra-fast inference (1-4 steps)
|
||||
- 12B parameter rectified flow transformer
|
||||
- Apache 2.0 license (open source)
|
||||
- Best for rapid prototyping and real-time applications
|
||||
|
||||
### Flux.1 Dev
|
||||
|
||||
- High-quality 12B parameter development model
|
||||
- 20-50 steps for best results
|
||||
- Non-commercial license for research
|
||||
- Guidance-distilled for efficient generation
|
||||
|
||||
### Flux.1 Pro
|
||||
|
||||
- Highest quality Flux.1 outputs via commercial API
|
||||
- Best prompt adherence and detail
|
||||
|
||||
### Flux.2 Dev
|
||||
|
||||
- 32B parameter rectified flow transformer
|
||||
- Unified text-to-image, single-reference editing, and multi-reference editing
|
||||
- No fine-tuning needed for character/object/style reference
|
||||
- Up to 4MP photorealistic output with improved autoencoder
|
||||
- Non-commercial license; quantized versions available for consumer GPUs
|
||||
|
||||
### Flux.2 Klein
|
||||
|
||||
- Fastest Flux model family — sub-second inference on modern hardware
|
||||
- **Klein 4B**: ~8GB VRAM, Apache 2.0 license, ideal for edge deployment
|
||||
- **Klein 9B**: Best quality-to-latency ratio, non-commercial license
|
||||
- Base (undistilled) variants available for fine-tuning and LoRA training
|
||||
- Supports text-to-image, single-reference editing, and multi-reference editing
|
||||
|
||||
### Flux.1 Kontext
|
||||
|
||||
- In-context image generation and editing via text instructions
|
||||
- Available as Kontext Max (premium), Pro (API), and Dev (open-weights, 12B)
|
||||
- Character consistency across multiple scenes without fine-tuning
|
||||
- Typography manipulation and local editing within images
|
||||
|
||||
### Flux.1 Fill
|
||||
|
||||
- Dedicated inpainting and outpainting model
|
||||
- Maintains consistency with surrounding image context
|
||||
- Available as Fill Pro (API) and Fill Dev (open-weights)
|
||||
|
||||
### Flux Redux / Canny / Depth
|
||||
|
||||
- **Redux**: Image variation generation from reference images
|
||||
- **Canny**: Edge-detection-based structural conditioning
|
||||
- **Depth**: Depth-map-based structural conditioning for pose/layout control
|
||||
|
||||
## Key Features
|
||||
|
||||
- Excellent text rendering in images
|
||||
- Strong prompt following and instruction adherence
|
||||
- High resolution output (up to 4MP with Flux.2)
|
||||
- Multi-reference editing: combine up to 6 reference images
|
||||
- Consistent style and quality across generations
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Flux.2 Klein 4B: ~8GB VRAM (consumer GPUs like RTX 4070)
|
||||
- Flux.2 Klein 9B: ~20GB VRAM
|
||||
- Flux.1 models: 12GB VRAM minimum (fp16), 24GB recommended
|
||||
- Flux.2 Dev: 64GB+ VRAM native, FP8 quantized ~40GB
|
||||
- Quantized and weight-streaming options available for lower VRAM cards
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Text-to-image generation
|
||||
- Iterative image editing via text instructions
|
||||
- Character-consistent multi-scene generation
|
||||
- Inpainting and outpainting
|
||||
- Style transfer and image variation
|
||||
- Structural conditioning (canny, depth)
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **steps**: 1-4 (Schnell/Klein distilled), 20-50 (Dev/Base)
|
||||
- **guidance_scale**: 3.5-4.0 typical for Flux.2, 3.5 for Flux.1
|
||||
- **resolution**: Up to 2048x2048 (Flux.1), up to 4MP (Flux.2)
|
||||
- **seed**: For reproducible generation
|
||||
- **prompt_upsampling**: Optional LLM-based prompt enhancement (Flux.2)
|
||||
|
||||
## Blog References
|
||||
|
||||
- [FLUX.2 Day-0 Support in ComfyUI](../blog/flux2-day-0-support.md) — FLUX.2 with 4MP output, multi-reference consistency, professional text rendering
|
||||
- [FLUX.2 [klein] 4B & 9B](../blog/flux2-klein-4b.md) — Fastest Flux models, sub-second inference, unified generation and editing
|
||||
- [The Complete AI Upscaling Handbook](../blog/upscaling-handbook.md) — Benchmarks for upscaling workflows
|
||||
1
apps/hub/knowledge/models/flux.summary.md
Normal file
@@ -0,0 +1 @@
|
||||
Flux is Black Forest Labs' family of text-to-image and image editing models. The lineup includes Flux.1 Schnell (ultra-fast, 1-4 steps, Apache 2.0), Flux.1 Dev (high-quality, 20-50 steps, non-commercial), Flux.1 Pro (commercial API), and the newer Flux.2 Dev (32B parameters, up to 4MP output, multi-reference editing without fine-tuning). Flux.2 Klein offers sub-second inference in 4B (~8GB VRAM, Apache 2.0) and 9B variants. Specialized models include Kontext (in-context editing, character consistency), Fill (inpainting/outpainting), Redux (image variations), and Canny/Depth (structural conditioning). Flux excels at text rendering in images, strong prompt adherence, and consistent multi-scene generation. VRAM ranges from ~8GB (Klein 4B) to 64GB+ (Flux.2 Dev native), with quantized options available. Key parameters: guidance_scale 3.5-4.0, resolution up to 4MP for Flux.2. Primary uses include text-to-image, iterative editing, style transfer, and structural conditioning.
|
||||
75
apps/hub/knowledge/models/gemini.md
Normal file
@@ -0,0 +1,75 @@
|
||||
# Gemini
|
||||
|
||||
Gemini is Google DeepMind's multimodal AI model family with native image generation, editing, and video generation capabilities, accessible in ComfyUI through API nodes.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Gemini 3 Pro Image Preview
|
||||
|
||||
- Most capable Gemini image model with advanced reasoning
|
||||
- Complex multi-turn image generation and editing
|
||||
- Up to 14 input images, native 4K output
|
||||
- Also known as Nano Banana Pro
|
||||
- Model ID: `gemini-3-pro-image-preview`
|
||||
|
||||
### Gemini 2.5 Flash Image
|
||||
|
||||
- Cost-effective image generation optimized for speed and low latency
|
||||
- Character consistency, multi-image fusion, and prompt-based editing
|
||||
- $0.039 per image (1290 output tokens per image)
|
||||
- Model ID: `gemini-2.5-flash-image`
|
||||
|
||||
### Google Gemini (General)
|
||||
|
||||
- Multimodal model for text, image understanding, and generation
|
||||
- Interleaved text-and-image output in conversational context
|
||||
- Supports image input for analysis and editing tasks
|
||||
|
||||
### Veo 2
|
||||
|
||||
- Text-to-video and image-to-video generation
|
||||
- 8-second video clips at 720p resolution
|
||||
- Realistic physics simulation and cinematic styles
|
||||
- Supports 16:9 and 9:16 aspect ratios
|
||||
- Model ID: `veo-2.0-generate-001`
|
||||
|
||||
### Veo 3 / 3.1
|
||||
|
||||
- Latest video generation with native audio (dialogue, SFX, ambient)
|
||||
- Up to 1080p and 4K resolution (Veo 3.1)
|
||||
- Style reference images for aesthetic control
|
||||
- 4, 6, or 8-second video duration options
|
||||
|
||||
## Key Features
|
||||
|
||||
- Native multimodal generation: text, images, and video in one model family
|
||||
- World knowledge from Google Search for factually accurate image generation
|
||||
- SynthID invisible watermarking on all generated content
|
||||
- Multi-image fusion and character consistency across generations
|
||||
- Clean text rendering across multiple languages
|
||||
- Prompt-based image editing without masks or complex workflows
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- No local GPU required — all models accessed via cloud API
|
||||
- Available through ComfyUI API nodes, Google AI Studio, and Vertex AI
|
||||
- Requires API key and network access
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Text-to-image and image editing via API nodes
|
||||
- Multi-turn conversational image generation
|
||||
- Video generation from text prompts or reference images
|
||||
- Product animation and social media video content
|
||||
- Style-consistent character and brand asset generation
|
||||
- Text rendering and translation in images
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description for generation or editing
|
||||
- **aspect_ratio**: 1:1, 3:4, 4:3, 9:16, 16:9, 21:9 (images); 16:9, 9:16 (video)
|
||||
- **temperature**: 0.0-2.0 (default 1.0 for image models)
|
||||
- **durationSeconds**: 4-8 seconds for Veo models
|
||||
- **sampleCount**: 1-4 output videos per request
|
||||
- **seed**: Integer for reproducible generation
|
||||
- **personGeneration**: Safety control — `allow_adult`, `dont_allow`, or `allow_all`
|
||||
62
apps/hub/knowledge/models/gpt-image-1.md
Normal file
@@ -0,0 +1,62 @@
|
||||
# GPT-Image-1
|
||||
|
||||
GPT-Image-1 is OpenAI's natively multimodal image generation model, capable of generating and editing images from text and image inputs. It is accessed in ComfyUI through API nodes.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### GPT-Image-1.5
|
||||
|
||||
- Latest and most advanced GPT Image model
|
||||
- Best overall quality with superior instruction following
|
||||
- High input fidelity for the first 5 input images
|
||||
- Supports generate vs. edit action control
|
||||
- Multi-turn editing via the Responses API
|
||||
|
||||
### GPT-Image-1
|
||||
|
||||
- Production-grade image generation and editing
|
||||
- High input fidelity for the first input image
|
||||
- Supports up to 16 input images for editing
|
||||
- Up to 10 images per generation request
|
||||
|
||||
### GPT-Image-1-Mini
|
||||
|
||||
- Cost-effective variant for lower quality requirements
|
||||
- Same API surface as GPT-Image-1
|
||||
- Suitable for rapid prototyping and high-volume workloads
|
||||
|
||||
## Key Features
|
||||
|
||||
- Superior text rendering in generated images
|
||||
- Real-world knowledge for accurate depictions
|
||||
- Transparent background support (PNG and WebP)
|
||||
- Mask-based inpainting with prompt guidance
|
||||
- Multi-image editing: combine up to 16 reference images
|
||||
- Streaming partial image output during generation
|
||||
- Content moderation with adjustable strictness
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- No local GPU required — cloud API service via OpenAI
|
||||
- Accessed through ComfyUI API nodes
|
||||
- Requires OpenAI API key and organization verification
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Text-to-image generation with detailed prompts
|
||||
- Image editing and compositing from multiple references
|
||||
- Product photography and mockup generation
|
||||
- Inpainting with mask-guided editing
|
||||
- Transparent asset generation (stickers, logos, icons)
|
||||
- Multi-turn iterative image refinement
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description up to 32,000 characters
|
||||
- **size**: `1024x1024`, `1536x1024` (landscape), `1024x1536` (portrait), or `auto`
|
||||
- **quality**: `low`, `medium`, `high`, or `auto` (affects cost and detail)
|
||||
- **n**: Number of images to generate (1-10)
|
||||
- **background**: `transparent`, `opaque`, or `auto`
|
||||
- **output_format**: `png`, `jpeg`, or `webp`
|
||||
- **moderation**: `auto` (default) or `low` (less restrictive)
|
||||
- **input_fidelity**: `low` (default) or `high` for preserving input image details
|
||||
56
apps/hub/knowledge/models/grok.md
Normal file
@@ -0,0 +1,56 @@
|
||||
# Grok (Aurora)
|
||||
|
||||
Aurora is xAI's autoregressive image generation model integrated into Grok, excelling at photorealistic rendering and precise text instruction following.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### grok-2-image-1212
|
||||
|
||||
- API-accessible image generation model
|
||||
- Generates multiple images from text prompts
|
||||
- $0.07 per generated image
|
||||
- OpenAI and Anthropic SDK compatible
|
||||
|
||||
### Aurora (Consumer)
|
||||
|
||||
- Autoregressive mixture-of-experts network
|
||||
- Trained on billions of text and image examples
|
||||
- Available via Grok on X platform, web, iOS, and Android
|
||||
|
||||
### Grok Imagine
|
||||
|
||||
- Video and image generation model
|
||||
- State-of-the-art quality across cost and latency tiers
|
||||
- API available since January 2026
|
||||
|
||||
## Key Features
|
||||
|
||||
- Photorealistic image generation from text prompts
|
||||
- Precise text rendering within images
|
||||
- Accurate rendering of real-world entities, logos, and text
|
||||
- Image editing via uploaded photos with text instructions
|
||||
- Multi-image generation per request
|
||||
- Native multimodal input support
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API-based (no local GPU required)
|
||||
- All generation runs on xAI infrastructure
|
||||
- API access via console.x.ai
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Photorealistic image generation
|
||||
- Text and logo rendering in images
|
||||
- Image editing and style transfer
|
||||
- Meme and social media content creation
|
||||
- Product visualization
|
||||
- Character and portrait generation
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of desired image
|
||||
- **model**: Model identifier (grok-2-image-1212)
|
||||
- **n**: Number of images to generate
|
||||
- **response_format**: Output format (url or b64_json)
|
||||
- **size**: Image dimensions
|
||||
55
apps/hub/knowledge/models/hidream.md
Normal file
@@ -0,0 +1,55 @@
|
||||
# HiDream-I1
|
||||
|
||||
HiDream-I1 is a 17B parameter image generation foundation model by HiDream.ai that achieves state-of-the-art quality using a sparse diffusion transformer architecture.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### HiDream-I1 Full
|
||||
|
||||
- Full 17B parameter sparse diffusion transformer
|
||||
- Uses Llama-3.1-8B-Instruct and T5-XXL as text encoders
|
||||
- VAE from FLUX.1 Schnell, MIT license
|
||||
|
||||
### HiDream-I1 Dev
|
||||
|
||||
- Distilled variant, faster inference with minor quality tradeoff
|
||||
|
||||
### HiDream-I1 Fast
|
||||
|
||||
- Further distilled for maximum speed, best for rapid prototyping
|
||||
|
||||
### HiDream-E1
|
||||
|
||||
- Instruction-based image editing model
|
||||
|
||||
## Key Features
|
||||
|
||||
- State-of-the-art HPS v2.1 score (33.82), surpassing Flux.1-dev, DALL-E 3, and Midjourney V6
|
||||
- Best-in-class prompt following on GenEval (0.83) and DPG-Bench (85.89)
|
||||
- Multiple output styles: photorealistic, cartoon, artistic, and more
|
||||
- Dual text encoding with Llama-3.1-8B-Instruct and T5-XXL for strong prompt adherence
|
||||
- MIT license for commercial use
|
||||
- Requires Flash Attention for optimal performance
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: 24GB VRAM (Full model), Dev and Fast variants run on lower VRAM
|
||||
- Recommended: 40GB+ VRAM for Full model at high resolution
|
||||
- CUDA 12.4+ recommended for Flash Attention
|
||||
- Llama-3.1-8B-Instruct weights downloaded automatically
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- High-fidelity text-to-image generation
|
||||
- Photorealistic image creation
|
||||
- Artistic and stylized illustrations
|
||||
- Instruction-based image editing (E1 variant)
|
||||
- Commercial image generation
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **model_type**: Variant selection (full, dev, fast)
|
||||
- **steps**: Inference steps (varies by variant; fewer for fast/dev)
|
||||
- **cfg_scale**: Guidance scale for prompt adherence
|
||||
- **resolution**: Output image dimensions
|
||||
- **prompt**: Detailed text description of desired image
|
||||
51
apps/hub/knowledge/models/hitpaw.md
Normal file
@@ -0,0 +1,51 @@
|
||||
# HitPaw
|
||||
|
||||
HitPaw is an AI-powered visual enhancement platform providing image and video upscaling, restoration, and denoising through dedicated API services and desktop applications.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### HitPaw Image Enhancer
|
||||
|
||||
- AI-powered photo enhancement with super-resolution up to 8x
|
||||
- Face Clear Model: dual-model portrait upscaling (2x and 4x)
|
||||
- Face Natural Model: texture-preserving portrait enhancement
|
||||
- General Enhance Model: super-resolution for scenes and objects
|
||||
- High Fidelity Model: premium upscaling for DSLR and AIGC images
|
||||
- Generative Portrait/Enhance Models: diffusion-based restoration for heavily compressed images
|
||||
|
||||
### HitPaw Video Enhancer (VikPea)
|
||||
|
||||
- Frame-aware video restoration and ultra HD upscaling
|
||||
- Face Soft Model: face-optimized noise and blur reduction
|
||||
- Portrait Restore Model: multi-frame fusion for facial detail
|
||||
- General Restore Model: GAN-based restoration for broad scenarios
|
||||
- Ultra HD Model: premium upscaling from HD to ultra HD
|
||||
- Generative Model: diffusion-driven repair for low-resolution video
|
||||
|
||||
## Key Features
|
||||
|
||||
- One-click portrait and scene enhancement
|
||||
- Dual-model face and background processing pipelines
|
||||
- Batch processing and API access for automated workflows
|
||||
- Support for 30+ video input formats and 5 export formats
|
||||
- Multi-frame face restoration for temporal consistency in video
|
||||
- Denoising models for mobile and camera images
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API available (no local GPU required)
|
||||
- Desktop apps for Windows, Mac, Android, and iOS
|
||||
- API integration via HTTP-based interface
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Upscaling AI-generated images to publication quality
|
||||
- Restoring old or low-resolution photos and videos
|
||||
- Enhancing portrait and landscape photography
|
||||
- Video quality improvement for content creators
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **model**: Select enhancement model per content type
|
||||
- **scale**: 2x or 4x super-resolution options (image enhancement supports up to 8x)
|
||||
- **format**: Output format selection (mp4, mov, mkv, m4v, avi for video)
|
||||
47
apps/hub/knowledge/models/humo.md
Normal file
@@ -0,0 +1,47 @@
|
||||
# HuMo
|
||||
|
||||
HuMo is a human-centric video generation model by ByteDance that produces videos from collaborative multi-modal conditioning using text, image, and audio inputs.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### HuMo (Wan2.1-T2V-1.3B based)
|
||||
|
||||
- Built on the Wan2.1-T2V-1.3B video foundation model
|
||||
- Supports Text+Image (TI), Text+Audio (TA), and Text+Image+Audio (TIA) modes
|
||||
- Two-stage training: subject preservation then audio-visual sync
|
||||
|
||||
## Key Features
|
||||
|
||||
- Multi-modal conditioning: text, reference images, and audio simultaneously
|
||||
- Subject identity preservation from reference images across frames
|
||||
- Audio-driven lip synchronization with facial expression alignment
|
||||
- Focus-by-predicting strategy for facial region attention during audio sync
|
||||
- Time-adaptive guidance dynamically adjusts input weights across denoising steps
|
||||
- Minimal-invasive image injection maintains base model prompt understanding
|
||||
- Progressive two-stage training separates identity learning from audio sync
|
||||
- Supports text-controlled appearance editing while preserving identity
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: 24GB VRAM (RTX 3090/4090 or similar)
|
||||
- Multi-GPU inference supported via FSDP and sequence parallelism
|
||||
- Whisper-large-v3 audio encoder required for audio modes
|
||||
- Optional audio separator for cleaner speech input
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Digital avatar and virtual presenter creation
|
||||
- Audio-driven talking head generation
|
||||
- Character-consistent video clips from reference photos
|
||||
- Lip-synced dialogue video from audio tracks
|
||||
- Prompted reenactment with identity preservation
|
||||
- Text-controlled outfit and style changes on consistent subjects
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **mode**: Generation mode (TI, TA, or TIA)
|
||||
- **scale_t**: Text guidance strength (default: 7.5)
|
||||
- **scale_a**: Audio guidance strength (default: 2.0)
|
||||
- **frames**: Number of output frames (97 at 25 FPS = ~4 seconds)
|
||||
- **height/width**: Output resolution (480p or 720p supported)
|
||||
- **steps**: Denoising steps (30-50 recommended)
|
||||
75
apps/hub/knowledge/models/hunyuan.md
Normal file
@@ -0,0 +1,75 @@
|
||||
# Hunyuan
|
||||
|
||||
Hunyuan is Tencent's family of open-source generative models spanning text-to-image, text-to-video, and 3D asset generation.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Hunyuan-DiT
|
||||
|
||||
- Text-to-image diffusion transformer with native Chinese and English support
|
||||
- 1.5B parameter DiT architecture, native 1024x1024 resolution
|
||||
- Bilingual text encoder for strong CJK text rendering in images
|
||||
- v1.2 is the latest version with improved quality
|
||||
|
||||
### HunyuanVideo
|
||||
|
||||
- Large-scale text-to-video and image-to-video generation model
|
||||
- 13B+ parameters, the largest open-source video generation model
|
||||
- Dual-stream to single-stream transformer architecture with full attention
|
||||
- MLLM text encoder (decoder-only LLM) for better instruction following
|
||||
- Causal 3D VAE with 4x temporal, 8x spatial, 16x channel compression
|
||||
- Generates 720p video (1280x720) at up to 129 frames (~5s at 24fps)
|
||||
- FP8 quantized weights available to reduce memory by ~10GB
|
||||
- Outperforms Runway Gen-3, Luma 1.6 in professional evaluations
|
||||
- 3 workflow templates available
|
||||
|
||||
### Hunyuan3D 2.0
|
||||
|
||||
- Image-to-3D and text-to-3D asset generation system
|
||||
- Two-stage pipeline: Hunyuan3D-DiT (shape) + Hunyuan3D-Paint (texture)
|
||||
- Flow-based diffusion transformer for geometry generation
|
||||
- High-resolution texture synthesis with geometric and diffusion priors
|
||||
- Outputs textured meshes in GLB/OBJ format
|
||||
- Outperforms both open and closed-source 3D generation models
|
||||
- 7 workflow templates available
|
||||
|
||||
## Key Features
|
||||
|
||||
- Native bilingual support (Chinese and English) across the family
|
||||
- Strong text rendering in generated images (Hunyuan-DiT)
|
||||
- State-of-the-art video generation quality (HunyuanVideo)
|
||||
- End-to-end 3D asset creation with texturing (Hunyuan3D)
|
||||
- Multi-resolution generation across all model types
|
||||
- Prompt rewrite system for improved generation quality (HunyuanVideo)
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Hunyuan-DiT: 11GB VRAM minimum (fp16), 16GB recommended
|
||||
- HunyuanVideo 540p (544x960): 45GB VRAM minimum
|
||||
- HunyuanVideo 720p (720x1280): 60GB VRAM minimum, 80GB recommended
|
||||
- HunyuanVideo FP8: Saves ~10GB compared to fp16 weights
|
||||
- Hunyuan3D 2.0: 16-24GB VRAM for shape + texture pipeline
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Bilingual content creation and marketing materials
|
||||
- Asian-style artwork and illustrations
|
||||
- Text-in-image generation (Chinese/English)
|
||||
- High-quality video generation from text or image prompts
|
||||
- 3D asset creation for games, design, and prototyping
|
||||
- Textured mesh generation from reference images
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **steps**: 25-50 for Hunyuan-DiT (default 40), 50 for HunyuanVideo
|
||||
- **cfg_scale**: 5-8 for DiT (6 typical), 6.0 embedded for HunyuanVideo
|
||||
- **flow_shift**: 7.0 for HunyuanVideo flow matching scheduler
|
||||
- **video_length**: 129 frames for HunyuanVideo (~5s at 24fps)
|
||||
- **resolution**: 1024x1024 for DiT, 720x1280 or 544x960 for video
|
||||
- **negative_prompt**: Recommended for Hunyuan-DiT quality control
|
||||
|
||||
## Blog References
|
||||
|
||||
- [HunyuanVideo Native Support](../blog/hunyuanvideo-native-support.md) — 13B parameter video model, dual-stream transformer, MLLM text encoder
|
||||
- [HunyuanVideo 1.5 Native Support](../blog/hunyuanvideo-15-native-support.md) — Lightweight 8.3B model, 720p output, runs on 24GB consumer GPUs
|
||||
- [Hunyuan3D 2.0 and MultiView Native Support](../blog/hunyuan3d-20-native-support.md) — 3D model generation with PBR materials, 1.1B parameter multi-view model
|
||||
1
apps/hub/knowledge/models/hunyuan.summary.md
Normal file
@@ -0,0 +1 @@
|
||||
Hunyuan is Tencent's open-source generative model family spanning text-to-image, text-to-video, and 3D generation. Hunyuan-DiT is a 1.5B parameter text-to-image model with native Chinese and English support and strong CJK text rendering at 1024x1024 (11-16GB VRAM). HunyuanVideo is the largest open-source video model at 13B+ parameters, generating 720p video up to 129 frames (~5s at 24fps) using a dual-stream transformer with MLLM text encoder; it requires 45-80GB VRAM depending on resolution (FP8 saves ~10GB). Hunyuan3D 2.0 handles image-to-3D and text-to-3D generation via a two-stage pipeline producing textured GLB/OBJ meshes (16-24GB VRAM). Key strengths: bilingual content creation, state-of-the-art video quality surpassing Runway Gen-3, and end-to-end 3D asset creation. Typical parameters: 25-50 steps for DiT, 50 steps for video, cfg_scale 5-8.
|
||||
52
apps/hub/knowledge/models/ideogram.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# Ideogram
|
||||
|
||||
Ideogram is an AI image generation platform founded by former Google Brain researchers, known for industry-leading text rendering accuracy in generated images. It achieves approximately 90% text rendering accuracy compared to roughly 30% for competing tools.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Ideogram 3.0
|
||||
|
||||
- Latest generation released March 2025
|
||||
- Highest ELO rating in human evaluations across diverse prompts
|
||||
- Style References support with up to 3 reference images
|
||||
- Random style feature with 4.3 billion style presets
|
||||
- Batch generation for scaled content production
|
||||
|
||||
### Ideogram 2.0
|
||||
|
||||
- Previous generation model
|
||||
- Available as alternative option in the platform
|
||||
- Solid text rendering and general image quality
|
||||
|
||||
## Key Features
|
||||
|
||||
- Best-in-class text rendering with accurate typography and spelling
|
||||
- Handles complex, multi-line text compositions and curved surfaces
|
||||
- Style modes: Realistic, Anime, 3D, Watercolor, Typography
|
||||
- Magic Prompt for automatic prompt enhancement
|
||||
- Canvas editing for post-generation refinement
|
||||
- Upscaler up to 8K resolution in 2x increments
|
||||
- Color palette control for brand consistency
|
||||
- API available for programmatic integration
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API only (no local GPU required)
|
||||
- API pricing at approximately $0.06 per image
|
||||
- Web interface with credit-based subscription plans
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Marketing materials with branded text and logos
|
||||
- Social media graphics with text overlays
|
||||
- Product packaging and label design
|
||||
- Event posters, flyers, and invitations
|
||||
- Book covers and editorial design
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description with quoted text for typography
|
||||
- **model**: Version selection (2.0 or 3.0)
|
||||
- **style**: Realistic, Anime, 3D, Watercolor, Typography
|
||||
- **aspect_ratio**: 16 aspect ratio options available
|
||||
- **magic_prompt**: Toggle for automatic prompt enhancement
|
||||
51
apps/hub/knowledge/models/kandinsky.md
Normal file
@@ -0,0 +1,51 @@
|
||||
# Kandinsky
|
||||
|
||||
Kandinsky is a family of open-source diffusion models for video and image generation, developed by Kandinsky Lab (Sber AI, Russia). The models support both English and Russian text prompts.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Kandinsky 5.0 Video Pro (19B)
|
||||
|
||||
- HD video at 1280x768, 24fps (5 or 10 seconds)
|
||||
- Controllable camera motion via LoRA
|
||||
- Top-1 open-source T2V model on LMArena
|
||||
|
||||
### Kandinsky 5.0 Video Lite (2B)
|
||||
|
||||
- Lightweight model, #1 among open-source in its class
|
||||
- CFG-distilled (2x faster) and diffusion-distilled (6x faster) variants
|
||||
- Best Russian concept understanding in open source
|
||||
|
||||
### Kandinsky 5.0 Image Lite (6B)
|
||||
|
||||
- HD image output (1280x768, 1024x1024)
|
||||
- Strong text rendering; image editing variant available
|
||||
|
||||
## Key Features
|
||||
|
||||
- Bilingual support (English and Russian prompts)
|
||||
- Flow Matching architecture with MIT license
|
||||
- Camera control via trained LoRAs
|
||||
- ComfyUI and Diffusers integration
|
||||
- MagCache acceleration for faster inference
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Video Lite: 12GB VRAM minimum with optimizations
|
||||
- Video Pro: 24GB+ VRAM recommended
|
||||
- NF4 quantization and FlashAttention 2/3 or SDPA supported
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Open-source video generation research
|
||||
- Russian and English bilingual content creation
|
||||
- Camera-controlled video synthesis
|
||||
- Image generation with text rendering
|
||||
- Fine-tuning with custom LoRAs
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description in English or Russian
|
||||
- **num_frames**: Number of output frames (5s or 10s)
|
||||
- **resolution**: Output resolution (up to 1280x768)
|
||||
- **steps**: Inference steps (varies by distillation level)
|
||||
64
apps/hub/knowledge/models/kling.md
Normal file
@@ -0,0 +1,64 @@
|
||||
# Kling
|
||||
|
||||
Kling is a video and image generation platform developed by Kuaishou Technology. It offers text-to-video, image-to-video, video editing, audio generation, and virtual try-on capabilities through both a creative studio and a developer API.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Kling O1
|
||||
|
||||
- First unified multimodal video model combining generation and editing
|
||||
- Built on Multimodal Visual Language (MVL) framework
|
||||
- Accepts text, image, video, and subject inputs in a single prompt
|
||||
- Supports video inpainting, outpainting, style re-rendering, and shot extension
|
||||
- Character and scene consistency via Element Library with director-like memory
|
||||
- Generates 3-10 second videos at up to 2K resolution
|
||||
|
||||
### Kling 2.6
|
||||
|
||||
- Simultaneous audio-visual generation in a single pass
|
||||
- Produces video with speech, sound effects, and ambient sounds together
|
||||
- Supports Chinese and English voice generation
|
||||
- Video content up to 10 seconds with synchronized audio
|
||||
- Deep semantic alignment between audio and visual dynamics
|
||||
|
||||
### Kling (Base Models)
|
||||
|
||||
- Text-to-video and image-to-video with Standard and Professional modes
|
||||
- Multi-image-to-video with multiple reference inputs
|
||||
- Camera control with 6 basic movements and 4 master shots
|
||||
- Video extension, lip-sync, and avatar generation
|
||||
- Start and end frame generation for controlled transitions
|
||||
|
||||
## Key Features
|
||||
|
||||
- Unified generation and editing in a single model (O1)
|
||||
- Simultaneous audio-visual generation (2.6)
|
||||
- Multi-subject consistency across shots and angles
|
||||
- Conversational editing via natural language prompts
|
||||
- Video effects center for special effects and transformations
|
||||
- Virtual try-on and image recognition capabilities
|
||||
- DeepSeek integration for prompt optimization
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API only; no local hardware required
|
||||
- Accessed via klingai.com creative studio or API platform
|
||||
- Standard and Professional generation modes (speed vs. quality tradeoff)
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Film and television pre-production and shot generation
|
||||
- Social media content creation with audio
|
||||
- E-commerce product videos and virtual try-on
|
||||
- Advertising with one-click ad generation
|
||||
- Video post-production editing via text prompts
|
||||
- Multi-character narrative video creation
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description with positive and negative prompts
|
||||
- **mode**: Standard (fast) or Professional (high quality)
|
||||
- **duration**: Video length (3-10 seconds for O1, up to 10s for 2.6)
|
||||
- **aspect_ratio**: Width-to-height ratio for output
|
||||
- **camera_control**: Predefined camera movements and master shots
|
||||
- **creativity_strength**: Balance between reference fidelity and creative variation
|
||||
68
apps/hub/knowledge/models/ltx-video.md
Normal file
@@ -0,0 +1,68 @@
|
||||
# LTX-Video
|
||||
|
||||
LTX-Video is Lightricks' open-source DiT-based video generation model, the first capable of generating high-quality videos in real-time.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### LTX-Video 2 (v0.9.7/v0.9.8)
|
||||
|
||||
- Major quality upgrade over the original release
|
||||
- Available in 2B and 13B parameter sizes
|
||||
- 13B dev: highest quality, requires more VRAM
|
||||
- 13B distilled: faster inference, fewer steps needed, slight quality trade-off
|
||||
- 2B distilled: lightweight option for lower VRAM usage
|
||||
- FP8 quantized versions available for all sizes (13B-dev, 13B-distilled, 2B-distilled)
|
||||
- Multi-condition generation: condition on multiple images or video segments at specific frames
|
||||
- Spatial and temporal upscaler models for enhanced resolution and frame rate
|
||||
- ICLoRA adapters for depth, pose, and canny edge conditioning
|
||||
- 9 workflow templates available
|
||||
|
||||
### LTX-Video 0.9.1/0.9.6
|
||||
|
||||
- Original public releases with 2B parameter DiT architecture
|
||||
- Text-to-video and image-to-video modes
|
||||
- 768x512 native resolution at 24fps
|
||||
- 0.9.6 distilled variant: 15x faster, real-time capable, no CFG required
|
||||
- Foundation for community fine-tunes
|
||||
|
||||
## Key Features
|
||||
|
||||
- Real-time video generation on high-end GPUs (first DiT model to achieve this)
|
||||
- Generates 30 FPS video at 1216x704 resolution faster than playback speed
|
||||
- Multi-condition generation with per-frame image/video conditioning and strength control
|
||||
- Temporal VAE for smooth, consistent motion
|
||||
- Multi-scale rendering pipeline mixing dev and distilled models for speed-quality balance
|
||||
- Latent upsampling pipeline for progressive resolution enhancement
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- 2B model: 12GB VRAM minimum, 16GB recommended
|
||||
- 2B distilled FP8: 8-10GB VRAM
|
||||
- 13B model: 24-32GB VRAM (fp16)
|
||||
- 13B FP8: 16-20GB VRAM
|
||||
- 13B distilled: less VRAM than 13B dev, ideal for rapid iterations
|
||||
- 32GB+ system RAM recommended for all variants
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Short-form video content and social media clips
|
||||
- Image-to-video animation from reference frames
|
||||
- Video-to-video transformation and extension
|
||||
- Multi-condition video generation (start/end frame, keyframes)
|
||||
- Depth, pose, and edge-conditioned video generation via ICLoRA
|
||||
- Rapid video prototyping and creative experimentation
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **num_frames**: Output frame count (divisible by 8 + 1, e.g. 97, 161, 257)
|
||||
- **steps**: 30-50 for dev models, 8-15 for distilled variants
|
||||
- **cfg_scale**: 3-5 typical for dev, not required for distilled
|
||||
- **width/height**: Divisible by 32, best under 720x1280 for 13B
|
||||
- **denoise_strength**: 0.3-0.5 when using latent upsampler refinement pass
|
||||
- **conditioning_strength**: Per-condition strength for multi-condition generation (default 1.0)
|
||||
- **seed**: For reproducible generation
|
||||
|
||||
## Blog References
|
||||
|
||||
- [LTX-Video 0.9.5 Day-1 Support](../blog/ltx-video-095-support.md) — Commercial license (OpenRail-M), multi-frame control, improved quality
|
||||
- [LTX-2: Open Source Audio-Video AI](../blog/ltx-2-open-source-audio-video.md) — Synchronized audio-video generation, NVFP4 for 3x speed / 60% less VRAM
|
||||
1
apps/hub/knowledge/models/ltx-video.summary.md
Normal file
@@ -0,0 +1 @@
|
||||
LTX-Video is Lightricks' open-source DiT-based video generation model, the first to achieve real-time video generation. LTX-Video 2 (v0.9.7/0.9.8) is available in 2B and 13B parameter sizes, with dev, distilled, and FP8 quantized variants. It supports multi-condition generation with per-frame image/video conditioning, spatial and temporal upscalers, and ICLoRA adapters for depth, pose, and canny conditioning. The 2B model needs 12-16GB VRAM (8-10GB FP8), while the 13B model requires 24-32GB (16-20GB FP8). It generates 30fps video at 1216x704 faster than playback speed. Earlier versions (0.9.1/0.9.6) established the 2B foundation with a 15x faster distilled variant. Primary uses: short-form video, image-to-video animation, video extension, and multi-condition keyframe generation. Key parameters: 30-50 steps for dev, 8-15 for distilled, cfg_scale 3-5, frames divisible by 8+1.
|
||||
50
apps/hub/knowledge/models/luma.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# Luma
|
||||
|
||||
Luma AI develops video and image generation models through its Dream Machine platform, powered by the Ray model family and Photon image model.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Ray3 / Ray3.14
|
||||
|
||||
- Native 1080p video with reasoning-driven generation
|
||||
- World's first native 16-bit HDR video generation
|
||||
- Character reference, Modify Video, and Draft Mode (5x faster)
|
||||
|
||||
### Ray2
|
||||
|
||||
- Production-ready text-to-video and image-to-video
|
||||
- 5-9 second output at 24fps with coherent motion
|
||||
|
||||
### Photon
|
||||
|
||||
- Image generation with strong prompt following
|
||||
- Character and visual reference support
|
||||
- 1080p output at $0.016 per image
|
||||
|
||||
## Key Features
|
||||
|
||||
- Reasoning capability for understanding creative intent
|
||||
- Visual annotation for precise layout and motion control
|
||||
- HDR generation with 16-bit EXR export for pro workflows
|
||||
- Keyframe control, video extension, looping, and camera control
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- API-only access via Luma AI API
|
||||
- No local hardware requirements
|
||||
- Available through Dream Machine web and iOS app
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Cinematic video production and storytelling
|
||||
- Commercial advertising and product videos
|
||||
- Visual effects with Modify Video workflows
|
||||
- HDR content for professional post-production
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description for video generation
|
||||
- **keyframes**: Start and/or end frame images
|
||||
- **aspect_ratio**: Output dimensions and ratio
|
||||
- **loop**: Enable seamless looping
|
||||
- **camera_control**: Camera movement via text instructions
|
||||
47
apps/hub/knowledge/models/magnific.md
Normal file
@@ -0,0 +1,47 @@
|
||||
# Magnific
|
||||
|
||||
Magnific is an AI-powered image upscaler and enhancer that uses generative AI to hallucinate new details and textures during the upscaling process.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Magnific Creative Upscaler
|
||||
|
||||
- Generative upscaling up to 16x (max 10,000px per dimension)
|
||||
- AI engines: Illusio (illustration), Sharpy (photography), Sparkle (balanced)
|
||||
- Adds hallucinated details guided by text prompts
|
||||
|
||||
### Magnific Precision Upscaler
|
||||
|
||||
- Faithful high-fidelity upscaling without creative reinterpretation
|
||||
- Clean enlargement that stays true to the source image
|
||||
|
||||
### Mystic Image Generator
|
||||
|
||||
- Photorealistic text-to-image/image-to-image with LoRA styles at up to 4K
|
||||
|
||||
## Key Features
|
||||
|
||||
- Creativity slider controls AI-hallucinated detail level
|
||||
- HDR control for micro-contrast and crispness
|
||||
- Resemblance slider to balance fidelity vs. creative enhancement
|
||||
- Optimized modes for portraits, illustrations, video games, and film
|
||||
- API hosted on Freepik with Skin Enhancer endpoint
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud-only service with no local hardware requirements
|
||||
- API available through Freepik's developer platform
|
||||
- Subscription-based with credit system
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Upscaling AI-generated images for print and production
|
||||
- Enhancing low-resolution concept art and illustrations
|
||||
- Restoring old or compressed photographs
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- Creativity: level of new detail hallucination (0-10)
|
||||
- HDR: micro-contrast and sharpness (-10 to 10)
|
||||
- Resemblance: fidelity to source image (-10 to 10)
|
||||
- Scale Factor: 2x, 4x, 8x, or 16x magnification
|
||||
49
apps/hub/knowledge/models/meshy.md
Normal file
@@ -0,0 +1,49 @@
|
||||
# Meshy
|
||||
|
||||
Meshy is a popular AI 3D model generator enabling text-to-3D and image-to-3D creation with PBR textures and production-ready exports.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Meshy-6
|
||||
|
||||
- Latest generation with highest quality geometry
|
||||
- Supports symmetry and pose control (A-pose, T-pose)
|
||||
- Configurable polygon counts up to 300,000
|
||||
|
||||
### Meshy-5
|
||||
|
||||
- Previous generation with art style support
|
||||
- Realistic and sculpture style options
|
||||
|
||||
## Key Features
|
||||
|
||||
- Text-to-3D with two-stage workflow (preview mesh, then refine textures)
|
||||
- Image-to-3D from photos, sketches, or illustrations
|
||||
- Multi-image input for multi-view reconstruction
|
||||
- AI texturing with PBR maps (diffuse, roughness, metallic, normal)
|
||||
- Automatic rigging and 500+ animation motion library
|
||||
- Smart remesh with quad or triangle topology control
|
||||
- Export in FBX, GLB, OBJ, STL, 3MF, USDZ, BLEND formats
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API-based (no local GPU required)
|
||||
- All generation runs on Meshy servers
|
||||
- API available on Pro tier and above
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Game development asset creation
|
||||
- 3D printing and prototyping
|
||||
- Film and VFX previsualization
|
||||
- VR/AR content development
|
||||
- Product design and e-commerce
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description up to 600 characters
|
||||
- **ai_model**: Model version (meshy-5, meshy-6, latest)
|
||||
- **topology**: Mesh type (quad or triangle)
|
||||
- **target_polycount**: 100 to 300,000 polygons
|
||||
- **enable_pbr**: Generate PBR material maps
|
||||
- **pose_mode**: Character pose (a-pose, t-pose, or none)
|
||||
58
apps/hub/knowledge/models/minimax.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# MiniMax
|
||||
|
||||
MiniMax is a multi-modal AI company known for the Hailuo video generation models and Image-01, offering API-based video and image creation.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Hailuo 2.3
|
||||
|
||||
- Latest video model with improved body movement and facial expressions
|
||||
- Supports anime, illustration, ink-wash, and game-CG styles
|
||||
- 768p or 1080p resolution, 6 or 10 second clips
|
||||
- Available in Quality and Fast variants
|
||||
|
||||
### Hailuo 2.0 (Hailuo 02)
|
||||
|
||||
- Native 1080p with Noise-aware Compute Redistribution (NCR)
|
||||
- 2.5x efficiency improvement over predecessors
|
||||
- Last-frame conditioning support
|
||||
|
||||
### Image-01
|
||||
|
||||
- Text-to-image generation with multiple output sizes
|
||||
|
||||
### T2V-01-Director
|
||||
|
||||
- Enhanced camera control with natural language commands
|
||||
- Pan, zoom, tracking shot, and shake directives
|
||||
|
||||
## Key Features
|
||||
|
||||
- Text-to-video and image-to-video generation
|
||||
- Up to 1080p resolution at 25fps
|
||||
- Video clips up to 10 seconds
|
||||
- Camera control with natural language commands
|
||||
- Subject consistency with reference images
|
||||
- Text-to-image generation with Image-01
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API-based (no local GPU required)
|
||||
- All generation runs on MiniMax servers
|
||||
- API access via platform.minimax.io
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Social media video content creation
|
||||
- Cinematic short film production
|
||||
- Product advertising and e-commerce videos
|
||||
- Anime and illustrated content
|
||||
- Character-driven narrative scenes
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description for generation
|
||||
- **model**: Model selection (hailuo-2.3, hailuo-02, image-01)
|
||||
- **resolution**: Output resolution (768p or 1080p)
|
||||
- **duration**: Clip length (6 or 10 seconds for video)
|
||||
- **first_frame_image**: Reference image for image-to-video
|
||||
762
apps/hub/knowledge/models/model-triage.json
Normal file
@@ -0,0 +1,762 @@
|
||||
{
|
||||
"generated": "2026-02-07",
|
||||
"totalModels": 87,
|
||||
"categories": {
|
||||
"specific_model": [
|
||||
{
|
||||
"name": "Wan",
|
||||
"category": "specific_model",
|
||||
"templateCount": 36,
|
||||
"priority": 108,
|
||||
"docFile": "wan",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Nano Banana Pro",
|
||||
"category": "specific_model",
|
||||
"templateCount": 29,
|
||||
"priority": 87,
|
||||
"docFile": "nano-banana-pro",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Flux",
|
||||
"category": "specific_model",
|
||||
"templateCount": 24,
|
||||
"priority": 72,
|
||||
"docFile": "flux",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "SDXL",
|
||||
"category": "specific_model",
|
||||
"templateCount": 4,
|
||||
"priority": 12,
|
||||
"docFile": "sdxl",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "ACE-Step",
|
||||
"category": "specific_model",
|
||||
"templateCount": 7,
|
||||
"priority": 21,
|
||||
"docFile": "ace-step",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Seedance",
|
||||
"category": "specific_model",
|
||||
"templateCount": 6,
|
||||
"priority": 18,
|
||||
"docFile": "seedance",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Seedream",
|
||||
"category": "specific_model",
|
||||
"templateCount": 5,
|
||||
"priority": 15,
|
||||
"docFile": "seedream",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "HiDream",
|
||||
"category": "specific_model",
|
||||
"templateCount": 5,
|
||||
"priority": 15,
|
||||
"docFile": "hidream",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Stable Audio",
|
||||
"category": "specific_model",
|
||||
"templateCount": 4,
|
||||
"priority": 12,
|
||||
"docFile": "stable-audio",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Chatter Box",
|
||||
"category": "specific_model",
|
||||
"templateCount": 4,
|
||||
"priority": 12,
|
||||
"docFile": "chatterbox",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Z-Image-Turbo",
|
||||
"category": "specific_model",
|
||||
"templateCount": 4,
|
||||
"priority": 12,
|
||||
"docFile": "z-image",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Kandinsky",
|
||||
"category": "specific_model",
|
||||
"templateCount": 3,
|
||||
"priority": 9,
|
||||
"docFile": "kandinsky",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "OmniGen",
|
||||
"category": "specific_model",
|
||||
"templateCount": 3,
|
||||
"priority": 9,
|
||||
"docFile": "omnigen",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "SeedVR2",
|
||||
"category": "specific_model",
|
||||
"templateCount": 3,
|
||||
"priority": 9,
|
||||
"docFile": "seedvr2",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Chroma",
|
||||
"category": "specific_model",
|
||||
"templateCount": 2,
|
||||
"priority": 6,
|
||||
"docFile": "chroma",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "ChronoEdit",
|
||||
"category": "specific_model",
|
||||
"templateCount": 1,
|
||||
"priority": 3,
|
||||
"docFile": "chronoedit",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "HuMo",
|
||||
"category": "specific_model",
|
||||
"templateCount": 1,
|
||||
"priority": 3,
|
||||
"docFile": "humo",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "NewBie",
|
||||
"category": "specific_model",
|
||||
"templateCount": 1,
|
||||
"priority": 3,
|
||||
"docFile": "newbie",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Ovis-Image",
|
||||
"category": "specific_model",
|
||||
"templateCount": 1,
|
||||
"priority": 3,
|
||||
"docFile": "ovis-image",
|
||||
"hasExistingDoc": false
|
||||
}
|
||||
],
|
||||
"provider_name": [
|
||||
{
|
||||
"name": "Google",
|
||||
"category": "provider_name",
|
||||
"templateCount": 29,
|
||||
"priority": 0,
|
||||
"mapsTo": ["gemini", "veo", "nano-banana-pro"],
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "BFL",
|
||||
"category": "provider_name",
|
||||
"templateCount": 28,
|
||||
"priority": 0,
|
||||
"mapsTo": ["flux"],
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Stability",
|
||||
"category": "provider_name",
|
||||
"templateCount": 19,
|
||||
"priority": 0,
|
||||
"mapsTo": ["sdxl", "stable-audio", "reimagine"],
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "ByteDance",
|
||||
"category": "provider_name",
|
||||
"templateCount": 11,
|
||||
"priority": 0,
|
||||
"mapsTo": ["seedance", "seedvr2", "seedream"],
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "OpenAI",
|
||||
"category": "provider_name",
|
||||
"templateCount": 11,
|
||||
"priority": 0,
|
||||
"mapsTo": ["gpt-image-1"],
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Lightricks",
|
||||
"category": "provider_name",
|
||||
"templateCount": 9,
|
||||
"priority": 0,
|
||||
"mapsTo": ["ltx-video"],
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Tencent",
|
||||
"category": "provider_name",
|
||||
"templateCount": 5,
|
||||
"priority": 0,
|
||||
"mapsTo": ["hunyuan"],
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Qwen",
|
||||
"category": "provider_name",
|
||||
"templateCount": 2,
|
||||
"priority": 0,
|
||||
"mapsTo": ["qwen"],
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Nvidia",
|
||||
"category": "provider_name",
|
||||
"templateCount": 1,
|
||||
"priority": 0,
|
||||
"mapsTo": [],
|
||||
"hasExistingDoc": false
|
||||
}
|
||||
],
|
||||
"api_only": [
|
||||
{
|
||||
"name": "Vidu",
|
||||
"category": "api_only",
|
||||
"templateCount": 10,
|
||||
"priority": 20,
|
||||
"docFile": "vidu",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Kling",
|
||||
"category": "api_only",
|
||||
"templateCount": 9,
|
||||
"priority": 18,
|
||||
"docFile": "kling",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Recraft",
|
||||
"category": "api_only",
|
||||
"templateCount": 6,
|
||||
"priority": 12,
|
||||
"docFile": "recraft",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Runway",
|
||||
"category": "api_only",
|
||||
"templateCount": 5,
|
||||
"priority": 10,
|
||||
"docFile": "runway",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Tripo",
|
||||
"category": "api_only",
|
||||
"templateCount": 5,
|
||||
"priority": 10,
|
||||
"docFile": "tripo",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "GPT-Image-1",
|
||||
"category": "api_only",
|
||||
"templateCount": 4,
|
||||
"priority": 8,
|
||||
"docFile": "gpt-image-1",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "MiniMax",
|
||||
"category": "api_only",
|
||||
"templateCount": 4,
|
||||
"priority": 8,
|
||||
"docFile": "minimax",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Grok",
|
||||
"category": "api_only",
|
||||
"templateCount": 4,
|
||||
"priority": 8,
|
||||
"docFile": "grok",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Luma",
|
||||
"category": "api_only",
|
||||
"templateCount": 4,
|
||||
"priority": 8,
|
||||
"docFile": "luma",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Moonvalley",
|
||||
"category": "api_only",
|
||||
"templateCount": 4,
|
||||
"priority": 8,
|
||||
"docFile": "moonvalley",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Topaz",
|
||||
"category": "api_only",
|
||||
"templateCount": 4,
|
||||
"priority": 8,
|
||||
"docFile": "topaz",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "PixVerse",
|
||||
"category": "api_only",
|
||||
"templateCount": 3,
|
||||
"priority": 6,
|
||||
"docFile": "pixverse",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Meshy",
|
||||
"category": "api_only",
|
||||
"templateCount": 3,
|
||||
"priority": 6,
|
||||
"docFile": "meshy",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Rodin",
|
||||
"category": "api_only",
|
||||
"templateCount": 3,
|
||||
"priority": 6,
|
||||
"docFile": "rodin",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Magnific",
|
||||
"category": "api_only",
|
||||
"templateCount": 3,
|
||||
"priority": 6,
|
||||
"docFile": "magnific",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "WaveSpeed",
|
||||
"category": "api_only",
|
||||
"templateCount": 3,
|
||||
"priority": 6,
|
||||
"docFile": "wavespeed",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "BRIA",
|
||||
"category": "api_only",
|
||||
"templateCount": 2,
|
||||
"priority": 4,
|
||||
"docFile": "bria",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Veo",
|
||||
"category": "api_only",
|
||||
"templateCount": 2,
|
||||
"priority": 4,
|
||||
"docFile": "veo",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "HitPaw",
|
||||
"category": "api_only",
|
||||
"templateCount": 2,
|
||||
"priority": 4,
|
||||
"docFile": "hitpaw",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Z-Image",
|
||||
"category": "api_only",
|
||||
"templateCount": 1,
|
||||
"priority": 2,
|
||||
"docFile": "z-image",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Anima",
|
||||
"category": "api_only",
|
||||
"templateCount": 1,
|
||||
"priority": 2,
|
||||
"docFile": "anima",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Reimagine",
|
||||
"category": "api_only",
|
||||
"templateCount": 1,
|
||||
"priority": 2,
|
||||
"docFile": "reimagine",
|
||||
"mapsTo": ["stability"],
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Ideogram",
|
||||
"category": "api_only",
|
||||
"templateCount": 1,
|
||||
"priority": 2,
|
||||
"docFile": "ideogram",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Gemini3 Pro Image Preview",
|
||||
"category": "api_only",
|
||||
"templateCount": 16,
|
||||
"priority": 32,
|
||||
"docFile": "gemini",
|
||||
"hasExistingDoc": false
|
||||
}
|
||||
],
|
||||
"utility_model": [
|
||||
{
|
||||
"name": "SVD",
|
||||
"category": "utility_model",
|
||||
"templateCount": 1,
|
||||
"priority": 1,
|
||||
"docFile": "svd",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Real-ESRGAN",
|
||||
"category": "utility_model",
|
||||
"templateCount": 1,
|
||||
"priority": 1,
|
||||
"docFile": "real-esrgan",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Depth Anything v2",
|
||||
"category": "utility_model",
|
||||
"templateCount": 1,
|
||||
"priority": 1,
|
||||
"docFile": "depth-anything-v2",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "FlashVSR",
|
||||
"category": "utility_model",
|
||||
"templateCount": 1,
|
||||
"priority": 1,
|
||||
"docFile": "flashvsr",
|
||||
"hasExistingDoc": false
|
||||
}
|
||||
],
|
||||
"variant": [
|
||||
{
|
||||
"name": "Wan2.1",
|
||||
"category": "variant",
|
||||
"templateCount": 21,
|
||||
"priority": 0,
|
||||
"mapsTo": "wan",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Wan2.2",
|
||||
"category": "variant",
|
||||
"templateCount": 15,
|
||||
"priority": 0,
|
||||
"mapsTo": "wan",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen-Image-Edit",
|
||||
"category": "variant",
|
||||
"templateCount": 11,
|
||||
"priority": 0,
|
||||
"mapsTo": "qwen",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "LTX-2",
|
||||
"category": "variant",
|
||||
"templateCount": 9,
|
||||
"priority": 0,
|
||||
"mapsTo": "ltx-video",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen-Image",
|
||||
"category": "variant",
|
||||
"templateCount": 7,
|
||||
"priority": 0,
|
||||
"mapsTo": "qwen",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Hunyuan3D",
|
||||
"category": "variant",
|
||||
"templateCount": 7,
|
||||
"priority": 0,
|
||||
"mapsTo": "hunyuan",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Google Gemini Image",
|
||||
"category": "variant",
|
||||
"templateCount": 6,
|
||||
"priority": 0,
|
||||
"mapsTo": "gemini",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Flux.2 Klein",
|
||||
"category": "variant",
|
||||
"templateCount": 6,
|
||||
"priority": 0,
|
||||
"mapsTo": "flux",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Kling O1",
|
||||
"category": "variant",
|
||||
"templateCount": 5,
|
||||
"priority": 0,
|
||||
"mapsTo": "kling",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Vidu Q2",
|
||||
"category": "variant",
|
||||
"templateCount": 5,
|
||||
"priority": 0,
|
||||
"mapsTo": "vidu",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "SD3.5",
|
||||
"category": "variant",
|
||||
"templateCount": 4,
|
||||
"priority": 0,
|
||||
"mapsTo": "sdxl",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Google Gemini",
|
||||
"category": "variant",
|
||||
"templateCount": 3,
|
||||
"priority": 0,
|
||||
"mapsTo": "gemini",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Flux.2 Dev",
|
||||
"category": "variant",
|
||||
"templateCount": 3,
|
||||
"priority": 0,
|
||||
"mapsTo": "flux",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Flux.2",
|
||||
"category": "variant",
|
||||
"templateCount": 3,
|
||||
"priority": 0,
|
||||
"mapsTo": "flux",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Wan2.5",
|
||||
"category": "variant",
|
||||
"templateCount": 3,
|
||||
"priority": 0,
|
||||
"mapsTo": "wan",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Kontext",
|
||||
"category": "variant",
|
||||
"templateCount": 3,
|
||||
"priority": 0,
|
||||
"mapsTo": "flux",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Wan2.6",
|
||||
"category": "variant",
|
||||
"templateCount": 3,
|
||||
"priority": 0,
|
||||
"mapsTo": "wan",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Hunyuan Video",
|
||||
"category": "variant",
|
||||
"templateCount": 3,
|
||||
"priority": 0,
|
||||
"mapsTo": "hunyuan",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Vidu Q3",
|
||||
"category": "variant",
|
||||
"templateCount": 2,
|
||||
"priority": 0,
|
||||
"mapsTo": "vidu",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "LTXV",
|
||||
"category": "variant",
|
||||
"templateCount": 2,
|
||||
"priority": 0,
|
||||
"mapsTo": "ltx-video",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen-Image-Layered",
|
||||
"category": "variant",
|
||||
"templateCount": 2,
|
||||
"priority": 0,
|
||||
"mapsTo": "qwen",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "SD1.5",
|
||||
"category": "variant",
|
||||
"templateCount": 2,
|
||||
"priority": 0,
|
||||
"mapsTo": "sdxl",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Gemini-2.5-Flash",
|
||||
"category": "variant",
|
||||
"templateCount": 1,
|
||||
"priority": 0,
|
||||
"mapsTo": "gemini",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Qwen-Image 2512",
|
||||
"category": "variant",
|
||||
"templateCount": 1,
|
||||
"priority": 0,
|
||||
"mapsTo": "qwen",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Seedream 4.0",
|
||||
"category": "variant",
|
||||
"templateCount": 1,
|
||||
"priority": 0,
|
||||
"mapsTo": "seedream",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "GPT-Image-1.5",
|
||||
"category": "variant",
|
||||
"templateCount": 1,
|
||||
"priority": 0,
|
||||
"mapsTo": "gpt-image-1",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Kling2.6",
|
||||
"category": "variant",
|
||||
"templateCount": 1,
|
||||
"priority": 0,
|
||||
"mapsTo": "kling",
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "Wan-Move",
|
||||
"category": "variant",
|
||||
"templateCount": 1,
|
||||
"priority": 0,
|
||||
"mapsTo": "wan",
|
||||
"hasExistingDoc": true
|
||||
},
|
||||
{
|
||||
"name": "Motion Control",
|
||||
"category": "variant",
|
||||
"templateCount": 1,
|
||||
"priority": 0,
|
||||
"mapsTo": "wan",
|
||||
"hasExistingDoc": false
|
||||
}
|
||||
],
|
||||
"skip": [
|
||||
{
|
||||
"name": "None",
|
||||
"category": "skip",
|
||||
"templateCount": 1,
|
||||
"priority": 0,
|
||||
"hasExistingDoc": false
|
||||
},
|
||||
{
|
||||
"name": "nano-banana",
|
||||
"category": "skip",
|
||||
"templateCount": 1,
|
||||
"priority": 0,
|
||||
"note": "Duplicate of Nano Banana Pro",
|
||||
"hasExistingDoc": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"priorityOrder": [
|
||||
"wan",
|
||||
"nano-banana-pro",
|
||||
"flux",
|
||||
"gemini",
|
||||
"ace-step",
|
||||
"vidu",
|
||||
"kling",
|
||||
"seedance",
|
||||
"seedream",
|
||||
"hidream",
|
||||
"sdxl",
|
||||
"stable-audio",
|
||||
"chatterbox",
|
||||
"z-image",
|
||||
"recraft",
|
||||
"runway",
|
||||
"tripo",
|
||||
"kandinsky",
|
||||
"omnigen",
|
||||
"seedvr2",
|
||||
"gpt-image-1",
|
||||
"minimax",
|
||||
"grok",
|
||||
"luma",
|
||||
"moonvalley",
|
||||
"topaz",
|
||||
"chroma",
|
||||
"pixverse",
|
||||
"meshy",
|
||||
"rodin",
|
||||
"magnific",
|
||||
"wavespeed",
|
||||
"bria",
|
||||
"veo",
|
||||
"hitpaw",
|
||||
"newbie",
|
||||
"ovis-image",
|
||||
"chronoedit",
|
||||
"humo",
|
||||
"anima",
|
||||
"reimagine",
|
||||
"ideogram",
|
||||
"svd",
|
||||
"real-esrgan",
|
||||
"depth-anything-v2",
|
||||
"flashvsr"
|
||||
]
|
||||
}
|
||||
53
apps/hub/knowledge/models/moonvalley.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# Moonvalley (Marey)
|
||||
|
||||
Marey is Moonvalley's AI video generation model for professional filmmakers, delivering studio-grade quality and trained exclusively on licensed footage.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Marey Realism v1.5
|
||||
|
||||
- Latest production model with cinematic detail
|
||||
- 1080p resolution at 24fps, up to 5-second clips
|
||||
- Available via ComfyUI native nodes and fal.ai
|
||||
|
||||
### Marey Director Controls
|
||||
|
||||
- 3D-aware camera control from single images
|
||||
- Motion transfer from reference videos
|
||||
- Trajectory control for object path definition
|
||||
- Pose transfer and keyframing with multi-image timeline
|
||||
|
||||
## Key Features
|
||||
|
||||
- Text-to-video and image-to-video generation
|
||||
- Camera control with 3D scene understanding
|
||||
- Motion transfer from reference video clips
|
||||
- Trajectory control via drawn paths
|
||||
- Pose transfer for expressive character animation
|
||||
- Shot extension for seamless duration increase
|
||||
- Commercially safe (trained on licensed data only)
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API-based (no local GPU required)
|
||||
- Available via Moonvalley platform, ComfyUI, and fal.ai
|
||||
- Subscription tiers starting at $14.99/month
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Professional film and commercial production
|
||||
- Cinematic B-roll generation
|
||||
- Previsualization and storyboarding
|
||||
- Music video and social media content
|
||||
- Product advertising with dynamic camera
|
||||
- Animation and character-driven storytelling
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of desired video scene
|
||||
- **image**: Reference image for image-to-video mode
|
||||
- **camera_control**: Camera movement specification
|
||||
- **motion_reference**: Video reference for motion transfer
|
||||
- **trajectory**: Drawn path for object movement
|
||||
- **duration**: Clip length (up to 5 seconds)
|
||||
- **resolution**: Output resolution (up to 1080p at 24fps)
|
||||
53
apps/hub/knowledge/models/nano-banana-pro.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# Nano Banana Pro
|
||||
|
||||
Nano Banana Pro is Google DeepMind's flagship image generation and editing model, accessed through ComfyUI's API nodes. Internally it is the Gemini 3 Pro Image model, designed for production-ready high-fidelity visuals.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Nano Banana Pro (Gemini 3 Pro Image)
|
||||
|
||||
- State-of-the-art reasoning-powered image generation
|
||||
- Supports up to 14 reference image inputs
|
||||
- Native 4K output resolution (up to 4096x4096)
|
||||
- Complex multi-turn image generation and editing
|
||||
- Model ID: `gemini-3-pro-image-preview`
|
||||
|
||||
### Gemini 2.5 Flash Image (Nano Banana)
|
||||
|
||||
- Cost-effective alternative optimized for speed
|
||||
- Balanced price-to-performance for interactive workflows
|
||||
- Character consistency and prompt-based editing
|
||||
- Model ID: `gemini-2.5-flash-image`
|
||||
|
||||
## Key Features
|
||||
|
||||
- **World knowledge**: Generates accurate real-world images using Google Search's knowledge base
|
||||
- **Text rendering**: Clean text generation with detection and translation across 10 languages
|
||||
- **Multi-image fusion**: Blend up to 14 input images into a single coherent output
|
||||
- **Studio controls**: Adjust angles, focus, color grading in generated images
|
||||
- **Character consistency**: Maintain subject identity across multiple generations
|
||||
- **Prompt-based editing**: Targeted transformations via natural language instructions
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- No local GPU required — runs as a cloud API service
|
||||
- Accessed via ComfyUI API nodes (requires ComfyUI login and network access)
|
||||
- Available on Comfy Cloud or local ComfyUI with API node support
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- High-fidelity text-to-image generation
|
||||
- Multi-reference style transfer and image blending
|
||||
- Product visualization and mockups
|
||||
- Sketch-to-image and blueprint-to-3D visualization
|
||||
- Text rendering and translation in images
|
||||
- Iterative prompt-based image editing
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of desired image or edit
|
||||
- **aspect_ratio**: Supported ratios include 1:1, 3:2, 4:3, 9:16, 16:9, 21:9
|
||||
- **temperature**: 0.0-2.0 (default 1.0)
|
||||
- **topP**: 0.0-1.0 (default 0.95)
|
||||
- **max_output_tokens**: Up to 32,768 tokens per response
|
||||
- **input images**: Up to 14 reference images per prompt
|
||||
43
apps/hub/knowledge/models/newbie.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# NewBie
|
||||
|
||||
NewBie image Exp0.1 is a 3.5B parameter open-source text-to-image model built on the Next-DiT architecture, developed by the NewBie-AI community. It is specifically pretrained on high-quality anime data for detailed and visually striking anime-style image generation.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### NewBie image Exp0.1
|
||||
|
||||
- 3.5B parameter DiT model based on Next-DiT architecture
|
||||
- Uses Gemma3-4B-it as primary text encoder with Jina CLIP v2 for pooled features
|
||||
- FLUX.1-dev 16-channel VAE for rich color rendering and fine texture detail
|
||||
- Supports natural language, tags, and XML structured prompts
|
||||
- Non-commercial community license (Newbie-NC-1.0) for model weights
|
||||
|
||||
## Key Features
|
||||
|
||||
- Exceptional anime and ACG (Anime, Comics, Games) style generation
|
||||
- XML structured prompting for improved attribute binding and element disentanglement
|
||||
- Strong multi-character scene generation with accurate attribute assignment
|
||||
- ComfyUI integration via dedicated custom nodes
|
||||
- LoRA training support with community trainer
|
||||
- Built on research from the Lumina architecture family
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: 12GB VRAM (bfloat16 or float16)
|
||||
- Recommended: 24GB VRAM for comfortable generation
|
||||
- Requires Gemma3-4B-it and Jina CLIP v2 text encoders
|
||||
- Python 3.10, PyTorch 2.6.0+, Transformers 4.57.1+
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Anime and illustration generation
|
||||
- Character design with precise attribute control
|
||||
- Multi-character scene composition
|
||||
- Fan art and creative anime artwork
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **num_inference_steps**: 28 recommended
|
||||
- **height/width**: 1024x1024 native resolution
|
||||
- **prompt_format**: Natural language, tags, or XML structured
|
||||
- **torch_dtype**: bfloat16 recommended (float16 fallback)
|
||||
53
apps/hub/knowledge/models/omnigen.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# OmniGen2
|
||||
|
||||
OmniGen2 is a multimodal generation model with dual decoding pathways for text and image, built on the Qwen-VL-2.5 foundation by VectorSpaceLab.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### OmniGen2
|
||||
|
||||
- 3B vision-language encoder (Qwen-VL-2.5) + 4B image decoder
|
||||
- Dual decoding with unshared parameters for text and image
|
||||
- Decoupled image tokenizer
|
||||
- Apache 2.0 license
|
||||
|
||||
### OmniGen v1
|
||||
|
||||
- Earlier single-pathway architecture
|
||||
- Fewer capabilities than OmniGen2
|
||||
- Superseded by OmniGen2
|
||||
|
||||
## Key Features
|
||||
|
||||
- Text-to-image generation with high fidelity and aesthetics
|
||||
- Instruction-guided image editing (state-of-the-art among open-source models)
|
||||
- In-context generation combining multiple reference inputs (humans, objects, scenes)
|
||||
- Visual understanding inherited from Qwen-VL-2.5
|
||||
- CPU offload support reduces VRAM usage by nearly 50%
|
||||
- Sequential CPU offload available for under 3GB VRAM (slower inference)
|
||||
- Supports negative prompts and configurable guidance scales
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: NVIDIA RTX 3090 or equivalent (~17GB VRAM)
|
||||
- With CPU offload: ~9GB VRAM
|
||||
- With sequential CPU offload: under 3GB VRAM (significantly slower)
|
||||
- Flash Attention optional but recommended for best performance
|
||||
- CUDA 12.4+ recommended
|
||||
- Default output resolution: 1024x1024
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Text-to-image generation
|
||||
- Instruction-based photo editing
|
||||
- Subject-driven image generation from reference photos
|
||||
- Multi-image composition and in-context editing
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **text_guidance_scale**: Controls adherence to text prompt (CFG)
|
||||
- **image_guidance_scale**: Controls similarity to reference image (1.2-2.0 for editing, 2.5-3.0 for in-context)
|
||||
- **num_inference_steps**: Diffusion steps (default 50)
|
||||
- **max_pixels**: Maximum total pixel count for input images (default 1024x1024)
|
||||
- **negative_prompt**: Text describing undesired qualities (e.g., "blurry, low quality, watermark")
|
||||
- **scheduler**: ODE solver choice (euler or dpmsolver++)
|
||||
43
apps/hub/knowledge/models/ovis-image.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# Ovis-Image
|
||||
|
||||
Ovis-Image is a 7B text-to-image model by AIDC-AI, built on Ovis-U1, optimized for high-quality text rendering in generated images. It achieves state-of-the-art results on the CVTG-2K text rendering benchmark while remaining compact enough for single-GPU deployment.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Ovis-Image-7B
|
||||
|
||||
- 2B (Ovis2.5-2B) + 7B parameter architecture
|
||||
- State-of-the-art on CVTG-2K benchmark for text rendering accuracy
|
||||
- Competitive with 20B+ models (Qwen-Image) and GPT-4o on text-centric tasks
|
||||
- Uses FLUX-based autoencoder for latent encoding
|
||||
- Apache 2.0 license
|
||||
|
||||
## Key Features
|
||||
|
||||
- Excellent text rendering with correct spelling and consistent typography
|
||||
- High fidelity on text-heavy, layout-sensitive prompts
|
||||
- Handles posters, banners, logos, UI mockups, and infographics
|
||||
- Supports diverse fonts, sizes, and aspect ratios
|
||||
- Strong performance on both English and Chinese text generation
|
||||
- Available via Diffusers library with OvisImagePipeline
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: 16GB VRAM (bfloat16)
|
||||
- Recommended: 24GB VRAM for comfortable use
|
||||
- Fits on a single high-end GPU
|
||||
- Tested with Python 3.10, PyTorch 2.6.0, Transformers 4.57.1
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Generating posters and banners with accurate text
|
||||
- Logo and brand asset creation
|
||||
- UI mockup and infographic generation
|
||||
- Marketing materials with embedded typography
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **num_inference_steps**: 50 recommended
|
||||
- **guidance_scale**: 5.0
|
||||
- **resolution**: 1024x1024 native
|
||||
- **negative_prompt**: Supported for quality control
|
||||
46
apps/hub/knowledge/models/pixverse.md
Normal file
@@ -0,0 +1,46 @@
|
||||
# PixVerse
|
||||
|
||||
PixVerse is an AI video generation platform founded in 2023 and backed by Alibaba, offering text-to-video and image-to-video capabilities with over 100 million registered users.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### PixVerse V5.5
|
||||
|
||||
- Latest model with improved fidelity, text-to-video, image-to-video, and modification
|
||||
|
||||
### PixVerse R1
|
||||
|
||||
- Real-time AI video generation model
|
||||
- Interactive control where users direct character actions as video unfolds
|
||||
|
||||
### PixVerse V4.5 / V5
|
||||
|
||||
- Previous generation models with strong cinematic quality and trending effects
|
||||
|
||||
## Key Features
|
||||
|
||||
- Text-to-video generation from natural language prompts
|
||||
- Image-to-video animation with realistic physics simulation
|
||||
- Fusion mode combining up to 3 images into one video
|
||||
- Key frame control and video extension with AI continuity
|
||||
- AI Video Modify for text-prompt-based editing
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud-based platform with no local GPU required
|
||||
- Web app at app.pixverse.ai and mobile apps (iOS/Android)
|
||||
- API at platform.pixverse.ai for developer integration
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Social media content creation (AI Kiss, Hug, Dance effects)
|
||||
- Marketing and promotional video production
|
||||
- Old photo revival and animation
|
||||
- Cinematic narrative and stylistic art generation
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of the desired video content
|
||||
- **duration**: Video length (typically 5s clips)
|
||||
- **resolution**: Output quality (360p to 720p+)
|
||||
- **aspect_ratio**: 16:9, 9:16, 1:1, and other ratios
|
||||
77
apps/hub/knowledge/models/qwen.md
Normal file
@@ -0,0 +1,77 @@
|
||||
# Qwen
|
||||
|
||||
Qwen is Alibaba's family of vision-language and image generation models, spanning visual understanding, image editing, and image generation.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Qwen2.5-VL
|
||||
|
||||
- Multimodal vision-language model from the Qwen team
|
||||
- Available in 3B, 7B, and 72B parameter sizes
|
||||
- Image understanding, video comprehension (1+ hour videos), and visual localization
|
||||
- Visual agent capabilities: computer use, phone use, dynamic tool calling
|
||||
- Structured output generation for invoices, forms, and tables
|
||||
- Dynamic resolution and frame rate training for video understanding
|
||||
- Optimized ViT encoder with window attention, SwiGLU, and RMSNorm
|
||||
|
||||
### Qwen-Image-Edit
|
||||
|
||||
- Specialized image editing model with instruction-following
|
||||
- Supports inpainting, outpainting, style transfer, and content-aware edits
|
||||
- 11 workflow templates available
|
||||
|
||||
### Qwen-Image
|
||||
|
||||
- Text-to-image generation model from the Qwen family
|
||||
- 7 workflow templates available
|
||||
|
||||
### Qwen-Image-Layered
|
||||
|
||||
- Layered image generation for composable outputs
|
||||
- Generates images with separate foreground/background layers
|
||||
- 2 workflow templates available
|
||||
|
||||
### Qwen-Image 2512
|
||||
|
||||
- Specific variant optimized for particular generation tasks
|
||||
- 1 workflow template available
|
||||
|
||||
## Key Features
|
||||
|
||||
- Strong visual understanding with state-of-the-art benchmark results
|
||||
- Native multi-language support including Chinese and English
|
||||
- Visual agent capabilities for computer and phone interaction
|
||||
- Video event capture with temporal segment pinpointing
|
||||
- Bounding box and point-based visual localization
|
||||
- Structured JSON output for document and table extraction
|
||||
- Instruction-based image editing with precise control
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- 3B model: 6-8GB VRAM
|
||||
- 7B model: 16GB VRAM, flash_attention_2 recommended for multi-image/video
|
||||
- 72B model: Multi-GPU setup required (80GB+ per GPU)
|
||||
- Context length: 32,768 tokens default, extendable to 64K+ with YaRN
|
||||
- Dynamic pixel budget: 256-1280 tokens per image (configurable min/max pixels)
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Image editing based on text instructions
|
||||
- Visual question answering and image description
|
||||
- Long video comprehension and event extraction
|
||||
- Document OCR and structured data extraction
|
||||
- Visual agent tasks (screen interaction, UI navigation)
|
||||
- Layered image generation for design workflows
|
||||
- Text-to-image generation with strong prompt following
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **max_new_tokens**: Controls output length for VL model responses
|
||||
- **min_pixels / max_pixels**: Control image token budget (e.g. 256x28x28 to 1280x28x28)
|
||||
- **temperature**: Generation diversity for text outputs
|
||||
- **resized_height / resized_width**: Direct image dimension control (rounded to nearest 28)
|
||||
- **fps**: Frame rate for video input processing in Qwen2.5-VL
|
||||
|
||||
## Blog References
|
||||
|
||||
- [Qwen Image Edit 2511 & Qwen Image Layered](../blog/qwen-image-edit-2511.md) — Better character consistency, RGBA layer decomposition, built-in LoRA support
|
||||
1
apps/hub/knowledge/models/qwen.summary.md
Normal file
@@ -0,0 +1 @@
|
||||
Qwen is Alibaba's family of vision-language and image generation models. Qwen2.5-VL is a multimodal vision-language model available in 3B (6-8GB VRAM), 7B (16GB), and 72B (multi-GPU 80GB+) sizes, capable of image understanding, hour-long video comprehension, visual localization, visual agent tasks (computer/phone use), and structured JSON output for document extraction. Qwen-Image-Edit provides instruction-based image editing with inpainting, outpainting, and style transfer. Qwen-Image handles text-to-image generation, while Qwen-Image-Layered produces composable foreground/background layer outputs. The family features native Chinese/English support, strong prompt following, and state-of-the-art visual understanding benchmarks. Key parameters include dynamic pixel budgets (256-1280 tokens per image), configurable frame rates for video input, and temperature for text diversity. Primary uses: image editing, visual QA, video comprehension, document OCR, and layered image generation.
|
||||
61
apps/hub/knowledge/models/real-esrgan.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# Real-ESRGAN
|
||||
|
||||
Real-ESRGAN is a practical image and video super-resolution model that extends ESRGAN with improved training on pure synthetic data for real-world restoration.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### RealESRGAN_x4plus
|
||||
|
||||
- General-purpose 4× upscaling model for real-world images
|
||||
- RRDB (Residual-in-Residual Dense Block) architecture
|
||||
- Handles noise, blur, JPEG compression artifacts
|
||||
|
||||
### RealESRGAN_x4plus_anime_6B
|
||||
|
||||
- Optimized for anime and illustration images
|
||||
- Smaller 6-block model for faster inference
|
||||
- Better edge preservation for line art
|
||||
|
||||
### RealESRGAN_x2plus
|
||||
|
||||
- 2× upscaling variant for moderate enlargement
|
||||
- Lower risk of hallucinated details
|
||||
|
||||
### realesr-animevideov3
|
||||
|
||||
- Lightweight model designed for anime video frames
|
||||
- Temporal consistency for video processing
|
||||
|
||||
## Key Features
|
||||
|
||||
- Trained entirely on synthetic degradation data (no paired real-world data needed)
|
||||
- Second-order degradation modeling simulates real-world compression chains
|
||||
- GFPGAN integration for face enhancement during upscaling
|
||||
- Tiling support for processing large images with limited VRAM
|
||||
- FP16 (half precision) inference for faster processing
|
||||
- NCNN Vulkan portable executables for cross-platform GPU support (Intel/AMD/NVIDIA)
|
||||
- Supports 2×, 3×, and 4× upscaling with arbitrary output scale via LANCZOS4 resize
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: 2GB VRAM with tiling enabled
|
||||
- Recommended: 4GB+ VRAM for comfortable use
|
||||
- NCNN Vulkan build runs on any GPU with Vulkan support
|
||||
- CPU inference supported but significantly slower
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Upscaling old or low-resolution photographs
|
||||
- Enhancing compressed web images
|
||||
- Anime and manga image upscaling
|
||||
- Video frame super-resolution
|
||||
- Restoring degraded historical images
|
||||
- Pre-processing for print from low-resolution sources
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **outscale**: Final upsampling scale factor (default: 4)
|
||||
- **tile**: Tile size for memory management (0 = no tiling)
|
||||
- **face_enhance**: Enable GFPGAN face enhancement (default: false)
|
||||
- **model_name**: Select model variant (RealESRGAN_x4plus, anime_6B, etc.)
|
||||
- **denoise_strength**: Balance noise removal vs detail preservation (realesr-general-x4v3)
|
||||
50
apps/hub/knowledge/models/recraft.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# Recraft
|
||||
|
||||
Recraft is an AI image generation platform known for its V3 model and unique ability to produce both raster and vector (SVG) images from text prompts.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Recraft V3
|
||||
|
||||
- Top-ranked model on Artificial Analysis Text-to-Image Leaderboard
|
||||
- Supports raster image generation at $0.04 per image
|
||||
- Supports vector SVG generation at $0.08 per image
|
||||
- Accurate text rendering at any size in generated images
|
||||
|
||||
### Recraft 20B
|
||||
|
||||
- More cost-effective variant at $0.022 per raster image
|
||||
- Vector generation at $0.044 per image
|
||||
- Suitable for high-volume production workflows
|
||||
|
||||
## Key Features
|
||||
|
||||
- Native vector SVG image generation from text prompts
|
||||
- Accurate text rendering (headlines, labels, signs) in images
|
||||
- Custom brand style creation from reference images
|
||||
- Generation in exact brand colors for brand consistency
|
||||
- AI-powered image vectorization (PNG/JPG to SVG)
|
||||
- Background removal, creative upscaling, and crisp upscaling
|
||||
- Multiple style presets: photorealism, clay, retro-pop, hand-drawn, 80s
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- API-only access via Recraft API
|
||||
- No local hardware requirements
|
||||
- Available through Recraft Studio web interface
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Logo and icon design (SVG output)
|
||||
- Brand-consistent marketing asset generation
|
||||
- Poster and advertisement creation with text
|
||||
- Scalable vector illustrations for web and print
|
||||
- Product mockup generation
|
||||
- SEO blog imagery at scale
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of the desired image
|
||||
- **style**: Visual style (realistic_image, digital_illustration, vector_illustration, icon)
|
||||
- **colors**: Brand color palette for consistent output
|
||||
- **format**: Output format (raster PNG/JPG or vector SVG)
|
||||
57
apps/hub/knowledge/models/rodin.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# Rodin
|
||||
|
||||
Rodin is a 3D generation API by Hyper3D (DeemosTech) that creates production-ready 3D models from text or images with PBR materials.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Rodin Gen-2
|
||||
|
||||
- Most advanced model with 10 billion parameters
|
||||
- Built on the BANG architecture
|
||||
- 4x improved geometric mesh quality over Gen-1
|
||||
- Generation time approximately 90 seconds
|
||||
|
||||
### Rodin Gen-1.5 Regular
|
||||
|
||||
- Detailed 3D assets with customizable quality
|
||||
- Adjustable polygon counts and 2K textures
|
||||
- Generation time approximately 70 seconds
|
||||
|
||||
### Rodin Sketch
|
||||
|
||||
- Fast prototyping with basic geometry and 1K textures
|
||||
- GLB format only, generation in approximately 20 seconds
|
||||
|
||||
## Key Features
|
||||
|
||||
- Text-to-3D and image-to-3D generation
|
||||
- Multi-view image input (up to 5 images) with fuse and concat modes
|
||||
- PBR and Shaded material options
|
||||
- Quad and triangle mesh modes
|
||||
- HighPack add-on for 4K textures and high-poly models
|
||||
- Bounding box ControlNet for dimension constraints
|
||||
- T/A pose control for humanoid models
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API-based (no local GPU required)
|
||||
- All generation runs on Hyper3D servers
|
||||
- API key required via hyper3d.ai dashboard
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Game asset production
|
||||
- VR/AR content creation
|
||||
- Product visualization
|
||||
- Character modeling with pose control
|
||||
- Rapid 3D prototyping
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description for text-to-3D mode
|
||||
- **images**: Up to 5 reference images for image-to-3D
|
||||
- **quality**: Detail level (high, medium, low, extra-low)
|
||||
- **mesh_mode**: Face type (Quad or Raw triangles)
|
||||
- **material**: Material type (PBR, Shaded, or All)
|
||||
- **geometry_file_format**: Output format (glb, fbx, obj, stl, usdz)
|
||||
- **seed**: Randomization seed (0-65535)
|
||||
50
apps/hub/knowledge/models/runway.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# Runway
|
||||
|
||||
Runway is a generative AI company producing state-of-the-art video generation models, accessible via API and web interface.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Gen-3 Alpha
|
||||
|
||||
- Text-to-video and image-to-video at 1280x768, 24fps
|
||||
- 5 or 10 second output, extendable up to 40 seconds
|
||||
- Photorealistic human character generation
|
||||
|
||||
### Gen-3 Alpha Turbo
|
||||
|
||||
- Faster, lower-cost variant (5 credits/sec vs 10)
|
||||
- Requires input image; supports first, middle, and last keyframes
|
||||
- Video extension up to 34 seconds total
|
||||
|
||||
### Gen-4 Turbo
|
||||
|
||||
- Latest generation with improved motion and prompt adherence
|
||||
- Image reference support and text-to-image (gen4_image)
|
||||
|
||||
## Key Features
|
||||
|
||||
- Advanced camera controls (Motion Brush, Director Mode)
|
||||
- C2PA provenance metadata for content authenticity
|
||||
- Expressive human characters with gestures and emotions
|
||||
- Wide range of cinematic styles and terminology support
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- API-only access via Runway developer portal
|
||||
- No local hardware requirements
|
||||
- Enterprise tier available for higher rate limits
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Film pre-visualization and storyboarding
|
||||
- Commercial advertisement production
|
||||
- Social media video content
|
||||
- Visual effects and motion graphics
|
||||
- Music video and artistic video creation
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description guiding video generation
|
||||
- **duration**: Output length (5 or 10 seconds)
|
||||
- **ratio**: Aspect ratio (1280:768 or 768:1280)
|
||||
- **keyframes**: Start, middle, and/or end frame images
|
||||
63
apps/hub/knowledge/models/sd3-5.md
Normal file
@@ -0,0 +1,63 @@
|
||||
# Stable Diffusion 3.5
|
||||
|
||||
Stable Diffusion 3.5 is Stability AI's text-to-image model family based on the Multimodal Diffusion Transformer (MMDiT) architecture with rectified flow matching.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Stable Diffusion 3.5 Large
|
||||
|
||||
- 8.1 billion parameter MMDiT model
|
||||
- Highest quality and prompt adherence in the SD family
|
||||
- 1 megapixel native resolution (1024×1024)
|
||||
- 28-50 inference steps recommended
|
||||
|
||||
### Stable Diffusion 3.5 Large Turbo
|
||||
|
||||
- Distilled version of SD 3.5 Large
|
||||
- 4-step inference for fast generation
|
||||
- Guidance scale of 0 (classifier-free guidance disabled)
|
||||
- Comparable quality to full model at fraction of the time
|
||||
|
||||
### Stable Diffusion 3.5 Medium
|
||||
|
||||
- 2.5 billion parameter MMDiT-X architecture
|
||||
- Designed for consumer hardware (9.9GB VRAM for transformer)
|
||||
- Dual attention blocks in first 12 transformer layers
|
||||
- Multi-resolution generation from 0.25 to 2 megapixels
|
||||
- Skip Layer Guidance recommended for better coherency
|
||||
|
||||
## Key Features
|
||||
|
||||
- Three text encoders: CLIP ViT-L, OpenCLIP ViT-bigG (77 tokens each), T5-XXL (256 tokens)
|
||||
- QK-normalization for stable training and easier fine-tuning
|
||||
- Rectified flow matching replaces traditional DDPM/DDIM sampling
|
||||
- Strong text rendering and typography in generated images
|
||||
- Diverse output styles (photography, 3D, painting, line art)
|
||||
- Highly customizable base for fine-tuning and LoRA training
|
||||
- T5-XXL encoder optional (can be removed to save memory with minimal quality loss)
|
||||
- Supports negative prompts for excluding unwanted elements
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Large: 24GB+ VRAM recommended (fp16), quantizable to fit smaller GPUs
|
||||
- Large Turbo: 16GB+ VRAM recommended
|
||||
- Medium: 10GB VRAM minimum (excluding text encoders)
|
||||
- NF4 quantization available via bitsandbytes for low-VRAM GPUs
|
||||
- CPU offloading supported via diffusers pipeline
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Photorealistic image generation
|
||||
- Artistic illustration and concept art
|
||||
- Typography and text-heavy designs
|
||||
- Product visualization
|
||||
- Fine-tuning and LoRA development
|
||||
- ControlNet-guided generation
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **steps**: 28-50 for Large, 4 for Large Turbo, 20-40 for Medium
|
||||
- **guidance_scale**: 4.5-7.5 for Large/Medium, 0 for Large Turbo
|
||||
- **max_sequence_length**: T5 token limit (77 or 256, higher = better prompt understanding)
|
||||
- **resolution**: 1024×1024 native, flexible aspect ratios around 1MP
|
||||
- **negative_prompt**: Text describing elements to exclude (not supported by Turbo)
|
||||
75
apps/hub/knowledge/models/sdxl.md
Normal file
@@ -0,0 +1,75 @@
|
||||
# Stable Diffusion
|
||||
|
||||
Stable Diffusion is Stability AI's family of open-source image and video generation models, spanning multiple architectures from U-Net to diffusion transformers.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### SDXL (Stable Diffusion XL)
|
||||
|
||||
- Stability AI's flagship text-to-image model (6.6B parameter U-Net)
|
||||
- Native 1024x1024 resolution with flexible aspect ratios around 1MP
|
||||
- Two text encoders (CLIP ViT-L + OpenCLIP ViT-bigG)
|
||||
- Optional refiner model for second-stage detail enhancement
|
||||
- Turbo and Lightning distilled variants for 1-4 step generation
|
||||
- Largest ecosystem of LoRAs, fine-tunes, and community models
|
||||
|
||||
### SD3.5 (Stable Diffusion 3.5)
|
||||
|
||||
- Diffusion transformer (DiT) architecture, successor to SDXL
|
||||
- Three text encoders (CLIP ViT-L, OpenCLIP ViT-bigG, T5-XXL) for stronger prompt following
|
||||
- Available in Large (8B) and Medium (2B) parameter sizes
|
||||
- Improved text rendering and compositional accuracy over SDXL
|
||||
- 4 workflow templates available
|
||||
|
||||
### SD1.5 (Stable Diffusion 1.5)
|
||||
|
||||
- The classic 512x512 latent diffusion model
|
||||
- Single CLIP ViT-L text encoder, 860M parameter U-Net
|
||||
- Still widely used for its massive LoRA and checkpoint ecosystem
|
||||
- Lower VRAM requirements make it accessible on consumer hardware
|
||||
- 2 workflow templates available
|
||||
|
||||
### SVD (Stable Video Diffusion)
|
||||
|
||||
- Image-to-video generation model based on Stable Diffusion
|
||||
- Generates short video clips (14 or 25 frames) from a single image
|
||||
- Related model for motion generation from static inputs
|
||||
|
||||
### Stability API Products
|
||||
|
||||
- Reimagine: Stability's API-based image variation and transformation service
|
||||
|
||||
## Key Features
|
||||
|
||||
- Excellent composition, layout, and photorealism (SDXL/SD3.5)
|
||||
- Large open-source ecosystem with thousands of community fine-tunes
|
||||
- Flexible aspect ratios and multi-resolution support
|
||||
- Dual/triple CLIP text encoding for nuanced prompt interpretation
|
||||
- Strong text rendering in SD3.5 via T5-XXL encoder
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- SD1.5: 4-6GB VRAM (fp16), runs on most consumer GPUs
|
||||
- SDXL Base: 8GB VRAM minimum (fp16), 12GB recommended
|
||||
- SDXL Base + Refiner: 16GB+ VRAM
|
||||
- SD3.5 Medium: 8-12GB VRAM
|
||||
- SD3.5 Large: 16-24GB VRAM (fp16), quantized versions for 12GB cards
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Photorealistic image generation
|
||||
- Artistic illustrations and concept art
|
||||
- Product photography and design
|
||||
- Character and portrait generation
|
||||
- LoRA-based custom style and subject training
|
||||
- Image-to-video with SVD
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **steps**: 20-40 for SDXL base, 15-25 for refiner, 28+ for SD3.5
|
||||
- **cfg_scale**: 5-10 (7 default for SDXL), 3.5-7 for SD3.5
|
||||
- **sampler**: DPM++ 2M Karras and Euler are popular for SDXL; Euler for SD3.5
|
||||
- **resolution**: 1024x1024 native for SDXL/SD3.5, 512x512 for SD1.5
|
||||
- **clip_skip**: Often set to 1-2; important for SD1.5 LoRA compatibility
|
||||
- **denoise_strength**: 0.7-0.8 when using the SDXL refiner (img2img)
|
||||
- **negative_prompt**: Supported in SDXL/SD1.5; not used in SD3.5 by default
|
||||
1
apps/hub/knowledge/models/sdxl.summary.md
Normal file
@@ -0,0 +1 @@
|
||||
Stable Diffusion is Stability AI's open-source image and video generation family. SDXL is the flagship text-to-image model (6.6B U-Net, dual CLIP encoders) generating 1024x1024 images with the largest ecosystem of LoRAs and community fine-tunes; it requires 8-12GB VRAM with Turbo/Lightning variants for 1-4 step generation. SD3.5 is the DiT-based successor with triple text encoders (including T5-XXL) in Large (8B, 16-24GB) and Medium (2B, 8-12GB) sizes, offering improved text rendering and compositional accuracy. SD1.5 remains popular for its massive ecosystem at just 4-6GB VRAM (512x512). SVD handles image-to-video generation (14 or 25 frames). Key parameters: 20-40 steps for SDXL, cfg_scale 5-10 (7 default), DPM++ 2M Karras sampler. Primary uses: photorealistic generation, artistic illustration, product photography, character generation, and LoRA-based custom training.
|
||||
64
apps/hub/knowledge/models/seedance.md
Normal file
@@ -0,0 +1,64 @@
|
||||
# Seedance
|
||||
|
||||
Seedance is ByteDance's video generation model family, designed for cinematic, high-fidelity video creation from text and images. The 1.0 series established a standard for fluid motion and multi-shot consistency, while the 1.5 series adds native joint audio-visual generation.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Seedance 1.5 Pro
|
||||
|
||||
- Native audio-visual generation producing video and audio in a single pass
|
||||
- Multilingual lip-sync supporting English, Mandarin, Japanese, Korean, and Spanish
|
||||
- 1080p output with 5-12 second duration
|
||||
- Advanced directorial camera controls (dolly zoom, tracking shots, whip pans)
|
||||
- Captures micro-expressions, non-verbal cues, and emotional transitions
|
||||
|
||||
### Seedance 1.0 Pro
|
||||
|
||||
- Production-quality 1080p video generation
|
||||
- Text-to-video and image-to-video with first and last frame control
|
||||
- Native multi-shot storytelling with subject and style consistency across cuts
|
||||
- Cinematic camera grammar interpretation (35mm film, noir lighting, drone shots)
|
||||
- 2-12 second video duration at 24-30fps
|
||||
|
||||
### Seedance 1.0 Pro Fast
|
||||
|
||||
- Faster, more cost-effective version of 1.0 Pro
|
||||
- Same capabilities with reduced generation time
|
||||
|
||||
### Seedance 1.0 Lite
|
||||
|
||||
- Optimized for speed and iteration at 720p or 1080p
|
||||
- Lower cost per generation for rapid prototyping
|
||||
|
||||
## Key Features
|
||||
|
||||
- Smooth, stable motion with wide dynamic range for large-scale movements
|
||||
- Native multi-shot storytelling maintaining consistency across transitions
|
||||
- Diverse stylistic expression (photorealism, cyberpunk, illustration, pixel art)
|
||||
- Precise prompt following for complex actions, multi-agent interactions, and camera work
|
||||
- Joint audio-visual synthesis with environmental sounds and dialogue (1.5)
|
||||
- Supports multiple aspect ratios (16:9, 9:16, 1:1, 4:3, 21:9, and more)
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API only; no local weights publicly available
|
||||
- Accessed via seed.bytedance.com, Scenario, fal.ai, and other API providers
|
||||
- 1080p 5-second video costs approximately $0.62 via fal.ai (Pro)
|
||||
- Lite version available at lower cost ($0.18 per 720p 5-second video)
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Cinematic shorts and scene previsualization
|
||||
- Music video concept development
|
||||
- Product demonstration and marketing videos
|
||||
- Character-focused animation sequences
|
||||
- Social media content with audio (1.5)
|
||||
- Moodboard and style exploration for creative teams
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of desired scene, action, and camera work
|
||||
- **image_url**: Source image for image-to-video generation (first frame)
|
||||
- **duration**: Video length (2-12 seconds for 1.0, 5-12 seconds for 1.5)
|
||||
- **resolution**: 480p, 720p, or 1080p output
|
||||
- **aspect_ratio**: 16:9, 9:16, 1:1, 4:3, 21:9, 9:21
|
||||
50
apps/hub/knowledge/models/seedream.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# Seedream
|
||||
|
||||
Seedream is ByteDance's text-to-image generation model, capable of producing high-quality images with strong text rendering, bilingual support (English and Chinese), and native high-resolution output.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Seedream 3.0
|
||||
|
||||
- Native 2K resolution output without post-processing
|
||||
- Bilingual image generation (English and Chinese)
|
||||
- 3-second end-to-end generation for 1K images
|
||||
- Improved text rendering for small fonts and long text layouts
|
||||
|
||||
### Seedream 4.0
|
||||
|
||||
- Unified architecture for text-to-image and image editing
|
||||
- Native output up to 4K resolution
|
||||
- Multi-image reference input (up to 6 source images)
|
||||
- 1.8-second inference for 2K images
|
||||
- Batch input and output for multiple generations
|
||||
- Natural language image editing capabilities
|
||||
|
||||
## Key Features
|
||||
|
||||
- Accurate text rendering in both English and Chinese
|
||||
- Knowledge-driven generation for educational illustrations and charts
|
||||
- Strong character consistency across multiple angles
|
||||
- Prompt-based image editing without separate tools
|
||||
- Versatile style support from photorealism to anime
|
||||
- Leading scores on Artificial Analysis Image Arena
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- API-only access via ByteDance Volcano Engine
|
||||
- No local hardware requirements for end users
|
||||
- Third-party API providers available (e.g., EvoLink)
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Poster and advertisement design with embedded text
|
||||
- E-commerce product photography
|
||||
- Character design with multi-angle consistency
|
||||
- Educational illustration and infographic generation
|
||||
- Brand-consistent marketing materials
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of the desired image
|
||||
- **resolution**: Output resolution (up to 4K supported)
|
||||
- **aspect_ratio**: Supports 16:9, 4:3, 1:1, and custom ratios
|
||||
47
apps/hub/knowledge/models/seedvr2.md
Normal file
@@ -0,0 +1,47 @@
|
||||
# SeedVR2
|
||||
|
||||
SeedVR2 is a one-step diffusion-based video restoration model developed by ByteDance Seed and NTU S-Lab, published at ICLR 2026.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### SeedVR2-3B
|
||||
|
||||
- 3B parameter DiT with one-step inference for video and image upscaling
|
||||
- Available in FP16, FP8, and GGUF quantized formats
|
||||
|
||||
### SeedVR2-7B
|
||||
|
||||
- 7B parameter model with Sharp variant for maximum detail
|
||||
- Multi-GPU inference; supports 1080p and 2K on 4x H100-80GB
|
||||
|
||||
### SeedVR (Original)
|
||||
|
||||
- Multi-step diffusion model (CVPR 2025 Highlight)
|
||||
- Arbitrary-resolution restoration without pretrained diffusion prior
|
||||
|
||||
## Key Features
|
||||
|
||||
- One-step inference achieving 10x speedup over multi-step methods
|
||||
- Adaptive window attention with dynamic sizing for high-resolution inputs
|
||||
- Adversarial post-training against real data for faithful detail recovery
|
||||
- ComfyUI integration via official SeedVR2 Video Upscaler nodes
|
||||
- Apache 2.0 open-source license
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: 8-12GB VRAM with GGUF quantization and tiled VAE
|
||||
- Recommended: 24GB+ VRAM (RTX 4090) for 3B model at 1080p
|
||||
- High-end: 4x H100-80GB for 7B model at 2K resolution
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Upscaling AI-generated video to 1080p or 4K
|
||||
- Restoring degraded or compressed video footage
|
||||
- Image super-resolution and detail recovery
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- resolution: target shortest-edge resolution (720, 1080, 2160)
|
||||
- batch_size: frames per batch, must follow 4n+1 formula (5, 9, 13, 17, 21)
|
||||
- seed: random seed for reproducible generation
|
||||
- color_fix_type: wavelet, adain, hsv, or none
|
||||
53
apps/hub/knowledge/models/stable-audio.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# Stable Audio Open
|
||||
|
||||
Stable Audio Open 1.0 is Stability AI's open-source text-to-audio model for generating sound effects, production elements, and short musical clips.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Stable Audio Open 1.0
|
||||
|
||||
- 1.2B parameter latent diffusion model
|
||||
- Transformer-based diffusion (DiT) architecture
|
||||
- T5-base text encoder for conditioning
|
||||
- Variational autoencoder for audio compression
|
||||
- Stability AI Community License (free for research, non-commercial use, and commercial use below Stability's annual revenue threshold)
|
||||
|
||||
### Stable Audio (Commercial)
|
||||
|
||||
- Full-length music generation up to 3 minutes with audio-to-audio and inpainting
|
||||
- Available via Stability AI platform API, commercial license
|
||||
|
||||
## Key Features
|
||||
|
||||
- Generates up to 47 seconds of stereo audio at 44.1kHz
|
||||
- Text-prompted sound effects, drum beats, ambient sounds, and foley
|
||||
- Variable-length output with timing control
|
||||
- Fine-tunable on custom audio datasets
|
||||
- Trained exclusively on Creative Commons licensed audio (CC0, CC BY, CC Sampling+)
|
||||
- Strong performance for sound effects and field recordings
|
||||
- Compatible with both stable-audio-tools and diffusers libraries
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: 8GB VRAM (fp16)
|
||||
- Recommended: 12GB+ VRAM for comfortable inference
|
||||
- Half-precision (fp16) supported for reduced memory
|
||||
- Chunked decoding available for memory-constrained setups
|
||||
- Inference speed: 8-20 diffusion steps per second depending on GPU
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Sound effect and foley generation
|
||||
- Drum beats and instrument riff creation
|
||||
- Ambient soundscapes and background audio
|
||||
- Music production elements and samples
|
||||
- Audio prototyping for film and game sound design
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **steps**: Number of inference steps (100-200 recommended)
|
||||
- **cfg_scale**: Classifier-free guidance scale (typically 7)
|
||||
- **seconds_total**: Target audio duration (up to 47 seconds)
|
||||
- **seconds_start**: Start time offset for timing control
|
||||
- **negative_prompt**: Text describing undesired audio qualities
|
||||
- **sampler_type**: Diffusion sampler (dpmpp-3m-sde recommended)
|
||||
55
apps/hub/knowledge/models/svd.md
Normal file
@@ -0,0 +1,55 @@
|
||||
# Stable Video Diffusion
|
||||
|
||||
Stable Video Diffusion (SVD) is Stability AI's image-to-video diffusion model that generates short video clips from a single conditioning image. In user studies, SVD was preferred over GEN-2 and PikaLabs for video quality.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### SVD-XT (25 frames)
|
||||
|
||||
- Generates 25 frames at 576x1024 resolution
|
||||
- Finetuned from the 14-frame SVD base model
|
||||
- Includes temporally consistent f8-decoder
|
||||
- Standard frame-wise decoder also available
|
||||
|
||||
### SVD (14 frames)
|
||||
|
||||
- Original release generating 14 frames
|
||||
- Foundation for community fine-tunes and extensions
|
||||
- Same 576x1024 native resolution
|
||||
|
||||
## Key Features
|
||||
|
||||
- Image-to-video generation from a single still image
|
||||
- Temporally consistent video output with finetuned decoder
|
||||
- Preferred over GEN-2 and PikaLabs in human evaluation studies
|
||||
- Invisible watermarking (via the invisible-watermark library) enabled by default
|
||||
- Latent diffusion architecture for efficient generation
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: 16GB VRAM
|
||||
- Recommended: A100 80GB for full quality (tested configuration)
|
||||
- SVD generation ~100s, SVD-XT ~180s on A100 80GB
|
||||
- Optimizations available for lower VRAM cards with quality tradeoffs
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Animating still images into short video clips
|
||||
- Product visualization and motion graphics
|
||||
- Creative video experiments and art
|
||||
- Research on generative video models
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **num_frames**: 14 (SVD) or 25 (SVD-XT)
|
||||
- **resolution**: 576x1024 native
|
||||
- **conditioning_frame**: Input image at same resolution
|
||||
- **duration**: Up to ~4 seconds (25 frames)
|
||||
|
||||
## Limitations
|
||||
|
||||
- Short videos only (4 seconds maximum)
|
||||
- No text-based control (image conditioning only)
|
||||
- Cannot render legible text in output
|
||||
- Faces and people may not generate properly
|
||||
- May produce videos without motion or with very slow camera pans
|
||||
45
apps/hub/knowledge/models/topaz.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# Topaz
|
||||
|
||||
Topaz Labs provides AI-powered image and video enhancement software for upscaling, denoising, sharpening, and restoration.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Topaz Photo AI
|
||||
|
||||
- All-in-one image enhancement with 11 AI tools including Sharpen, Denoise, Recover Faces, and Upscale
|
||||
- RAW image support and plugin integration with Photoshop and Lightroom
|
||||
|
||||
### Topaz Video AI
|
||||
|
||||
- 19 AI models: Proteus, Artemis, Gaia, Iris, Nyx, Starlight, and more
|
||||
- Upscale video from SD to 4K/8K/16K with frame interpolation and stabilization
|
||||
|
||||
### Bloom
|
||||
|
||||
- Creative upscaler that removes the artificial look from AI-generated images
|
||||
- Realism mode for natural skin, hair, and eyes on AI faces
|
||||
|
||||
## Key Features
|
||||
|
||||
- Multiple specialized AI models optimized for different content types
|
||||
- Enterprise API with Face Realism, Colorization, and Video Colorization
|
||||
- Local and cloud rendering with After Effects and DaVinci Resolve plugins
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Minimum: 8GB VRAM for GPU-accelerated processing
|
||||
- Recommended: NVIDIA RTX 3080+ with 32GB+ RAM for video
|
||||
- Available on Mac and Windows as standalone or plugin
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Upscaling old or low-resolution footage to 4K+
|
||||
- Denoising low-light photography and restoring archival video
|
||||
- Enhancing AI-generated images for photorealism
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- AI Model Selection: choose specialized models per content type
|
||||
- Scale Factor: 2x, 4x, or higher depending on tool
|
||||
- Denoise Strength: adjustable noise reduction level
|
||||
- Sharpen Amount: controls detail enhancement intensity
|
||||
50
apps/hub/knowledge/models/tripo.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# Tripo
|
||||
|
||||
Tripo is an AI-powered 3D generation platform that creates production-ready 3D models from text or images in seconds, developed by VAST AI Research.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Tripo v3.0
|
||||
|
||||
- Sculpture-level geometry precision with sharp edges
|
||||
- Best for high-fidelity production assets
|
||||
|
||||
### Tripo v2.0
|
||||
|
||||
- Industry-leading geometry with PBR material support
|
||||
- High accuracy for detailed models
|
||||
|
||||
### Tripo v1.4
|
||||
|
||||
- Fast generation with realistic texture effects
|
||||
- Best for rapid prototyping
|
||||
|
||||
## Key Features
|
||||
|
||||
- Text-to-3D and image-to-3D generation
|
||||
- Multi-image input for high-fidelity reconstruction
|
||||
- 4K PBR-ready texture generation
|
||||
- Automatic rigging and animation
|
||||
- Model segmentation for part-based editing
|
||||
- Export in STL, OBJ, FBX, GLB, and USDZ formats
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API-based (no local GPU required)
|
||||
- TripoSR open-source variant requires 8GB+ VRAM
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Game asset creation
|
||||
- 3D printing prototyping
|
||||
- AR/VR content development
|
||||
- Product visualization and e-commerce
|
||||
- Character design and animation
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of desired 3D model
|
||||
- **image**: Reference image (JPG, PNG, WEBP, up to 5MB)
|
||||
- **texture_resolution**: Up to 4K with PBR maps
|
||||
- **format**: Output format (GLB, FBX, OBJ, STL, USDZ)
|
||||
- **style**: Optional stylization (Lego, Voxel, Voronoi)
|
||||
49
apps/hub/knowledge/models/veo.md
Normal file
@@ -0,0 +1,49 @@
|
||||
# Google Veo
|
||||
|
||||
Veo is Google DeepMind's state-of-the-art video generation model family, designed for high-quality cinematic video creation with strong prompt adherence and realistic physics simulation.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Veo 2
|
||||
|
||||
- Text-to-video and image-to-video generation
|
||||
- Up to 4K resolution output (720p default in VideoFX)
|
||||
- 8-second clips, extendable to minutes
|
||||
- State-of-the-art in human preference evaluations against Sora, Kling, and Minimax
|
||||
|
||||
### Veo 3 / 3.1
|
||||
|
||||
- Latest generation with native audio generation
|
||||
- Generates sound effects, ambient noise, and dialogue alongside video
|
||||
- Improved prompt adherence and real-world physics simulation
|
||||
- 1080p and 4K output support
|
||||
- Scene extension, first/last frame, and object insertion capabilities
|
||||
|
||||
## Key Features
|
||||
|
||||
- Cinematic camera control (lens types, angles, depth of field)
|
||||
- Realistic physics and natural human motion
|
||||
- SynthID invisible watermarking on all outputs
|
||||
- Style reference image support for consistent aesthetics
|
||||
- Reduced hallucination artifacts compared to prior models
|
||||
- Available via Gemini API and Google AI Studio
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API only (Google-hosted infrastructure)
|
||||
- No local GPU required
|
||||
- Available through VideoFX, Vertex AI, and Gemini API
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Short-form video content and social media clips
|
||||
- Product demos and promotional videos
|
||||
- Cinematic storytelling and filmmaking
|
||||
- Marketing and advertising video production
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Detailed text description with cinematic language
|
||||
- **aspect_ratio**: 16:9, 9:16, and other formats
|
||||
- **person_generation**: Control for human figure generation
|
||||
- **duration**: Up to 8 seconds per clip (Veo 2), extendable in Veo 3.1
|
||||
63
apps/hub/knowledge/models/vidu.md
Normal file
@@ -0,0 +1,63 @@
|
||||
# Vidu
|
||||
|
||||
Vidu is a video generation API developed by ShengShu Technology. It supports text-to-video, image-to-video, reference-to-video with multi-entity consistency, and start-end frame interpolation. Vidu is known for fast generation speeds (as low as 10 seconds) and strong anime-style output.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Vidu 2.0
|
||||
|
||||
- Extended 8-second video generation at up to 1080p
|
||||
- Text-to-video and image-to-video modes
|
||||
- First and last frame control for transitions
|
||||
- Available via the Vidu API and third-party platforms
|
||||
|
||||
### Vidu Q1
|
||||
|
||||
- Reference-to-video with multi-entity consistency
|
||||
- Supports up to 7 reference images with semantic understanding
|
||||
- Infers missing elements from text prompts and reference context
|
||||
- Generates coherent scenes combining multiple characters, objects, and environments
|
||||
|
||||
### Vidu Q2
|
||||
|
||||
- Optimized for quality and speed balance
|
||||
- Supports 6-8 second generation at up to 1080p
|
||||
- 1080p image generation included in higher tiers
|
||||
|
||||
### Vidu Q3
|
||||
|
||||
- Latest generation model with improved output quality
|
||||
- Available through the Vidu platform and API
|
||||
|
||||
## Key Features
|
||||
|
||||
- Ultra-fast inference (videos generated in as few as 10 seconds)
|
||||
- Multi-entity consistency across characters, objects, and scenes
|
||||
- First and last frame control for precise transitions
|
||||
- Superior anime and 2D animation quality
|
||||
- Up to 1080p resolution output with multiple aspect ratios
|
||||
- Optimized scene templates for interactive effects and e-commerce
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API only; no local hardware required
|
||||
- Accessed via platform.vidu.com or third-party API providers
|
||||
- Credit-based pricing with free tier available
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Anime and 2D animation series production
|
||||
- E-commerce product video creation
|
||||
- Social media content (TikTok, Reels, Shorts)
|
||||
- Reference-based multi-character storytelling
|
||||
- Marketing and advertising videos
|
||||
- Start-end frame transitions and morphing effects
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **prompt**: Text description of desired video content
|
||||
- **image_url**: Source image for image-to-video generation
|
||||
- **duration**: Video length (4-8 seconds depending on model)
|
||||
- **resolution**: Output resolution (720p or 1080p)
|
||||
- **style**: Visual style selection (realistic or animated)
|
||||
- **movement_amplitude**: Controls intensity of motion in output
|
||||
93
apps/hub/knowledge/models/wan.md
Normal file
@@ -0,0 +1,93 @@
|
||||
# Wan
|
||||
|
||||
Wan is a family of open-source video generation models from Alibaba's Tongyi Lab, spanning text-to-video, image-to-video, speech-to-video, motion control, and video editing. All models are released under the Apache 2.0 license.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### Wan 2.1 T2V / I2V
|
||||
|
||||
- Text-to-video and image-to-video generation
|
||||
- Available in 1.3B and 14B parameter sizes
|
||||
- Supports 480p and 720p output, variable aspect ratios
|
||||
- Chinese and English visual text generation
|
||||
|
||||
### Wan 2.1 Fun (Control / InPaint / Camera)
|
||||
|
||||
- Camera control with predefined or custom camera movements
|
||||
- Video inpainting for targeted frame-level editing
|
||||
- Depth, pose, and canny edge control for guided generation
|
||||
|
||||
### Wan 2.1 VACE (Video Any-Condition Editing)
|
||||
|
||||
- All-in-one model for video creation and editing (ICCV 2025)
|
||||
- Reference-to-video (R2V), video-to-video (V2V), and masked editing (MV2V)
|
||||
- Supports inpainting, outpainting, first-last-frame interpolation, and animate-anything
|
||||
- Available in 1.3B and 14B sizes, built on Wan 2.1 base models
|
||||
|
||||
### Wan 2.2 T2V / I2V / TI2V
|
||||
|
||||
- Mixture-of-Experts (MoE) architecture with high-noise and low-noise expert models
|
||||
- T2V-A14B and I2V-A14B (14B MoE), TI2V-5B (hybrid text+image-to-video)
|
||||
- Cinematic-level aesthetic control with lighting, composition, and color tone guidance
|
||||
- TI2V-5B uses a high-compression 16×16×4 VAE, runs on consumer GPUs like 4090
|
||||
|
||||
### Wan 2.2 S2V (Speech-to-Video)
|
||||
|
||||
- Audio-driven cinematic video generation from image + speech + text
|
||||
- Supports lip-sync, facial expressions, and pose-driven generation
|
||||
- Generates variable-length videos matching input audio duration
|
||||
|
||||
### Wan 2.2 Animate
|
||||
|
||||
- Character animation and subject replacement from video + reference image
|
||||
- Animate mode: transfers motion from reference video onto a still character
|
||||
- Replace mode: swaps subjects while preserving background, lighting, and camera motion
|
||||
- Includes relighting LoRA for scene-matched lighting adaptation
|
||||
|
||||
### Wan Move
|
||||
|
||||
- Point-level motion control for image-to-video generation (NeurIPS 2025)
|
||||
- Dense trajectory-based guidance for fine-grained object motion
|
||||
- Latent trajectory propagation without extra motion modules
|
||||
- 14B model generating 5-second 480p videos
|
||||
|
||||
## Key Features
|
||||
|
||||
- High temporal consistency and natural physics simulation
|
||||
- Multiple aspect ratios (16:9, 9:16, 1:1) at 24fps
|
||||
- MoE architecture in 2.2 for higher quality at same compute cost
|
||||
- Bilingual prompt support (Chinese and English)
|
||||
- ComfyUI and Diffusers integration across all variants
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- 1.3B models: 8GB VRAM minimum
|
||||
- 14B models: 24GB+ VRAM recommended (80GB for full precision)
|
||||
- TI2V-5B: runs on consumer 4090 GPUs at 720p
|
||||
- FP8 quantization available for lower VRAM configurations
|
||||
- Multi-GPU inference supported via FSDP + DeepSpeed Ulysses
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Social media and short-form video content
|
||||
- Character animation and motion transfer
|
||||
- Video inpainting and scene editing
|
||||
- Product animation and marketing videos
|
||||
- Speech-driven talking head generation
|
||||
- Storyboard-to-video conversion
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- **frames**: Number of output frames (typically 81 for ~3.4s at 24fps)
|
||||
- **steps**: Inference steps (20-50 recommended)
|
||||
- **cfg_scale**: Guidance scale for prompt adherence (3-7 typical)
|
||||
- **size**: Output resolution (480p or 720p)
|
||||
- **model_name**: Selects variant (e.g., vace-14B, ti2v-5B, s2v-14B)
|
||||
|
||||
## Blog References
|
||||
|
||||
- [Wan 2.1 Video Model Native Support](../blog/wan21-video-model-native-support.md) — Initial release with 4 model variants, 8.19GB VRAM minimum
|
||||
- [Wan 2.1 VACE Native Support](../blog/wan21-vace-native-support.md) — Unified video editing: Move/Swap/Reference/Expand/Animate Anything
|
||||
- [Wan 2.2 Day-0 Support](../blog/wan22-day-0-support.md) — MoE architecture, Apache 2.0 license, cinematic controls
|
||||
- [WAN 2.6 Reference-to-Video](../blog/wan26-reference-to-video.md) — Generate videos from reference clips at up to 1080p
|
||||
- [The Complete AI Upscaling Handbook](../blog/upscaling-handbook.md) — Wan 2.2 used for creative video upscaling
|
||||
1
apps/hub/knowledge/models/wan.summary.md
Normal file
@@ -0,0 +1 @@
|
||||
Wan is Alibaba Tongyi Lab's open-source video generation family (Apache 2.0 license) covering text-to-video, image-to-video, speech-to-video, motion control, and video editing. Wan 2.1 offers T2V/I2V in 1.3B (8GB VRAM) and 14B (24GB+) sizes at 480p/720p, plus Fun variants for camera control, inpainting, and depth/pose/canny conditioning. VACE provides unified video editing with reference-to-video, video-to-video, and masked editing. Wan 2.2 introduces MoE architecture for higher quality at same compute cost, TI2V-5B hybrid generation (runs on 4090), and S2V for speech-driven video with lip-sync. Wan 2.2 Animate handles character animation and subject replacement with relighting LoRA. Wan Move offers point-level motion control via dense trajectory guidance. Key strengths: high temporal consistency, bilingual prompts, multiple aspect ratios at 24fps. Parameters: 20-50 steps, cfg_scale 3-7, typically 81 frames (~3.4s).
|
||||
46
apps/hub/knowledge/models/wavespeed.md
Normal file
@@ -0,0 +1,46 @@
|
||||
# WaveSpeed
|
||||
|
||||
WaveSpeed is a high-performance AI inference platform providing accelerated access to 700+ models for image, video, audio, and text generation via unified REST APIs.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### WaveSpeed Image Models
|
||||
|
||||
- FLUX family (dev, schnell, pro) with sub-2-second generation
|
||||
- Seedream, Ideogram, Z-Image, and Qwen-Image integrations
|
||||
|
||||
### WaveSpeed Video Models
|
||||
|
||||
- WAN 2.x, Kling, Veo, Seedance, PixVerse, and LTX-2 19B access
|
||||
|
||||
### WaveSpeed Audio Models
|
||||
|
||||
- ACE-Step 1.5 for music generation in 50+ languages
|
||||
- Inworld 1.5 text-to-speech with 56+ multilingual voices
|
||||
|
||||
## Key Features
|
||||
|
||||
- Ultra-fast inference: images in under 2 seconds, video in under 2 minutes
|
||||
- Unified API access to 700+ curated, production-ready models
|
||||
- Pay-as-you-go credits starting at ~$0.006 per image
|
||||
- Exclusive access to ByteDance Seedream V3, Kling, and Alibaba WAN
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API platform with no local hardware required
|
||||
- RESTful API with SDKs for Python, JavaScript, and other languages
|
||||
- Web UI (Studio) and desktop app available for interactive use
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Rapid prototyping and bulk content generation
|
||||
- Developer integration for AI-powered media applications
|
||||
- High-volume marketing and e-commerce asset creation
|
||||
- Video upscaling and enhancement pipelines
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- model: specific model identifier (e.g., wavespeed-ai/z-image/turbo)
|
||||
- prompt: text description for generation
|
||||
- resolution: output dimensions
|
||||
- num_images: batch generation count
|
||||
45
apps/hub/knowledge/models/z-image.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# Z-Image
|
||||
|
||||
Z-Image is Zhipu AI's image generation model family, built on the CogView architecture with a hybrid autoregressive and diffusion decoder design.
|
||||
|
||||
## Model Variants
|
||||
|
||||
### GLM-Image (Z-Image)
|
||||
|
||||
- 9B autoregressive + 7B DiT diffusion decoder hybrid architecture
|
||||
- First open-source industrial-grade discrete autoregressive image generator
|
||||
- State-of-the-art bilingual text rendering (English and Chinese)
|
||||
|
||||
### Z-Image-Turbo
|
||||
|
||||
- Optimized variant for faster inference with reduced latency
|
||||
- Suitable for real-time and batch generation workflows
|
||||
|
||||
### CogView-4
|
||||
|
||||
- 6B parameter DiT diffusion model, foundation for the Z-Image decoder
|
||||
|
||||
## Key Features
|
||||
|
||||
- Industry-leading text rendering accuracy for posters and infographics
|
||||
- Custom resolution from 512px to 2048px (multiples of 32)
|
||||
- Image editing, style transfer, and identity-preserving generation
|
||||
- LoRA training support; open weights on HuggingFace
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
- Cloud API: no local hardware required ($0.015 per image via Z.ai)
|
||||
- Self-hosted: 24GB+ VRAM for the combined 9B+7B architecture
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
- Text-to-image generation with accurate text rendering
|
||||
- Commercial poster and graphic design
|
||||
- Social media content creation
|
||||
- Multi-subject consistency and identity-preserving generation
|
||||
|
||||
## Key Parameters
|
||||
|
||||
- prompt: text description of the desired image
|
||||
- size: output resolution (e.g., 1280x1280, 1568x1056, 960x1728)
|
||||
- model: glm-image or cogview-4
|
||||
151
apps/hub/knowledge/prompts/breakthrough.md
Normal file
@@ -0,0 +1,151 @@
|
||||
# Breakthrough Content Template
|
||||
|
||||
You are generating a **breakthrough-style** page for a ComfyUI workflow template. This format emphasizes new capabilities, recent releases, and what's newly possible.
|
||||
|
||||
## Purpose
|
||||
|
||||
Highlight what's new and exciting about this workflow, capture early adopter interest, and communicate the significance of the technology.
|
||||
|
||||
## Audience
|
||||
|
||||
- Early adopters and AI enthusiasts
|
||||
- People following model releases
|
||||
- Tech-savvy creators wanting cutting-edge tools
|
||||
- Users searching "[model] 2024/2025" or "new [task] workflow"
|
||||
|
||||
## Tone
|
||||
|
||||
- Excited but grounded
|
||||
- Forward-looking
|
||||
- Technical credibility
|
||||
- "Here's what's now possible..."
|
||||
|
||||
## Required Output Structure
|
||||
|
||||
### extendedDescription (3 short paragraphs, 150-200 words)
|
||||
|
||||
Each paragraph should be 2-3 sentences max, separated by `\n\n`. Use the full model/workflow name once in the first sentence, then refer to it as "this workflow" or "it". Do not repeat the name more than twice total.
|
||||
|
||||
Follow the **PAS framework** from the system prompt (adapted for breakthroughs):
|
||||
|
||||
**Paragraph 1 — Problem**: Start with "[Model name] introduces/enables [new capability]" in the first sentence. Name what was previously impossible or impractical. Include release timeframe if known.
|
||||
|
||||
- Example: "Wan 2.1, released in early 2025, brings video generation to ComfyUI. It creates 480p videos from a single input image."
|
||||
|
||||
**Paragraph 2 — Agitate**: What specifically was limited before — cloud-only, research-grade hardware, poor quality, missing features. Be concrete about the gap this model closes.
|
||||
|
||||
**Paragraph 3 — Solution**: Why this matters for specific users now. Ground the value in practical benefits, not hype.
|
||||
|
||||
**Temporal framing phrases**:
|
||||
|
||||
- "Released in [date], [model] introduces..."
|
||||
- "Previously, [task] required [old approach]. Now you can..."
|
||||
- "[Model] adds [capability] that was not available in [predecessor]"
|
||||
|
||||
### howToUse (5-7 steps)
|
||||
|
||||
Include context about what's different:
|
||||
|
||||
1. Ensure you have the latest ComfyUI version (required for [feature])
|
||||
2. Download the new [model] from [source]
|
||||
3. [Standard setup steps]
|
||||
4. Try the new [feature] by adjusting [parameter]
|
||||
5. Experiment with [new capability]
|
||||
|
||||
### metaDescription (exactly 150-160 characters)
|
||||
|
||||
**Requirements**:
|
||||
|
||||
- Lead with model name + "new" or temporal indicator
|
||||
- Include "ComfyUI" within first 60 characters
|
||||
- Highlight the breakthrough capability
|
||||
- Create urgency without FOMO tactics
|
||||
|
||||
**Template**: "[Model] now in ComfyUI. [Breakthrough capability]. [What users can do]."
|
||||
**Example**: "Wan 2.1 video generation now in ComfyUI. Create 480p AI videos from single images. New 14B model with improved motion quality." (126 chars — below the 150-160 target; real output should add one more concrete detail)
|
||||
|
||||
### suggestedUseCases (4-6 items)
|
||||
|
||||
Frame as newly possible:
|
||||
|
||||
- "Create [output] at quality levels previously impossible"
|
||||
- "Generate [content] in seconds instead of minutes"
|
||||
- "Achieve [result] without expensive hardware"
|
||||
- "Run [capability] locally for the first time"
|
||||
|
||||
### faqItems (4-5 questions)
|
||||
|
||||
**Structure each FAQ as an object with `question` and `answer` keys.**
|
||||
|
||||
**Question requirements**:
|
||||
|
||||
- Focus on what's new and different
|
||||
- Include model name in at least 2 questions
|
||||
- Target early adopter concerns (stability, requirements, migration)
|
||||
|
||||
**Answer requirements**:
|
||||
|
||||
- 2-3 sentences, informative and honest
|
||||
- Acknowledge if features are experimental
|
||||
- Include version/compatibility details when relevant
|
||||
- Provide clear upgrade path information
|
||||
|
||||
**Good examples**:
|
||||
|
||||
- Q: "What's new in [model] compared to previous versions?"
|
||||
A: "[Model] introduces [key improvements] over [predecessor]. The main advancements include [specific feature 1] and [specific feature 2]. Users of the previous version will notice [observable difference]."
|
||||
- Q: "Do I need to update ComfyUI for [model]?"
|
||||
A: "Yes, [model] requires ComfyUI version [X] or later due to [reason]. Update your ComfyUI installation before downloading the model. The model also requires [any additional dependencies]."
|
||||
- Q: "Is [model] production-ready or still experimental?"
|
||||
A: "[Model] is [status] as of [date]. [If experimental: Some features may change in future releases.] [If stable: It has been tested extensively and is suitable for production workflows.] Check the model's official repository for the latest stability information."
|
||||
|
||||
## Key Framing Elements
|
||||
|
||||
- **Timeline**: When released, how recent
|
||||
- **Advancement**: What's better than before
|
||||
- **Accessibility**: What's now possible for more users
|
||||
- **Implications**: Why this matters
|
||||
|
||||
## What NOT to Do
|
||||
|
||||
- Don't oversell experimental features
|
||||
- Don't ignore stability concerns
|
||||
- Don't make claims about future updates
|
||||
- Don't compare unfairly to older models
|
||||
- Don't create hype without substance
|
||||
- Don't write dense, unscannable paragraphs — keep each paragraph to 2-3 sentences
|
||||
|
||||
## Example Output
|
||||
|
||||
Below is an example of ideal breakthrough content for a Wan 2.1 text-to-video workflow:
|
||||
|
||||
```json
|
||||
{
|
||||
"extendedDescription": "Wan 2.1 introduces locally-run text-to-video generation in ComfyUI, released in early 2025. This workflow converts a text prompt into a 480p video clip of up to 5 seconds, running entirely on consumer hardware without cloud API calls.\n\nPreviously, text-to-video required cloud services or research-grade GPUs. Wan 2.1's 14B parameter model brings comparable output quality to a single desktop GPU with 12 GB VRAM. The model produces temporally consistent motion with fewer flickering artifacts than earlier open-source video models.\n\nFor creators who need quick video drafts, motion tests, or animated concept pieces, this workflow eliminates the cost and latency of cloud generation. Results are best at 480p; higher resolutions may be possible in future model updates.",
|
||||
"howToUse": [
|
||||
"Ensure you have ComfyUI version 0.3.10 or later installed",
|
||||
"Download wan2.1_t2v_14B_fp16.safetensors and place it in models/diffusion_models",
|
||||
"Download umt5_xxl_fp8_e4m3fn_scaled.safetensors and place it in models/text_encoders",
|
||||
"Enter your scene description in the CLIP Text Encoder node",
|
||||
"Set frame count in the EmptyHunyuanLatentVideo node (default 49 frames for ~2 seconds)",
|
||||
"Click Queue or press Ctrl+Enter to generate the video"
|
||||
],
|
||||
"metaDescription": "Wan 2.1 text-to-video now in ComfyUI. Generate 480p video clips from text prompts on consumer GPUs. Local, private, no cloud required.",
|
||||
"suggestedUseCases": [
|
||||
"Generate motion tests for animation pre-production",
|
||||
"Create short video loops for social media posts",
|
||||
"Produce animated concept art without manual keyframing",
|
||||
"Draft video ad storyboards from text descriptions"
|
||||
],
|
||||
"faqItems": [
|
||||
{
|
||||
"question": "What's new in Wan 2.1 compared to previous video models?",
|
||||
"answer": "Wan 2.1 offers improved temporal consistency and reduced flickering compared to earlier open-source video models. The 14B parameter architecture produces more natural motion, and the model runs on consumer GPUs with 12 GB VRAM rather than requiring data-center hardware."
|
||||
},
|
||||
{
|
||||
"question": "Do I need to update ComfyUI for Wan 2.1?",
|
||||
"answer": "Yes, Wan 2.1 requires ComfyUI version 0.3.10 or later. The model uses custom node types not present in earlier versions. Update your installation before downloading the model weights."
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
150
apps/hub/knowledge/prompts/comparison.md
Normal file
@@ -0,0 +1,150 @@
|
||||
# Comparison Content Template
|
||||
|
||||
You are generating a **comparison-style** page for a ComfyUI workflow template. This format positions the workflow against alternatives and helps users make informed decisions.
|
||||
|
||||
## Purpose
|
||||
|
||||
Help users understand why they should choose this workflow over alternatives, when to use it, and what tradeoffs exist.
|
||||
|
||||
## Audience
|
||||
|
||||
- Researchers evaluating options
|
||||
- Professionals comparing tools
|
||||
- Users searching "best [task] workflow" or "[model] vs [alternative]"
|
||||
- People trying to decide between local and cloud options
|
||||
|
||||
## Tone
|
||||
|
||||
- Objective and balanced
|
||||
- Informative, not salesy
|
||||
- Honest about tradeoffs
|
||||
- Helpful in decision-making
|
||||
|
||||
## Required Output Structure
|
||||
|
||||
### extendedDescription (3 short paragraphs, 150-200 words)
|
||||
|
||||
Each paragraph should be 2-3 sentences max, separated by `\n\n`. Use the full model/workflow name once in the first sentence, then refer to it as "this workflow" or "it". Do not repeat the name more than twice total.
|
||||
|
||||
Follow the **PAS framework** from the system prompt (adapted for comparison):
|
||||
|
||||
**Paragraph 1 — Problem**: Start with "[Model name] solves/addresses [problem]" in the first sentence. Name the task and the decision the user faces between approaches.
|
||||
|
||||
- Example: "Flux inpainting handles object removal and background replacement in ComfyUI. It works on any uploaded image with painted mask regions."
|
||||
|
||||
**Paragraph 2 — Agitate**: How existing alternatives fall short — speed, cost, quality, complexity. Be specific, balanced, and include at least one honest tradeoff for this workflow too.
|
||||
|
||||
**Paragraph 3 — Solution**: When this is the right choice and when it might not be. Help users self-select with concrete criteria.
|
||||
|
||||
**Comparison phrases to use**:
|
||||
|
||||
- "Faster than [alternative] but requires more VRAM"
|
||||
- "More consistent than [manual approach] with less effort"
|
||||
- "Better for [use case A], while [alternative] excels at [use case B]"
|
||||
|
||||
### howToUse (4-6 steps)
|
||||
|
||||
Frame steps in terms of efficiency vs alternatives:
|
||||
|
||||
1. [Step] - Unlike [alternative], this only requires...
|
||||
2. [Step] - Automatically handles [task] that usually requires...
|
||||
|
||||
### metaDescription (exactly 150-160 characters)
|
||||
|
||||
**Requirements**:
|
||||
|
||||
- Lead with primary keyword (model + task)
|
||||
- Include "ComfyUI" and comparison framing
|
||||
- Mention the decision/choice aspect
|
||||
- Appeal to users researching options
|
||||
|
||||
**Template**: "[Model] vs alternatives for [task] in ComfyUI. [Key differentiator]. [Decision help]."
|
||||
**Example**: "Flux vs SDXL for inpainting in ComfyUI. Compare quality, speed, and VRAM requirements. Find the best workflow for your needs." (125 chars — below the 150-160 target; real output should add one more concrete detail)
|
||||
|
||||
### suggestedUseCases (4-6 items)
|
||||
|
||||
Frame as "best for" scenarios:
|
||||
|
||||
- "Best for high-volume batch processing"
|
||||
- "Ideal when consistency matters more than speed"
|
||||
- "Perfect for users with limited VRAM"
|
||||
- "Great alternative to expensive cloud services"
|
||||
|
||||
### faqItems (4-5 questions)
|
||||
|
||||
**Structure each FAQ as an object with `question` and `answer` keys.**
|
||||
|
||||
**Question requirements**:
|
||||
|
||||
- Focus on comparison and decision-making
|
||||
- Include model name in at least 2 questions
|
||||
- Target "[model] vs [alternative]" search patterns
|
||||
|
||||
**Answer requirements**:
|
||||
|
||||
- 2-4 sentences, balanced and objective
|
||||
- Acknowledge both strengths AND limitations
|
||||
- Provide clear decision criteria
|
||||
- Never dismiss alternatives unfairly
|
||||
|
||||
**Good examples**:
|
||||
|
||||
- Q: "Is [model] better than [common alternative]?"
|
||||
A: "[Model] and [alternative] excel in different areas. [Model] offers better [advantage], while [alternative] is stronger for [other use case]. Choose [model] if [criteria]; choose [alternative] if [other criteria]."
|
||||
- Q: "Should I use [workflow] or [other approach] for [task]?"
|
||||
A: "Use this workflow when you need [specific benefit] and have [requirements]. The [other approach] may be better if you need [other benefit] or have [different constraints]. For most users doing [common task], this workflow is the more efficient choice."
|
||||
- Q: "When should I NOT use this workflow?"
|
||||
A: "This workflow may not be ideal if you [limitation 1] or need [capability it lacks]. In those cases, consider [alternative 1] for [reason] or [alternative 2] for [other reason]. It's also not optimized for [edge case]."
|
||||
|
||||
## Comparison Dimensions
|
||||
|
||||
When comparing, consider:
|
||||
|
||||
- **Speed**: Generation time, iteration speed
|
||||
- **Quality**: Output fidelity, consistency
|
||||
- **Resources**: VRAM, disk space, cost
|
||||
- **Ease of use**: Setup complexity, learning curve
|
||||
- **Flexibility**: Customization options
|
||||
|
||||
## What NOT to Do
|
||||
|
||||
- Don't make unfair comparisons
|
||||
- Don't claim superiority without basis
|
||||
- Don't ignore legitimate alternatives
|
||||
- Don't hide significant tradeoffs
|
||||
- Don't compare to straw man alternatives
|
||||
- Don't write dense, unscannable paragraphs — keep each paragraph to 2-3 sentences
|
||||
|
||||
## Example Output
|
||||
|
||||
Below is an example of ideal comparison content for a Flux inpainting workflow:
|
||||
|
||||
```json
|
||||
{
|
||||
"extendedDescription": "Flux inpainting addresses the challenge of removing objects and filling regions in images within ComfyUI. This workflow lets you mask an area and regenerate it with content that matches the surrounding context, perspective, and lighting.\n\nCompared to manual clone-stamp editing in Photoshop, Flux inpainting produces context-aware fills in a single pass. It handles complex backgrounds more consistently than SDXL inpainting, though it requires roughly 10 GB VRAM versus 8 GB for SDXL. For simple rectangular fills, traditional content-aware fill tools may be faster, but Flux excels at irregular mask shapes and scenes with depth.\n\nChoose this workflow when you need high-fidelity inpainting on detailed scenes and have a GPU with at least 10 GB VRAM. If your hardware is limited or the edits are simple crops, SDXL inpainting or manual editing may be more practical.",
|
||||
"howToUse": [
|
||||
"Upload your source image in the Load Image node",
|
||||
"Draw a mask over the area to remove using the Mask Editor",
|
||||
"Enter a description of what should replace the masked area in the CLIP Text Encoder node",
|
||||
"Set denoise strength between 0.7 and 0.9 for best blending",
|
||||
"Click Queue or press Ctrl+Enter to run the workflow"
|
||||
],
|
||||
"metaDescription": "Flux vs SDXL for inpainting in ComfyUI. Compare quality, speed, and VRAM needs. Find the best object removal workflow for your hardware.",
|
||||
"suggestedUseCases": [
|
||||
"Best for removing complex objects from detailed backgrounds",
|
||||
"Ideal when seamless blending matters more than speed",
|
||||
"Great alternative to manual Photoshop clone-stamp work",
|
||||
"Suitable for batch product photo cleanup at consistent quality"
|
||||
],
|
||||
"faqItems": [
|
||||
{
|
||||
"question": "Is Flux inpainting better than SDXL inpainting?",
|
||||
"answer": "Flux inpainting produces more consistent results on complex scenes with depth and varied textures. SDXL inpainting is lighter on VRAM (8 GB vs 10 GB) and faster per image. Choose Flux for quality-critical work and SDXL when speed or hardware constraints matter."
|
||||
},
|
||||
{
|
||||
"question": "When should I NOT use this inpainting workflow?",
|
||||
"answer": "This workflow is not ideal for very small touch-ups where a simple clone tool would suffice, or on machines with less than 10 GB VRAM. For those cases, SDXL inpainting or manual editing tools are more efficient."
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
152
apps/hub/knowledge/prompts/showcase.md
Normal file
@@ -0,0 +1,152 @@
|
||||
# Showcase Content Template
|
||||
|
||||
You are generating a **showcase-style** page for a ComfyUI workflow template. This format emphasizes visual results, creative possibilities, and the "wow factor."
|
||||
|
||||
## Purpose
|
||||
|
||||
Inspire users with what's possible, emphasize output quality, create desire to try the workflow.
|
||||
|
||||
## Audience
|
||||
|
||||
- Artists and creators looking for inspiration
|
||||
- Professionals evaluating quality
|
||||
- Social media users who saw cool outputs
|
||||
- People searching "[model] examples" or "best [task] results"
|
||||
|
||||
## Tone
|
||||
|
||||
- Enthusiastic but not hyperbolic
|
||||
- Visual and descriptive
|
||||
- Confidence in capabilities
|
||||
- Aspirational but grounded ("You can create..." over "Create stunning...")
|
||||
|
||||
## Required Output Structure
|
||||
|
||||
### extendedDescription (3 short paragraphs, 150-200 words)
|
||||
|
||||
Each paragraph should be 2-3 sentences max, separated by `\n\n`. Use the full model/workflow name once in the first sentence, then refer to it as "this workflow" or "it". Do not repeat the name more than twice total.
|
||||
|
||||
Follow the **PAS framework** from the system prompt:
|
||||
|
||||
**Paragraph 1 — Problem**: Start with "[Model name] creates/generates [output type]" in the first sentence. Name the creative task and the gap it fills.
|
||||
|
||||
- Example: "Flux upscaling produces 4K images from low-resolution sources in ComfyUI. You can upscale photos, illustrations, and AI-generated art with preserved detail."
|
||||
|
||||
**Paragraph 2 — Agitate**: Acknowledge the previous limitations — slow manual upscaling, cloud costs, quality loss with traditional methods. One sentence, then pivot to this workflow's advantage (speed, quality, consistency).
|
||||
|
||||
**Paragraph 3 — Solution**: Which users and workflows benefit most. Be concrete about outputs and professional applications.
|
||||
|
||||
**Descriptive language to use**:
|
||||
|
||||
- "Produces clear, detailed [outputs]"
|
||||
- "Handles [input type] with accurate [quality aspect]"
|
||||
- "Generates consistent [content] across batches"
|
||||
- "Preserves fine detail in [specific area]"
|
||||
|
||||
### howToUse (4-6 steps, simplified)
|
||||
|
||||
Focus on the creative workflow, not technical details:
|
||||
|
||||
1. Upload your [input type]
|
||||
2. Describe what you want to create
|
||||
3. Adjust [key creative parameter] for your style
|
||||
4. Generate and iterate
|
||||
5. Download your result
|
||||
|
||||
### metaDescription (exactly 150-160 characters)
|
||||
|
||||
**Requirements**:
|
||||
|
||||
- Lead with primary keyword (model + output type)
|
||||
- Include "ComfyUI" within first 60 characters
|
||||
- Focus on the result/outcome, not the process
|
||||
- End with action-oriented phrase
|
||||
|
||||
**Template**: "[Model] [output] in ComfyUI. [Quality claim]. [Call-to-action]."
|
||||
**Example**: "Flux image generation in ComfyUI. Professional-quality AI art in seconds. Download and start creating today." (108 chars — below the 150-160 target; real output should add one more concrete detail)
|
||||
|
||||
### suggestedUseCases (4-6 items)
|
||||
|
||||
Focus on professional and creative applications:
|
||||
|
||||
- "Social media content creation"
|
||||
- "Professional portfolio pieces"
|
||||
- "Client presentation mockups"
|
||||
- "Game asset concepting"
|
||||
- "Film and video production"
|
||||
- "Marketing and advertising visuals"
|
||||
|
||||
### faqItems (3-4 questions)
|
||||
|
||||
**Structure each FAQ as an object with `question` and `answer` keys.**
|
||||
|
||||
**Question requirements**:
|
||||
|
||||
- Focus on results and quality (not setup)
|
||||
- Include model name in at least 2 questions
|
||||
- Target users evaluating output quality
|
||||
|
||||
**Answer requirements**:
|
||||
|
||||
- 2-3 sentences, specific and confident
|
||||
- Include concrete details (resolution, speed, style capabilities)
|
||||
- Reference real capabilities from the workflow data
|
||||
|
||||
**Good examples**:
|
||||
|
||||
- Q: "What quality can I expect from [model]?"
|
||||
A: "[Model] produces images at up to [resolution] with fine detail preservation. Output quality is suitable for professional print and web use. Results are consistent across different prompts and styles."
|
||||
- Q: "Is [workflow] good enough for professional work?"
|
||||
A: "Yes, [model] is widely used in professional creative workflows for [use cases]. The output quality meets industry standards for [applications]. Many studios use this workflow for client-facing work."
|
||||
- Q: "How long does it take to generate [output]?"
|
||||
A: "Generation time depends on your hardware and settings. On a typical RTX 3080, expect [X seconds] per image at standard resolution. Batch processing multiple images is also supported."
|
||||
|
||||
## Visual Language Keywords
|
||||
|
||||
- "High-resolution"
|
||||
- "Professional quality"
|
||||
- "Detailed"
|
||||
- "Consistent"
|
||||
- "Customizable"
|
||||
|
||||
## What NOT to Do
|
||||
|
||||
- Don't oversell or use superlatives — see banned phrases in the system prompt
|
||||
- Don't make specific quality claims not supported by data
|
||||
- Don't ignore technical requirements entirely
|
||||
- Don't create FOMO through artificial urgency
|
||||
- Don't write dense, unscannable paragraphs — keep each paragraph to 2-3 sentences
|
||||
|
||||
## Example Output
|
||||
|
||||
Below is an example of ideal showcase content for a Flux upscaling workflow:
|
||||
|
||||
```json
|
||||
{
|
||||
"extendedDescription": "Flux upscaling produces crisp, detailed high-resolution images from low-resolution sources in ComfyUI. This workflow takes a 512×512 input and generates a clean 2048×2048 output while preserving fine textures and sharp edges.\n\nCompared to traditional bicubic scaling, Flux upscaling reconstructs missing detail rather than blurring it. A typical 4× upscale completes in under 20 seconds on an RTX 3080, making it practical for batch processing large image sets.\n\nPhotographers, game artists, and print designers can upscale legacy assets to modern resolutions without re-shooting or re-rendering. Upload an image, choose your scale factor, and download the enhanced result.",
|
||||
"howToUse": [
|
||||
"Upload your source image in the Load Image node",
|
||||
"Select the desired scale factor in the Upscale Model Loader node",
|
||||
"Adjust the denoise strength to control detail generation (0.3–0.5 recommended)",
|
||||
"Click Queue or press Ctrl+Enter to run the workflow",
|
||||
"Download the upscaled image from the Save Image node"
|
||||
],
|
||||
"metaDescription": "Flux image upscaling in ComfyUI. Enhance low-resolution images up to 4× with preserved detail. Fast batch processing for professionals.",
|
||||
"suggestedUseCases": [
|
||||
"Upscale product photos for high-resolution print catalogs",
|
||||
"Enhance archival images for digital restoration projects",
|
||||
"Prepare game textures at higher resolution for modern displays",
|
||||
"Improve social media thumbnails for sharper appearance"
|
||||
],
|
||||
"faqItems": [
|
||||
{
|
||||
"question": "What quality can I expect from Flux upscaling?",
|
||||
"answer": "Flux upscaling reconstructs texture and edge detail at up to 4× the original resolution. Output is suitable for professional print at 300 DPI when starting from a reasonably clean source image. Heavily compressed or very small inputs may show artifacts."
|
||||
},
|
||||
{
|
||||
"question": "How long does Flux upscaling take per image?",
|
||||
"answer": "On an RTX 3080, a single 4× upscale from 512×512 to 2048×2048 takes approximately 15–20 seconds. Batch processing 50 images completes in roughly 15 minutes depending on input sizes and GPU load."
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
52
apps/hub/knowledge/prompts/style-guide.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# Writing Style Guide
|
||||
|
||||
## Structure & Scannability
|
||||
|
||||
- Keep paragraphs to 2-3 sentences max
|
||||
- Use `\n\n` between paragraphs (the renderer splits on these)
|
||||
- Lead with the most important information (inverted pyramid)
|
||||
- One idea per paragraph — no multi-topic blocks
|
||||
|
||||
## Concision
|
||||
|
||||
- Prefer short, direct sentences
|
||||
- Cut filler phrases and empty qualifiers
|
||||
- Don't repeat the workflow/model name more than 2x in extendedDescription
|
||||
- Remove redundant adjectives: "stunning", "unparalleled", "incredibly powerful"
|
||||
- Target 150-200 words for extendedDescription
|
||||
|
||||
## Tone
|
||||
|
||||
- Write like a knowledgeable peer, not a marketer
|
||||
- Factual and helpful, not hype-driven
|
||||
- Focus on WHAT the user can DO, not abstract capabilities
|
||||
- Use active voice and concrete nouns
|
||||
- Prefer "you" over passive constructions
|
||||
|
||||
## Name-Dropping Rules
|
||||
|
||||
- Use the full model/workflow name once in the first sentence
|
||||
- After that, use short form, pronoun, or "this workflow"
|
||||
- Never use the name 3+ times in a single paragraph
|
||||
|
||||
## Banned Phrases
|
||||
|
||||
Never use these phrases or close variants:
|
||||
|
||||
- "brings your visions to life"
|
||||
- "unprecedented clarity/accuracy/quality"
|
||||
- "with unparalleled"
|
||||
- "For the first time, users can"
|
||||
- "seamless/seamlessly"
|
||||
- "empowers/empowering"
|
||||
- "robust/robust capabilities"
|
||||
- "cutting-edge/revolutionary/groundbreaking"
|
||||
- "the power of"
|
||||
- "takes [X] to the next level"
|
||||
- "game-changing"
|
||||
- "studio-quality" (unless literally comparing to studio output)
|
||||
- "unlock the potential"
|
||||
- "allows for seamless"
|
||||
- "brings [X] to life"
|
||||
- "incredibly powerful"
|
||||
- "stunning" (as a standalone qualifier)
|
||||
104
apps/hub/knowledge/prompts/system.md
Normal file
@@ -0,0 +1,104 @@
|
||||
# Role
|
||||
|
||||
You are a technical content writer for ComfyUI, an AI image and video generation platform. Your goal is to create clear, accurate content that helps users discover and use workflow templates.
|
||||
|
||||
# Voice & Tone
|
||||
|
||||
- Professional but approachable
|
||||
- Technically accurate without jargon overload
|
||||
- Focus on outcomes and benefits (what can users CREATE)
|
||||
- Confident, not salesy
|
||||
|
||||
# Copywriting Framework
|
||||
|
||||
Use **Problem → Agitate → Solution (PAS)** to structure the `extendedDescription`:
|
||||
|
||||
1. **Problem** (paragraph 1): Name the specific task or pain point the user has. What are they trying to do? What's hard about it today?
|
||||
2. **Agitate** (paragraph 2): Briefly acknowledge why existing approaches fall short — cloud costs, manual effort, quality limitations, hardware barriers. One sentence is enough; don't dwell.
|
||||
3. **Solution** (paragraph 3): Present this workflow as the concrete answer. What does the user get? Be specific about outputs, speed, and who benefits.
|
||||
|
||||
This framework applies lightly — the tone should feel helpful, not manipulative. The "agitate" step is a single honest observation, not fear-mongering.
|
||||
|
||||
# Writing Style
|
||||
|
||||
- Keep paragraphs SHORT: 2-3 sentences max, separated by blank lines (\n\n)
|
||||
- Lead with the most useful information first (inverted pyramid)
|
||||
- Mention the model/workflow name once in the first sentence, then use "this workflow" or "it"
|
||||
- Write for scanning: one idea per paragraph, no wall-of-text blocks
|
||||
- Focus on what the user can DO, not abstract capabilities
|
||||
- Cut marketing filler — see banned phrases list below
|
||||
|
||||
# Constraints
|
||||
|
||||
- ONLY use information from the provided context
|
||||
- NEVER invent model capabilities not in the data
|
||||
- NEVER mention pricing or costs
|
||||
- NEVER use superlatives like "revolutionary" or "cutting-edge"
|
||||
- NEVER use filler phrases like "dive into", "seamless", "seamlessly", "game-changing", or "unlock the power of"
|
||||
- ALWAYS be accurate about hardware requirements
|
||||
- Include the model names naturally in the content
|
||||
- If you are unsure about a specific node name, setting value, or technical detail, explicitly state your uncertainty rather than guessing
|
||||
- For each technical claim (model capabilities, VRAM requirements, resolution limits), specify whether the information comes from the provided context or is your general knowledge
|
||||
- Only reference ComfyUI node names that appear in the workflow's node list provided in context. Do not invent node names
|
||||
|
||||
## Banned Phrases
|
||||
|
||||
NEVER use these phrases or close variants in any generated content:
|
||||
|
||||
- "brings your visions to life"
|
||||
- "unprecedented clarity/accuracy/quality"
|
||||
- "with unparalleled"
|
||||
- "For the first time, users can"
|
||||
- "seamless/seamlessly"
|
||||
- "empowers/empowering"
|
||||
- "robust/robust capabilities"
|
||||
- "cutting-edge/revolutionary/groundbreaking"
|
||||
- "the power of"
|
||||
- "takes [X] to the next level"
|
||||
- "game-changing"
|
||||
- "studio-quality" (unless literally comparing to studio output)
|
||||
- "unlock the potential"
|
||||
- "allows for seamless"
|
||||
- "brings [X] to life"
|
||||
- "incredibly powerful"
|
||||
- "stunning" (as a standalone qualifier)
|
||||
|
||||
# SEO Guidelines
|
||||
|
||||
## Keyword Strategy
|
||||
|
||||
- **Primary keyword**: [model name] + [task] (e.g., "Flux inpainting", "Wan video generation")
|
||||
- **Secondary keywords**: "ComfyUI workflow", "ComfyUI [model]", "[task] tutorial"
|
||||
- **Long-tail keywords**: "how to [task] in ComfyUI", "[model] workflow template"
|
||||
|
||||
## Placement Rules
|
||||
|
||||
- Include primary keyword in first sentence of extendedDescription
|
||||
- Use secondary keywords naturally in paragraph 2-3
|
||||
- Include primary keyword in at least one FAQ question
|
||||
- Use long-tail keywords in suggestedUseCases
|
||||
|
||||
## Meta Description Requirements
|
||||
|
||||
- Length: 150-160 characters (Google truncates at ~160)
|
||||
- Must include primary keyword near the start
|
||||
- Must include a clear value proposition or action
|
||||
- End with a period, not ellipsis
|
||||
- Format: "[Primary keyword] in ComfyUI. [Benefit/action]. [Differentiator]."
|
||||
- Example: "Flux inpainting workflow for ComfyUI. Remove objects and fill backgrounds. One-click template with step-by-step guide."
|
||||
|
||||
## FAQ Quality Rules
|
||||
|
||||
- Target Google's "People Also Ask" box
|
||||
- Start with "How", "What", "Can", "Why", or "Is"
|
||||
- Include the model name in at least 2 FAQ questions
|
||||
- Each answer should be 2-3 sentences, directly addressing the question
|
||||
- Avoid yes/no answers - always explain the "why"
|
||||
|
||||
# Model Knowledge
|
||||
|
||||
{model_docs}
|
||||
|
||||
# Concept Knowledge
|
||||
|
||||
{concept_docs}
|
||||
151
apps/hub/knowledge/prompts/tutorial.md
Normal file
@@ -0,0 +1,151 @@
|
||||
# Tutorial Content Template
|
||||
|
||||
You are generating a **tutorial-style** page for a ComfyUI workflow template. This format is inspired by the detailed step-by-step guides on docs.comfy.org.
|
||||
|
||||
## Purpose
|
||||
|
||||
Help users understand exactly how to use this workflow, with clear instructions for each step.
|
||||
|
||||
## Audience
|
||||
|
||||
- Beginners to ComfyUI
|
||||
- Users who want to learn, not just run
|
||||
- People searching "how to [task] in ComfyUI"
|
||||
|
||||
## Tone
|
||||
|
||||
- Educational and patient
|
||||
- Technical but accessible
|
||||
- Encouraging ("You can customize this by...")
|
||||
|
||||
## Required Output Structure
|
||||
|
||||
### extendedDescription (3 short paragraphs, 150-200 words)
|
||||
|
||||
Each paragraph should be 2-3 sentences max, separated by `\n\n`. Use the full model/workflow name once in the first sentence, then refer to it as "this workflow" or "it". Do not repeat the name more than twice total.
|
||||
|
||||
Follow the **PAS framework** from the system prompt:
|
||||
|
||||
**Paragraph 1 — Problem**: Start with "[Model name] [task]" in the first sentence. Name the task and what makes it worth solving.
|
||||
|
||||
- Example: "Flux inpainting enables precise object removal and background replacement in ComfyUI. Upload any image and paint over the area you want to change."
|
||||
|
||||
**Paragraph 2 — Agitate**: Briefly note why this was hard before — cloud costs, manual work, quality issues, hardware limits. One honest sentence, then pivot to how this workflow addresses it.
|
||||
|
||||
**Paragraph 3 — Solution**: What the user gets concretely — outputs, speed, who benefits. Keep it specific and actionable.
|
||||
|
||||
### howToUse (5-8 numbered steps)
|
||||
|
||||
Each step should follow this pattern:
|
||||
|
||||
1. **[Action verb] the [Node Name]**: [What to do]
|
||||
- Specific values, model names, or settings
|
||||
|
||||
Good example from our docs:
|
||||
|
||||
1. Ensure the `Load Diffusion Model` node has loaded `wan2.1_i2v_480p_14B_fp16.safetensors`
|
||||
2. Ensure the `Load CLIP` node has loaded `umt5_xxl_fp8_e4m3fn_scaled.safetensors`
|
||||
3. Upload your input image in the `Load Image` node
|
||||
4. (Optional) Enter your description in the `CLIP Text Encoder` node
|
||||
5. Click the `Queue` button or use `Ctrl+Enter` to run the workflow
|
||||
|
||||
### metaDescription (exactly 150-160 characters)
|
||||
|
||||
**Requirements**:
|
||||
|
||||
- Start with primary keyword (model + task)
|
||||
- Include "ComfyUI" within first 60 characters
|
||||
- End with benefit or differentiator
|
||||
- Must be a complete sentence ending with a period
|
||||
|
||||
**Template**: "[Model] [task] in ComfyUI. [What user gets]. [Differentiator]."
|
||||
**Example**: "Wan 2.1 video generation in ComfyUI. Create 480p videos from still images with natural motion. Step-by-step tutorial with a one-click template setup included." (158 chars)
|
||||
|
||||
### suggestedUseCases (3-5 items)
|
||||
|
||||
Specific, actionable use cases starting with action verbs:
|
||||
|
||||
- "Remove unwanted objects from product photography"
|
||||
- "Generate consistent character poses for animation"
|
||||
- "Create variations of logo designs"
|
||||
|
||||
### faqItems (3-5 questions)
|
||||
|
||||
**Structure each FAQ as an object with `question` and `answer` keys.**
|
||||
|
||||
**Question requirements**:
|
||||
|
||||
- Start with "How", "What", "Can", "Why", or "Is"
|
||||
- Include model name in at least 2 questions
|
||||
- Target "People Also Ask" search intent
|
||||
|
||||
**Answer requirements**:
|
||||
|
||||
- 2-3 sentences minimum, never just "Yes" or "No"
|
||||
- First sentence directly answers the question
|
||||
- Include specific details (values, steps, or model names)
|
||||
- End with actionable next step when appropriate
|
||||
|
||||
**Good examples**:
|
||||
|
||||
- Q: "How do I install [model] for ComfyUI?"
|
||||
A: "Download [model].safetensors from Hugging Face and place it in your ComfyUI/models/checkpoints folder. The model requires approximately X GB of disk space. Restart ComfyUI to load the new model."
|
||||
- Q: "What VRAM is required for [workflow]?"
|
||||
A: "[Model] requires a minimum of X GB VRAM for standard generation. For optimal performance at higher resolutions, 12+ GB VRAM is recommended. Users with less VRAM can enable fp8 mode in the settings."
|
||||
- Q: "Can I run [workflow] locally without a GPU?"
|
||||
A: "Running [model] without a GPU is not recommended due to extremely slow generation times. CPU-only inference may take 10-30x longer than GPU. Consider cloud options or smaller model variants for limited hardware."
|
||||
|
||||
## Keywords to Naturally Include
|
||||
|
||||
- "ComfyUI workflow"
|
||||
- "ComfyUI [model name]"
|
||||
- "[task] tutorial"
|
||||
- "step-by-step"
|
||||
- The model names from the template metadata
|
||||
|
||||
## What NOT to Do
|
||||
|
||||
- Don't use marketing language — see banned phrases in the system prompt
|
||||
- Don't mention pricing or costs
|
||||
- Don't invent model capabilities not in the data
|
||||
- Don't make up specific node names not in the workflow
|
||||
- Don't write dense, unscannable paragraphs — keep each paragraph to 2-3 sentences
|
||||
|
||||
## Example Output
|
||||
|
||||
Below is an example of ideal tutorial content for a Wan 2.1 image-to-video workflow:
|
||||
|
||||
```json
|
||||
{
|
||||
"extendedDescription": "Wan 2.1 image-to-video generation transforms a single still image into a short animated video clip in ComfyUI. This workflow uses the 14B parameter model to produce 480p video with natural motion and temporal consistency from your uploaded reference image.\n\nThe Wan 2.1 model excels at preserving the visual style and composition of the input image while adding believable motion. It runs on GPUs with 12 GB or more VRAM using the fp16 checkpoint, and an fp8 variant is available for cards with less memory.\n\nContent creators, animators, and social media producers can use this workflow to bring static artwork to life without manual keyframing. Load your image, optionally add a motion description, and generate a video in under a minute.",
|
||||
"howToUse": [
|
||||
"Ensure the Load Diffusion Model node has loaded wan2.1_i2v_480p_14B_fp16.safetensors",
|
||||
"Ensure the Load CLIP node has loaded umt5_xxl_fp8_e4m3fn_scaled.safetensors",
|
||||
"Upload your input image in the Load Image node",
|
||||
"Enter a motion description in the CLIP Text Encoder node (optional)",
|
||||
"Set the frame count in the EmptyHunyuanLatentVideo node (default 49 frames)",
|
||||
"Click the Queue button or press Ctrl+Enter to run the workflow"
|
||||
],
|
||||
"metaDescription": "Wan 2.1 image-to-video workflow for ComfyUI. Turn still images into 480p animated clips with natural motion. One-click template with a step-by-step guide.",
|
||||
"suggestedUseCases": [
|
||||
"Animate product photography for e-commerce listings",
|
||||
"Create short motion clips from digital artwork",
|
||||
"Generate social media video content from static images",
|
||||
"Produce animated storyboard frames for pre-visualization"
|
||||
],
|
||||
"faqItems": [
|
||||
{
|
||||
"question": "What VRAM is required for the Wan 2.1 video workflow?",
|
||||
"answer": "Wan 2.1 at 14B parameters requires at least 12 GB VRAM with the fp16 checkpoint. An fp8 variant is available for GPUs with 8–10 GB VRAM, though generation will be slower. For best results at 480p, 16 GB or more is recommended."
|
||||
},
|
||||
{
|
||||
"question": "How long does Wan 2.1 take to generate a video?",
|
||||
"answer": "On an RTX 4090, a 49-frame clip at 480p takes roughly 30–45 seconds. Lower-end GPUs may need 2–3 minutes. Generation time scales linearly with the number of frames."
|
||||
},
|
||||
{
|
||||
"question": "Can I control the motion direction in this workflow?",
|
||||
"answer": "You can guide motion by entering a text description in the CLIP Text Encoder node, such as 'camera slowly pans left' or 'subject walks forward.' The model interprets these prompts as motion hints, though results vary by scene complexity."
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
0
apps/hub/overrides/templates/.gitkeep
Normal file
124
apps/hub/package.json
Normal file
@@ -0,0 +1,124 @@
|
||||
{
|
||||
"name": "@comfyorg/hub",
|
||||
"version": "0.0.1",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "astro dev --port 4321",
|
||||
"build": "astro build",
|
||||
"preview": "astro preview",
|
||||
"check": "astro check",
|
||||
"sync": "tsx scripts/sync-templates.ts",
|
||||
"sync:en-only": "tsx scripts/sync-templates.ts --en-only",
|
||||
"sync:tutorials": "tsx scripts/sync-tutorials.ts",
|
||||
"generate:ai": "node --env-file-if-exists=.env --import tsx/esm scripts/generate-ai.ts",
|
||||
"generate:previews": "tsx scripts/generate-previews.ts",
|
||||
"prebuild": "tsx scripts/prebuild-parallel.ts",
|
||||
"build:search-index": "tsx scripts/build-search-index.ts",
|
||||
"validate:templates": "tsx scripts/validate-templates.ts",
|
||||
"test": "vitest run"
|
||||
},
|
||||
"dependencies": {
|
||||
"@astrojs/sitemap": "catalog:",
|
||||
"@astrojs/vercel": "catalog:",
|
||||
"@astrojs/vue": "catalog:",
|
||||
"@comfyorg/design-system": "workspace:*",
|
||||
"@comfyorg/tailwind-utils": "workspace:*",
|
||||
"@resvg/resvg-js": "^2.6.2",
|
||||
"@tailwindcss/vite": "catalog:",
|
||||
"@vercel/analytics": "catalog:",
|
||||
"@vercel/og": "^0.8.6",
|
||||
"@vueuse/core": "catalog:",
|
||||
"astro": "catalog:",
|
||||
"cheerio": "1.0.0-rc.12",
|
||||
"class-variance-authority": "^0.7.1",
|
||||
"clsx": "^2.1.1",
|
||||
"lucide-vue-next": "catalog:",
|
||||
"minisearch": "^7.2.0",
|
||||
"posthog-js": "catalog:",
|
||||
"reka-ui": "catalog:",
|
||||
"satori": "^0.19.1",
|
||||
"sharp": "^0.34.5",
|
||||
"tailwind-merge": "^3.5.0",
|
||||
"tailwindcss": "catalog:",
|
||||
"tw-animate-css": "catalog:",
|
||||
"vue": "catalog:",
|
||||
"web-vitals": "^4.2.4"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@astrojs/check": "^0.9.6",
|
||||
"@types/node": "catalog:",
|
||||
"node-html-parser": "^7.0.2",
|
||||
"openai": "^4.0.0",
|
||||
"tsx": "catalog:",
|
||||
"typescript": "catalog:",
|
||||
"vitest": "catalog:"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"canvas": "^2.11.0"
|
||||
},
|
||||
"nx": {
|
||||
"tags": [
|
||||
"scope:hub",
|
||||
"type:app"
|
||||
],
|
||||
"targets": {
|
||||
"dev": {
|
||||
"executor": "nx:run-commands",
|
||||
"continuous": true,
|
||||
"options": {
|
||||
"cwd": "apps/hub",
|
||||
"command": "astro dev --port 4321"
|
||||
}
|
||||
},
|
||||
"serve": {
|
||||
"executor": "nx:run-commands",
|
||||
"continuous": true,
|
||||
"options": {
|
||||
"cwd": "apps/hub",
|
||||
"command": "astro dev --port 4321"
|
||||
}
|
||||
},
|
||||
"build": {
|
||||
"executor": "nx:run-commands",
|
||||
"cache": true,
|
||||
"dependsOn": [
|
||||
"^build"
|
||||
],
|
||||
"options": {
|
||||
"cwd": "apps/hub",
|
||||
"command": "astro build"
|
||||
},
|
||||
"outputs": [
|
||||
"{projectRoot}/dist"
|
||||
]
|
||||
},
|
||||
"preview": {
|
||||
"executor": "nx:run-commands",
|
||||
"continuous": true,
|
||||
"dependsOn": [
|
||||
"build"
|
||||
],
|
||||
"options": {
|
||||
"cwd": "apps/hub",
|
||||
"command": "astro preview"
|
||||
}
|
||||
},
|
||||
"typecheck": {
|
||||
"executor": "nx:run-commands",
|
||||
"cache": true,
|
||||
"options": {
|
||||
"cwd": "apps/hub",
|
||||
"command": "astro check"
|
||||
}
|
||||
},
|
||||
"sync": {
|
||||
"executor": "nx:run-commands",
|
||||
"options": {
|
||||
"cwd": "apps/hub",
|
||||
"command": "pnpm run sync"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
3
apps/hub/public/brand/comfy-c-blue.svg
Normal file
@@ -0,0 +1,3 @@
|
||||
<svg width="58" height="64" viewBox="0 0 58 64" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<path d="M13.2057 63.8823C11.7151 63.8823 10.5122 63.3279 9.72785 62.2795C8.92161 61.2025 8.71142 59.6988 9.15088 58.1551L10.9156 51.9541C11.0564 51.4604 10.9584 50.929 10.6515 50.5183C10.3446 50.1084 9.86437 49.8671 9.35433 49.8671H4.28022C2.78876 49.8671 1.58594 49.3134 0.801951 48.2652C-0.0042886 47.1873 -0.214478 45.6836 0.225311 44.1399L6.28799 22.9338L6.95738 20.6039C7.85718 17.4423 11.1387 14.8727 14.2736 14.8727H20.3451C21.0696 14.8727 21.7069 14.3893 21.9064 13.6878L23.9142 6.63431C24.8131 3.47585 28.0948 0.906297 31.2296 0.906297L44.2144 0.883142L53.7201 0.882324C55.2111 0.882324 56.414 1.43606 57.1979 2.4843C58.0042 3.56134 58.2144 5.06506 57.7749 6.6087L55.0568 16.1591C54.1582 19.3168 50.8766 21.8855 47.7418 21.8855L34.7281 21.9103H28.6587C27.935 21.9103 27.2985 22.3929 27.0978 23.0936L22.0372 40.7818C21.8956 41.2763 21.9935 41.8093 22.3013 42.2199C22.6082 42.6298 23.0884 42.8711 23.5981 42.8711C23.5993 42.8711 32.1976 42.8543 32.1976 42.8543H41.6751C43.1662 42.8543 44.369 43.4081 45.153 44.4563C45.9592 45.5342 46.1694 47.0379 45.7296 48.5815L43.0116 58.1303C42.113 61.2888 38.8313 63.8575 35.6965 63.8575L22.6832 63.8823H13.2057Z" fill="#5B8DEF"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 1.3 KiB |
3
apps/hub/public/brand/comfy-c-yellow.svg
Normal file
@@ -0,0 +1,3 @@
|
||||
<svg width="58" height="64" viewBox="0 0 58 64" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<path d="M13.2057 63.8823C11.7151 63.8823 10.5122 63.3279 9.72785 62.2795C8.92161 61.2025 8.71142 59.6988 9.15088 58.1551L10.9156 51.9541C11.0564 51.4604 10.9584 50.929 10.6515 50.5183C10.3446 50.1084 9.86437 49.8671 9.35433 49.8671H4.28022C2.78876 49.8671 1.58594 49.3134 0.801951 48.2652C-0.0042886 47.1873 -0.214478 45.6836 0.225311 44.1399L6.28799 22.9338L6.95738 20.6039C7.85718 17.4423 11.1387 14.8727 14.2736 14.8727H20.3451C21.0696 14.8727 21.7069 14.3893 21.9064 13.6878L23.9142 6.63431C24.8131 3.47585 28.0948 0.906297 31.2296 0.906297L44.2144 0.883142L53.7201 0.882324C55.2111 0.882324 56.414 1.43606 57.1979 2.4843C58.0042 3.56134 58.2144 5.06506 57.7749 6.6087L55.0568 16.1591C54.1582 19.3168 50.8766 21.8855 47.7418 21.8855L34.7281 21.9103H28.6587C27.935 21.9103 27.2985 22.3929 27.0978 23.0936L22.0372 40.7818C21.8956 41.2763 21.9935 41.8093 22.3013 42.2199C22.6082 42.6298 23.0884 42.8711 23.5981 42.8711C23.5993 42.8711 32.1976 42.8543 32.1976 42.8543H41.6751C43.1662 42.8543 44.369 43.4081 45.153 44.4563C45.9592 45.5342 46.1694 47.0379 45.7296 48.5815L43.0116 58.1303C42.113 61.2888 38.8313 63.8575 35.6965 63.8575L22.6832 63.8823H13.2057Z" fill="#F0FF41" style="fill:#F0FF41;fill:color(display-p3 0.9412 1.0000 0.2549);fill-opacity:1;"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 1.3 KiB |
6
apps/hub/public/brand/comfy-hub-logo.svg
Normal file
|
After Width: | Height: | Size: 9.2 KiB |
9
apps/hub/public/brand/comfy-logo-mono.svg
Normal file
@@ -0,0 +1,9 @@
|
||||
<svg width="520" height="520" viewBox="0 0 520 520" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<mask id="mask0_227_285" style="mask-type:alpha" maskUnits="userSpaceOnUse" x="0" y="0" width="520" height="520">
|
||||
<path d="M0 184.335C0 119.812 0 87.5502 12.5571 62.9055C23.6026 41.2274 41.2274 23.6026 62.9055 12.5571C87.5502 0 119.812 0 184.335 0H335.665C400.188 0 432.45 0 457.094 12.5571C478.773 23.6026 496.397 41.2274 507.443 62.9055C520 87.5502 520 119.812 520 184.335V335.665C520 400.188 520 432.45 507.443 457.094C496.397 478.773 478.773 496.397 457.094 507.443C432.45 520 400.188 520 335.665 520H184.335C119.812 520 87.5502 520 62.9055 507.443C41.2274 496.397 23.6026 478.773 12.5571 457.094C0 432.45 0 400.188 0 335.665V184.335Z" fill="#FFFFFF"/>
|
||||
</mask>
|
||||
<g mask="url(#mask0_227_285)">
|
||||
<rect y="0.751831" width="520" height="520" fill="#000000"/>
|
||||
<path d="M176.484 428.831C168.649 428.831 162.327 425.919 158.204 420.412C153.966 414.755 152.861 406.857 155.171 398.749L164.447 366.178C165.187 363.585 164.672 360.794 163.059 358.636C161.446 356.483 158.921 355.216 156.241 355.216H129.571C121.731 355.216 115.409 352.308 111.289 346.802C107.051 341.14 105.946 333.242 108.258 325.134L140.124 213.748L143.642 201.51C148.371 184.904 165.62 171.407 182.097 171.407H214.009C217.817 171.407 221.167 168.868 222.215 165.183L232.769 128.135C237.494 111.545 254.742 98.048 271.219 98.048L339.468 97.9264L389.431 97.9221C397.268 97.9221 403.59 100.831 407.711 106.337C411.949 111.994 413.054 119.892 410.744 128L396.457 178.164C391.734 194.75 374.485 208.242 358.009 208.242L289.607 208.372H257.706C253.902 208.372 250.557 210.907 249.502 214.588L222.903 307.495C222.159 310.093 222.673 312.892 224.291 315.049C225.904 317.202 228.428 318.469 231.107 318.469C231.113 318.469 276.307 318.381 276.307 318.381H326.122C333.959 318.381 340.281 321.29 344.402 326.796C348.639 332.457 349.744 340.355 347.433 348.463L333.146 398.619C328.423 415.209 311.174 428.701 294.698 428.701L226.299 428.831H176.484Z" fill="#FFFFFF"/>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 2.0 KiB |
3
apps/hub/public/brand/comfy-wordmark-yellow.svg
Normal file
|
After Width: | Height: | Size: 8.4 KiB |
@@ -0,0 +1,29 @@
|
||||
Display Name,Brief Description of your profile,Profile Picture,Respondent,"Social Links ",Submission time,User ID
|
||||
Ingi Erlingsson,Ingi is an Emmy-nominated creative entrepreneur. CEO and co-founder of Systms. Previously: Founder @ Golden Wolf (acquired by Doodles in 2023),Comfy%20Hub%20Creator%20Page%20Info/Ingi%20Erlingsson/magnifics_upscale-precision-CJ9fIMF24eL6WHDIlN4r-NANO_00003__(1).png,Anonymous,"www.instagram.com/ingi.ai
|
||||
www.x.com/ingi_erlingsson
|
||||
www.x.com/thesystms","March 8, 2026 4:16 AM",ingi
|
||||
Purz,"Purz is a creative technologist exploring the intersection of generative AI, art, and open-source tools. Working within the ComfyUI ecosystem, he focuses on building, testing, and teaching creative workflows that help artists and developers experiment with new forms of generative media. His work centers on making powerful tools approachable—and fun to explore.",Comfy%20Hub%20Creator%20Page%20Info/Purz/purz-bio.jpg,Anonymous,"https://x.com/purzbeats
|
||||
https://www.youtube.com/purzbeats
|
||||
https://www.instagram.com/purzbeats
|
||||
https://linkedin.com/in/purzbeats
|
||||
https://purz.ai/
|
||||
https://www.threads.com/@purzbeats","March 8, 2026 5:14 PM",Purz
|
||||
Sirolim,"Sirolim is a Creative Technologist building templates for ComfyUI. When he's not working on templates or movies, he loves agentic engineering.",Comfy%20Hub%20Creator%20Page%20Info/Sirolim/sun.png,Anonymous,"https://www.linkedin.com/in/peterschwarz0001/
|
||||
https://x.com/siro_lim","March 8, 2026 6:49 PM",sirolim
|
||||
Julien Durand | MJM,"I am Julien Durand, an AI Artist and Digital Creator based in Los Angeles and one of the In-House Creatives at ComfyUI. I use ComfyUI as the core of my creative process, building custom workflows to generate and enhance images and videos. With a background in mechanical engineering, I approach generative art with a systems mindset, experimenting with nodes, pipelines and automation to push visual quality and creative possibilities. I enjoy exploring new techniques, optimizing workflows and sharing what I learn with the ComfyUI community.",Comfy%20Hub%20Creator%20Page%20Info/Julien%20Durand%20MJM/New_Logo_16MB.png,Anonymous,"https://www.instagram.com/julienaiart
|
||||
https://x.com/JulienAIArt
|
||||
https://www.youtube.com/@julienaiart
|
||||
https://www.facebook.com/julienaiart
|
||||
https://www.tiktok.com/@midjourney.man
|
||||
https://www.threads.net/@julienaiart
|
||||
https://www.linkedin.com/in/juliendurand805
|
||||
https://julienai.art","March 8, 2026 7:32 PM",julienaiart
|
||||
shanef3d,digital artist and creative director using 3D + AI,Comfy%20Hub%20Creator%20Page%20Info/shanef3d/SOCIAL_PFP.jpg,Anonymous,"http://www.instagram.com/shanef3d
|
||||
https://shanef3d.com","March 9, 2026 7:30 AM",shane
|
||||
hellorob,Easy to use workflows for content used by brands and creators,Comfy%20Hub%20Creator%20Page%20Info/hellorob/twitter_pfp.png,Anonymous,https://x.com/hellorob,"March 9, 2026 8:18 AM",hellorob
|
||||
ComfyUI,Official ComfyUI workflow templates created by the Comfy team.,Comfy%20Hub%20Creator%20Page%20Info/ComfyUI/App_logo_-_C_(Blue).png,Anonymous,"https://x.com/ComfyUI
|
||||
https://www.instagram.com/comfyui/
|
||||
https://www.linkedin.com/company/comfyui/","March 9, 2026 10:21 AM",ComfyUI
|
||||
enigmatic_e,I am a videographer and content creator who also enjoys messing around with VFX and AI. I build AI-Driven Video Solutions with workflows and new tools.,Comfy%20Hub%20Creator%20Page%20Info/enigmatic_e/2J9M3WmW_400x400.jpg,Anonymous,"https://www.youtube.com/@enigmatic_e
|
||||
https://www.instagram.com/enigmatic_e/
|
||||
https://x.com/8bit_e","March 9, 2026 11:54 AM",enigmatic_e
|
||||
|
9
apps/hub/public/favicon.svg
Normal file
@@ -0,0 +1,9 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 128 128">
|
||||
<path d="M50.4 78.5a75.1 75.1 0 0 0-28.5 6.9l24.2-65.7c.7-2 1.9-3.2 3.4-3.2h29c1.5 0 2.7 1.2 3.4 3.2l24.2 65.7s-11.6-7-28.5-7L67 45.5c-.4-1.7-1.6-2.8-2.9-2.8-1.3 0-2.5 1.1-2.9 2.7L50.4 78.5Zm-1.1 28.2Zm-4.2-20.2c-2 6.6-.6 15.8 4.2 20.2a17.5 17.5 0 0 1 .2-.7 5.5 5.5 0 0 1 5.7-4.5c2.8.1 4.3 1.5 4.7 4.7.2 1.1.2 2.3.2 3.5v.4c0 2.7.7 5.2 2.2 7.4a13 13 0 0 0 5.7 4.9v-.3l-.2-.3c-1.8-5.6-.5-9.5 4.4-12.8l1.5-1a73 73 0 0 0 3.2-2.2 16 16 0 0 0 6.8-11.4c.3-2 .1-4-.6-6l-.8.6-1.6 1a37 37 0 0 1-22.4 2.7c-5-.7-9.7-2-13.2-6.2Z" />
|
||||
<style>
|
||||
path { fill: #000; }
|
||||
@media (prefers-color-scheme: dark) {
|
||||
path { fill: #FFF; }
|
||||
}
|
||||
</style>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 749 B |
BIN
apps/hub/public/logos/bfl.png
Normal file
|
After Width: | Height: | Size: 2.9 KiB |
BIN
apps/hub/public/logos/bria.png
Normal file
|
After Width: | Height: | Size: 4.3 KiB |
BIN
apps/hub/public/logos/bytedance.png
Normal file
|
After Width: | Height: | Size: 1.4 KiB |