Add tests for delete & update

refactor, adding tests
Add sqlite db
2026-04-16 04:31:24 +00:00 · 2025-02-21 17:54:14 +00:00 · 2025-02-16 17:22:48 +00:00 · 2025-01-30 21:48:53 +00:00
435 changed files with 274448 additions and 227282 deletions
--- a/.ci/update_windows/update.py
+++ b/.ci/update_windows/update.py
@@ -63,12 +63,7 @@ except:
 print("checking out master branch")  # noqa: T201
 branch = repo.lookup_branch('master')
 if branch is None:
-    try:
-        ref = repo.lookup_reference('refs/remotes/origin/master')
-    except:
-        print("pulling.")  # noqa: T201
-        pull(repo)
-        ref = repo.lookup_reference('refs/remotes/origin/master')
+    ref = repo.lookup_reference('refs/remotes/origin/master')
    repo.checkout(ref)
    branch = repo.lookup_branch('master')
    if branch is None:
--- a/.ci/windows_base_files/README_VERY_IMPORTANT.txt
+++ b/.ci/windows_base_files/README_VERY_IMPORTANT.txt
@@ -4,9 +4,6 @@ if you have a NVIDIA gpu:

 run_nvidia_gpu.bat

-if you want to enable the fast fp16 accumulation (faster for fp16 models with slightly less quality):
-
-run_nvidia_gpu_fast_fp16_accumulation.bat


 To run it in slow CPU mode:
--- a/.ci/windows_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
+++ b/.ci/windows_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
@@ -1,2 +0,0 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation
-pause
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1,2 @@
 /web/assets/** linguist-generated
 /web/** linguist-vendored
-comfy_api_nodes/apis/__init__.py linguist-generated
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -15,14 +15,6 @@ body:
        steps to replicate what went wrong and others will be able to repeat your steps and see the same issue happen.

        If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first.
-  - type: checkboxes
-    id: custom-nodes-test
-    attributes:
-      label: Custom Node Testing
-      description: Please confirm you have tried to reproduce the issue with all custom nodes disabled.
-      options:
-        - label: I have tried disabling custom nodes and the issue persists (see [how to disable custom nodes](https://docs.comfy.org/troubleshooting/custom-node-issues#step-1%3A-test-with-all-custom-nodes-disabled) if you need help)
-          required: false
  - type: textarea
    attributes:
      label: Expected Behavior
--- a/.github/ISSUE_TEMPLATE/user-support.yml
+++ b/.github/ISSUE_TEMPLATE/user-support.yml
@@ -11,14 +11,6 @@ body:
            **2:** You have made an effort to find public answers to your question before asking here. In other words, you googled it first, and scrolled through recent help topics.

                If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first.
-    - type: checkboxes
-      id: custom-nodes-test
-      attributes:
-        label: Custom Node Testing
-        description: Please confirm you have tried to reproduce the issue with all custom nodes disabled.
-        options:
-          - label: I have tried disabling custom nodes and the issue persists (see [how to disable custom nodes](https://docs.comfy.org/troubleshooting/custom-node-issues#step-1%3A-test-with-all-custom-nodes-disabled) if you need help)
-            required: false
    - type: textarea
      attributes:
            label: Your question
--- a/.github/workflows/check-line-endings.yml
+++ b/.github/workflows/check-line-endings.yml
@@ -1,40 +0,0 @@
-name: Check for Windows Line Endings
-
-on:
-  pull_request:
-    branches: ['*'] # Trigger on all pull requests to any branch
-
-jobs:
-  check-line-endings:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0 # Fetch all history to compare changes
-
-      - name: Check for Windows line endings (CRLF)
-        run: |
-          # Get the list of changed files in the PR
-          CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }})
-
-          # Flag to track if CRLF is found
-          CRLF_FOUND=false
-
-          # Loop through each changed file
-          for FILE in $CHANGED_FILES; do
-            # Check if the file exists and is a text file
-            if [ -f "$FILE" ] && file "$FILE" | grep -q "text"; then
-              # Check for CRLF line endings
-              if grep -UP '\r$' "$FILE"; then
-                echo "Error: Windows line endings (CRLF) detected in $FILE"
-                CRLF_FOUND=true
-              fi
-            fi
-          done
-
-          # Exit with error if CRLF was found
-          if [ "$CRLF_FOUND" = true ]; then
-            exit 1
-          fi
--- a/.github/workflows/release-webhook.yml
+++ b/.github/workflows/release-webhook.yml
@@ -1,108 +0,0 @@
-name: Release Webhook
-
-on:
-  release:
-    types: [published]
-
-jobs:
-  send-webhook:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Send release webhook
-        env:
-          WEBHOOK_URL: ${{ secrets.RELEASE_GITHUB_WEBHOOK_URL }}
-          WEBHOOK_SECRET: ${{ secrets.RELEASE_GITHUB_WEBHOOK_SECRET }}
-        run: |
-          # Generate UUID for delivery ID
-          DELIVERY_ID=$(uuidgen)
-          HOOK_ID="release-webhook-$(date +%s)"
-          
-          # Create webhook payload matching GitHub release webhook format
-          PAYLOAD=$(cat <<EOF
-          {
-            "action": "published",
-            "release": {
-              "id": ${{ github.event.release.id }},
-              "node_id": "${{ github.event.release.node_id }}",
-              "url": "${{ github.event.release.url }}",
-              "html_url": "${{ github.event.release.html_url }}",
-              "assets_url": "${{ github.event.release.assets_url }}",
-              "upload_url": "${{ github.event.release.upload_url }}",
-              "tag_name": "${{ github.event.release.tag_name }}",
-              "target_commitish": "${{ github.event.release.target_commitish }}",
-              "name": ${{ toJSON(github.event.release.name) }},
-              "body": ${{ toJSON(github.event.release.body) }},
-              "draft": ${{ github.event.release.draft }},
-              "prerelease": ${{ github.event.release.prerelease }},
-              "created_at": "${{ github.event.release.created_at }}",
-              "published_at": "${{ github.event.release.published_at }}",
-              "author": {
-                "login": "${{ github.event.release.author.login }}",
-                "id": ${{ github.event.release.author.id }},
-                "node_id": "${{ github.event.release.author.node_id }}",
-                "avatar_url": "${{ github.event.release.author.avatar_url }}",
-                "url": "${{ github.event.release.author.url }}",
-                "html_url": "${{ github.event.release.author.html_url }}",
-                "type": "${{ github.event.release.author.type }}",
-                "site_admin": ${{ github.event.release.author.site_admin }}
-              },
-              "tarball_url": "${{ github.event.release.tarball_url }}",
-              "zipball_url": "${{ github.event.release.zipball_url }}",
-              "assets": ${{ toJSON(github.event.release.assets) }}
-            },
-            "repository": {
-              "id": ${{ github.event.repository.id }},
-              "node_id": "${{ github.event.repository.node_id }}",
-              "name": "${{ github.event.repository.name }}",
-              "full_name": "${{ github.event.repository.full_name }}",
-              "private": ${{ github.event.repository.private }},
-              "owner": {
-                "login": "${{ github.event.repository.owner.login }}",
-                "id": ${{ github.event.repository.owner.id }},
-                "node_id": "${{ github.event.repository.owner.node_id }}",
-                "avatar_url": "${{ github.event.repository.owner.avatar_url }}",
-                "url": "${{ github.event.repository.owner.url }}",
-                "html_url": "${{ github.event.repository.owner.html_url }}",
-                "type": "${{ github.event.repository.owner.type }}",
-                "site_admin": ${{ github.event.repository.owner.site_admin }}
-              },
-              "html_url": "${{ github.event.repository.html_url }}",
-              "clone_url": "${{ github.event.repository.clone_url }}",
-              "git_url": "${{ github.event.repository.git_url }}",
-              "ssh_url": "${{ github.event.repository.ssh_url }}",
-              "url": "${{ github.event.repository.url }}",
-              "created_at": "${{ github.event.repository.created_at }}",
-              "updated_at": "${{ github.event.repository.updated_at }}",
-              "pushed_at": "${{ github.event.repository.pushed_at }}",
-              "default_branch": "${{ github.event.repository.default_branch }}",
-              "fork": ${{ github.event.repository.fork }}
-            },
-            "sender": {
-              "login": "${{ github.event.sender.login }}",
-              "id": ${{ github.event.sender.id }},
-              "node_id": "${{ github.event.sender.node_id }}",
-              "avatar_url": "${{ github.event.sender.avatar_url }}",
-              "url": "${{ github.event.sender.url }}",
-              "html_url": "${{ github.event.sender.html_url }}",
-              "type": "${{ github.event.sender.type }}",
-              "site_admin": ${{ github.event.sender.site_admin }}
-            }
-          }
-          EOF
-          )
-          
-          # Generate HMAC-SHA256 signature
-          SIGNATURE=$(echo -n "$PAYLOAD" | openssl dgst -sha256 -hmac "$WEBHOOK_SECRET" -hex | cut -d' ' -f2)
-          
-          # Send webhook with required headers
-          curl -X POST "$WEBHOOK_URL" \
-            -H "Content-Type: application/json" \
-            -H "X-GitHub-Event: release" \
-            -H "X-GitHub-Delivery: $DELIVERY_ID" \
-            -H "X-GitHub-Hook-ID: $HOOK_ID" \
-            -H "X-Hub-Signature-256: sha256=$SIGNATURE" \
-            -H "User-Agent: GitHub-Actions-Webhook/1.0" \
-            -d "$PAYLOAD" \
-            --fail --silent --show-error
-          
-          echo "✅ Release webhook sent successfully"
--- a/.github/workflows/stable-release.yml
+++ b/.github/workflows/stable-release.yml
@@ -12,17 +12,17 @@ on:
        description: 'CUDA version'
        required: true
        type: string
-        default: "129"
+        default: "124"
      python_minor:
        description: 'Python minor version'
        required: true
        type: string
-        default: "13"
+        default: "12"
      python_patch:
        description: 'Python patch version'
        required: true
        type: string
-        default: "6"
+        default: "8"


 jobs:
@@ -36,7 +36,7 @@ jobs:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.git_tag }}
-          fetch-depth: 150
+          fetch-depth: 0
          persist-credentials: false
      - uses: actions/cache/restore@v4
        id: cache
@@ -66,16 +66,11 @@ jobs:
          curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
          ./python.exe get-pip.py
          ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/*
-          sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
-
-          rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space
-          rm ./Lib/site-packages/torch/lib/libprotoc.lib
-          rm ./Lib/site-packages/torch/lib/libprotobuf.lib
-
-          cd ..
+            sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
+            cd ..

          git clone --depth 1 https://github.com/comfyanonymous/taesd
-          cp taesd/*.safetensors ./ComfyUI_copy/models/vae_approx/
+          cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

          mkdir ComfyUI_windows_portable
          mv python_embeded ComfyUI_windows_portable
@@ -90,14 +85,12 @@ jobs:

          cd ..

-          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
+          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
          mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_nvidia.7z

          cd ComfyUI_windows_portable
          python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu

-          python_embeded/python.exe -s ./update/update.py ComfyUI/
-
          ls

      - name: Upload binaries to release
@@ -107,4 +100,5 @@ jobs:
          file: ComfyUI_windows_portable_nvidia.7z
          tag: ${{ inputs.git_tag }}
          overwrite: true
-          draft: true
+          prerelease: true
+          make_latest: false
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
@@ -28,4 +28,4 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install -r requirements.txt
+          pip install -r requirements.txt
--- a/.github/workflows/test-execution.yml
+++ b/.github/workflows/test-execution.yml
@@ -1,30 +0,0 @@
-name: Execution Tests
-
-on:
-  push:
-    branches: [ main, master ]
-  pull_request:
-    branches: [ main, master ]
-
-jobs:
-  test:
-    strategy:
-      matrix:
-        os: [ubuntu-latest, windows-latest, macos-latest]
-    runs-on: ${{ matrix.os }}
-    continue-on-error: true
-    steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python      
-      uses: actions/setup-python@v4
-      with:
-        python-version: '3.12'
-    - name: Install requirements
-      run: |
-        python -m pip install --upgrade pip
-        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-        pip install -r requirements.txt
-        pip install -r tests-unit/requirements.txt
-    - name: Run Execution Tests
-      run: |
-        python -m pytest tests/execution -v --skip-timing-checks
--- a/.github/workflows/test-launch.yml
+++ b/.github/workflows/test-launch.yml
@@ -17,7 +17,7 @@ jobs:
        path: "ComfyUI"
    - uses: actions/setup-python@v4
      with:
-        python-version: '3.10'
+        python-version: '3.9'
    - name: Install requirements
      run: |
        python -m pip install --upgrade pip
--- a/.github/workflows/test-unit.yml
+++ b/.github/workflows/test-unit.yml
@@ -18,7 +18,7 @@ jobs:
    - name: Set up Python      
      uses: actions/setup-python@v4
      with:
-        python-version: '3.12'
+        python-version: '3.10'
    - name: Install requirements
      run: |
        python -m pip install --upgrade pip
--- a/.github/workflows/update-api-stubs.yml
+++ b/.github/workflows/update-api-stubs.yml
@@ -1,56 +0,0 @@
-name: Generate Pydantic Stubs from api.comfy.org
-
-on:
-  schedule:
-    - cron: '0 0 * * 1'
-  workflow_dispatch:
-
-jobs:
-  generate-models:
-    runs-on: ubuntu-latest
-    
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-      
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install 'datamodel-code-generator[http]'
-          npm install @redocly/cli
-      
-      - name: Download OpenAPI spec
-        run: |
-          curl -o openapi.yaml https://api.comfy.org/openapi
-      
-      - name: Filter OpenAPI spec with Redocly
-        run: |
-          npx @redocly/cli bundle openapi.yaml --output filtered-openapi.yaml --config comfy_api_nodes/redocly.yaml --remove-unused-components
-      
-      - name: Generate API models
-        run: |
-          datamodel-codegen --use-subclass-enum --input filtered-openapi.yaml --output comfy_api_nodes/apis --output-model-type pydantic_v2.BaseModel
-      
-      - name: Check for changes
-        id: git-check
-        run: |
-          git diff --exit-code comfy_api_nodes/apis || echo "changes=true" >> $GITHUB_OUTPUT
-      
-      - name: Create Pull Request
-        if: steps.git-check.outputs.changes == 'true'
-        uses: peter-evans/create-pull-request@v5
-        with:
-          commit-message: 'chore: update API models from OpenAPI spec'
-          title: 'Update API models from api.comfy.org'
-          body: |
-            This PR updates the API models based on the latest api.comfy.org OpenAPI specification.
-            
-            Generated automatically by the a Github workflow.
-          branch: update-api-stubs
-          delete-branch: true
-          base: master
--- a/.github/workflows/update-frontend.yml
+++ b/.github/workflows/update-frontend.yml
@@ -0,0 +1,58 @@
+name: Update Frontend Release
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: "Frontend version to update to (e.g., 1.0.0)"
+        required: true
+        type: string
+
+jobs:
+  update-frontend:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      - name: Checkout ComfyUI
+        uses: actions/checkout@v4
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: Install requirements
+        run: |
+          python -m pip install --upgrade pip
+          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install -r requirements.txt
+          pip install wait-for-it
+      # Frontend asset will be downloaded to ComfyUI/web_custom_versions/Comfy-Org_ComfyUI_frontend/{version}
+      - name: Start ComfyUI server
+        run: |
+          python main.py --cpu --front-end-version Comfy-Org/ComfyUI_frontend@${{ github.event.inputs.version }} 2>&1 | tee console_output.log &
+          wait-for-it --service 127.0.0.1:8188 -t 30
+      - name: Configure Git
+        run: |
+          git config --global user.name "GitHub Action"
+          git config --global user.email "action@github.com"
+      # Replace existing frontend content with the new version and remove .js.map files
+      # See https://github.com/Comfy-Org/ComfyUI_frontend/issues/2145 for why we remove .js.map files
+      - name: Update frontend content
+        run: |
+          rm -rf web/
+          cp -r web_custom_versions/Comfy-Org_ComfyUI_frontend/${{ github.event.inputs.version }} web/
+          rm web/**/*.js.map
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v7
+        with:
+          token: ${{ secrets.PR_BOT_PAT }}
+          commit-message: "Update frontend to v${{ github.event.inputs.version }}"
+          title: "Frontend Update: v${{ github.event.inputs.version }}"
+          body: |
+            Automated PR to update frontend content to version ${{ github.event.inputs.version }}
+
+            This PR was created automatically by the frontend update workflow.
+          branch: release-${{ github.event.inputs.version }}
+          base: master
+          labels: Frontend,dependencies
--- a/.github/workflows/windows_release_dependencies.yml
+++ b/.github/workflows/windows_release_dependencies.yml
@@ -17,19 +17,19 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "129"
+        default: "124"

      python_minor:
        description: 'python minor version'
        required: true
        type: string
-        default: "13"
+        default: "12"

      python_patch:
        description: 'python patch version'
        required: true
        type: string
-        default: "6"
+        default: "8"
 #  push:
 #    branches:
 #      - master
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@@ -7,7 +7,7 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "129"
+        default: "126"

      python_minor:
        description: 'python minor version'
@@ -19,7 +19,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "5"
+        default: "1"
 #  push:
 #    branches:
 #      - master
@@ -34,7 +34,7 @@ jobs:
    steps:
        - uses: actions/checkout@v4
          with:
-            fetch-depth: 30
+            fetch-depth: 0
            persist-credentials: false
        - uses: actions/setup-python@v5
          with:
@@ -53,12 +53,10 @@ jobs:
            ls ../temp_wheel_dir
            ./python.exe -s -m pip install --pre ../temp_wheel_dir/*
            sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
-
-            rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space
            cd ..

            git clone --depth 1 https://github.com/comfyanonymous/taesd
-            cp taesd/*.safetensors ./ComfyUI_copy/models/vae_approx/
+            cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

            mkdir ComfyUI_windows_portable_nightly_pytorch
            mv python_embeded ComfyUI_windows_portable_nightly_pytorch
@@ -76,7 +74,7 @@ jobs:
            pause" > ./update/update_comfyui_and_python_dependencies.bat
            cd ..

-            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
+            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
            mv ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI/ComfyUI_windows_portable_nvidia_or_cpu_nightly_pytorch.7z

            cd ComfyUI_windows_portable_nightly_pytorch
--- a/.github/workflows/windows_release_package.yml
+++ b/.github/workflows/windows_release_package.yml
@@ -7,19 +7,19 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "129"
+        default: "124"

      python_minor:
        description: 'python minor version'
        required: true
        type: string
-        default: "13"
+        default: "12"

      python_patch:
        description: 'python patch version'
        required: true
        type: string
-        default: "6"
+        default: "8"
 #  push:
 #    branches:
 #      - master
@@ -50,7 +50,7 @@ jobs:

        - uses: actions/checkout@v4
          with:
-            fetch-depth: 150
+            fetch-depth: 0
            persist-credentials: false
        - shell: bash
          run: |
@@ -64,14 +64,10 @@ jobs:
            ./python.exe get-pip.py
            ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/*
            sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
-
-            rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space
-            rm ./Lib/site-packages/torch/lib/libprotoc.lib
-            rm ./Lib/site-packages/torch/lib/libprotobuf.lib
            cd ..

            git clone --depth 1 https://github.com/comfyanonymous/taesd
-            cp taesd/*.safetensors ./ComfyUI_copy/models/vae_approx/
+            cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

            mkdir ComfyUI_windows_portable
            mv python_embeded ComfyUI_windows_portable
@@ -86,14 +82,12 @@ jobs:

            cd ..

-            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
+            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
            mv ComfyUI_windows_portable.7z ComfyUI/new_ComfyUI_windows_portable_nvidia_cu${{ inputs.cu }}_or_cpu.7z

            cd ComfyUI_windows_portable
            python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu

-            python_embeded/python.exe -s ./update/update.py ComfyUI/
-
            ls

        - name: Upload binaries to release
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,3 @@ venv/
 *.log
 web_custom_versions/
 .DS_Store
-openapi.yaml
-filtered-openapi.yaml
-uv.lock
--- a/28
+++ b/28
@@ -5,21 +5,19 @@
 # Inlined the team members for now.

 # Maintainers
-*.md @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
-/tests/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
-/tests-unit/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
-/notebooks/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
-/script_examples/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
-/.github/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
-/requirements.txt @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
-/pyproject.toml @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill
+*.md @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/tests/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/tests-unit/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/notebooks/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/script_examples/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/.github/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink

 # Python web server
-/api_server/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill
-/app/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill
-/utils/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill
+/api_server/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
+/app/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata

-# Node developers
-/comfy_extras/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill
-/comfy/comfy_types/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill
-/comfy_api_nodes/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill
+# Frontend assets
+/web/ @huchenlei @webfiltered @pythongosssss @yoland68 @robinjhuang
+
+# Extra nodes
+/comfy_extras/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink
--- a/README.md
+++ b/README.md
@@ -1,12 +1,11 @@
 <div align="center">

 # ComfyUI
-**The most powerful and modular visual AI engine and application.**
+**The most powerful and modular diffusion model GUI and backend.**


 [![Website][website-shield]][website-url]
 [![Dynamic JSON Badge][discord-shield]][discord-url]
-[![Twitter][twitter-shield]][twitter-url]
 [![Matrix][matrix-shield]][matrix-url]
 <br>
 [![][github-release-shield]][github-release-link]
@@ -21,8 +20,6 @@
 <!-- Workaround to display total user from https://github.com/badges/shields/issues/4500#issuecomment-2060079995 -->
 [discord-shield]: https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fdiscord.com%2Fapi%2Finvites%2Fcomfyorg%3Fwith_counts%3Dtrue&query=%24.approximate_member_count&logo=discord&logoColor=white&label=Discord&color=green&suffix=%20total
 [discord-url]: https://www.comfy.org/discord
-[twitter-shield]: https://img.shields.io/twitter/follow/ComfyUI
-[twitter-url]: https://x.com/ComfyUI

 [github-release-shield]: https://img.shields.io/github/v/release/comfyanonymous/ComfyUI?style=flat&sort=semver
 [github-release-link]: https://github.com/comfyanonymous/ComfyUI/releases
@@ -34,28 +31,15 @@
 ![ComfyUI Screenshot](https://github.com/user-attachments/assets/7ccaf2c1-9b72-41ae-9a89-5688c94b7abe)
 </div>

-ComfyUI lets you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. Available on Windows, Linux, and macOS.
+This UI will let you design and execute advanced Stable Diffusion pipelines using a graph/nodes/flowchart-based interface. For workflow examples and to see what ComfyUI can do, check out:
+### [ComfyUI Examples](https://comfyanonymous.github.io/ComfyUI_examples/)

-## Get Started
-
-#### [Desktop Application](https://www.comfy.org/download)
- The easiest way to get started.
- Available on Windows & macOS.
-
-#### [Windows Portable Package](#installing)
- Get the latest commits and completely portable.
- Available on Windows.
-
-#### [Manual Install](#manual-install-windows-linux)
-Supports all operating systems and GPU types (NVIDIA, AMD, Intel, Apple Silicon, Ascend).
-
-## [Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
-See what ComfyUI can do with the [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).
+### [Installing ComfyUI](#installing)

 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
 - Image Models
-   - SD1.x, SD2.x ([unCLIP](https://comfyanonymous.github.io/ComfyUI_examples/unclip/))
+   - SD1.x, SD2.x
   - [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
   - [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/)
   - [SD3 and SD3.5](https://comfyanonymous.github.io/ComfyUI_examples/sd3/)
@@ -63,32 +47,17 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
   - [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
   - [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
   - [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
-   - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
-   - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
-   - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/)
- Image Editing Models
-   - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
-   - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
-   - [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11)
-   - [Qwen Image Edit](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/#edit-model)
 - Video Models
   - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
   - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
   - [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/)
   - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
-   - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
-   - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/)
- Audio Models
-   - [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
-   - [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
- 3D Models
-   - [Hunyuan3D 2.0](https://docs.comfy.org/tutorials/3d/hunyuan3D-2)
+- [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
 - Asynchronous Queue system
 - Many optimizations: Only re-executes the parts of the workflow that changes between executions.
- Smart memory management: can automatically run large models on GPUs with as low as 1GB vram with smart offloading.
+- Smart memory management: can automatically run models on GPUs with as little as 1GB of VRAM.
 - Works even if you don't have a GPU with: ```--cpu``` (slow)
- Can load ckpt and safetensors: All in one checkpoints or standalone diffusion models, VAEs and CLIP models.
- Safe loading of ckpt, pt, pth, etc.. files.
+- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs and CLIP models.
 - Embeddings/Textual inversion
 - [Loras (regular, locon and loha)](https://comfyanonymous.github.io/ComfyUI_examples/lora/)
 - [Hypernetworks](https://comfyanonymous.github.io/ComfyUI_examples/hypernetworks/)
@@ -99,32 +68,17 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
 - [Inpainting](https://comfyanonymous.github.io/ComfyUI_examples/inpaint/) with both regular and inpainting models.
 - [ControlNet and T2I-Adapter](https://comfyanonymous.github.io/ComfyUI_examples/controlnet/)
 - [Upscale Models (ESRGAN, ESRGAN variants, SwinIR, Swin2SR, etc...)](https://comfyanonymous.github.io/ComfyUI_examples/upscale_models/)
+- [unCLIP Models](https://comfyanonymous.github.io/ComfyUI_examples/unclip/)
 - [GLIGEN](https://comfyanonymous.github.io/ComfyUI_examples/gligen/)
 - [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/)
 - [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
 - Latent previews with [TAESD](#how-to-show-high-quality-previews)
- Works fully offline: core will never download anything unless you want to.
- Optional API nodes to use paid models from external providers through the online [Comfy API](https://docs.comfy.org/tutorials/api-nodes/overview).
+- Starts up very fast.
+- Works fully offline: will never download anything.
 - [Config file](extra_model_paths.yaml.example) to set the search paths for models.

 Workflow examples can be found on the [Examples page](https://comfyanonymous.github.io/ComfyUI_examples/)

-## Release Process
-
-ComfyUI follows a weekly release cycle targeting Friday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories:
-
-1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
-   - Releases a new stable version (e.g., v0.7.0)
-   - Serves as the foundation for the desktop release
-
-2. **[ComfyUI Desktop](https://github.com/Comfy-Org/desktop)**
-   - Builds a new release using the latest stable core version
-
-3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)**
-   - Weekly frontend updates are merged into the core repository
-   - Features are frozen for the upcoming core release
-   - Development continues for the next release cycle
-
 ## Shortcuts

 | Keybind                            | Explanation                                                                                                        |
@@ -165,7 +119,7 @@ ComfyUI follows a weekly release cycle targeting Friday but this regularly chang

 # Installing

-## Windows Portable
+## Windows

 There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).

@@ -179,18 +133,13 @@ If you have trouble extracting it, right click the file -> properties -> unblock

 See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor.

+## Jupyter Notebook

-## [comfy-cli](https://docs.comfy.org/comfy-cli/getting-started)
-
-You can install and start ComfyUI using comfy-cli:
-```bash
-pip install comfy-cli
-comfy install
-```
+To run it on services like Paperspace, Kaggle or Colab, you can use my [Jupyter Notebook](notebooks/comfyui_colab.ipynb).

 ## Manual Install (Windows, Linux)

-Python 3.13 is very well supported. If you have trouble with some custom node dependencies you can try 3.12
+Note that some dependencies do not yet support Python 3.13, so using Python 3.12 is recommended.

 Git clone this repo.

@@ -202,37 +151,45 @@ Put your VAE in: models/vae
 ### AMD GPUs (Linux only)
 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:

-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2```

-This is the command to install the nightly with ROCm 6.4 which might have some performance improvements:
+This is the command to install the nightly with ROCm 6.2.4, which might have some performance improvements:

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.2.4```

 ### Intel GPUs (Windows and Linux)

-(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)
-
-1. To install PyTorch xpu, use the following command:
-
-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu```
-
-This is the command to install the Pytorch xpu nightly which might have some performance improvements:
+(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip (currently available in PyTorch nightly builds). More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)
+  
+1. To install PyTorch nightly, use the following command:

 ```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu```

+2. Launch ComfyUI by running `python main.py`
+
+
 (Option 2) Alternatively, Intel GPUs supported by Intel Extension for PyTorch (IPEX) can leverage IPEX for improved performance.

-1. visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information.
+1. For Intel® Arc™ A-Series Graphics utilizing IPEX, create a conda environment and use the commands below:
+
+```
+conda install libuv
+pip install torch==2.3.1.post0+cxx11.abi torchvision==0.18.1.post0+cxx11.abi torchaudio==2.3.1.post0+cxx11.abi intel-extension-for-pytorch==2.3.110.post0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+```
+
+For other supported Intel GPUs with IPEX, visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information.
+
+Additional discussion and help can be found [here](https://github.com/comfyanonymous/ComfyUI/discussions/476).

 ### NVIDIA

 Nvidia users should install stable pytorch using this command:

-```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129```
+```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu124```

-This is the command to install pytorch nightly instead which might have performance improvements.
+This is the command to install pytorch nightly instead which might have performance improvements:

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126```

 #### Troubleshooting

@@ -265,8 +222,6 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve

 #### DirectML (AMD Cards on Windows)

-This is very badly supported and is not recommended. There are some unofficial builds of pytorch ROCm on windows that exist that will give you a much better experience than this. This readme will be updated once official pytorch ROCm builds for windows come out.
-
 ```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```

 #### Ascend NPUs
@@ -278,20 +233,6 @@ For models compatible with Ascend Extension for PyTorch (torch_npu). To get star
 3. Next, install the necessary packages for torch-npu by adhering to the platform-specific instructions on the [Installation](https://ascend.github.io/docs/sources/pytorch/install.html#pytorch) page.
 4. Finally, adhere to the [ComfyUI manual installation](#manual-install-windows-linux) guide for Linux. Once all components are installed, you can run ComfyUI as described earlier.

-#### Cambricon MLUs
-
-For models compatible with Cambricon Extension for PyTorch (torch_mlu). Here's a step-by-step guide tailored to your platform and installation method:
-
-1. Install the Cambricon CNToolkit by adhering to the platform-specific instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cntoolkit_3.7.2/cntoolkit_install_3.7.2/index.html)
-2. Next, install the PyTorch(torch_mlu) following the instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cambricon_pytorch_1.17.0/user_guide_1.9/index.html)
-3. Launch ComfyUI by running `python main.py`
-
-#### Iluvatar Corex
-
-For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step guide tailored to your platform and installation method:
-
-1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536)
-2. Launch ComfyUI by running `python main.py`

 # Running

@@ -307,7 +248,7 @@ For AMD 7600 and maybe other RDNA3 cards: ```HSA_OVERRIDE_GFX_VERSION=11.0.0 pyt

 ### AMD ROCm Tips

-You can enable experimental memory efficient attention on recent pytorch in ComfyUI on some AMD GPUs using this command, it should already be enabled by default on RDNA3. If this improves speed for you on latest pytorch on your GPU please report it so that I can enable it by default.
+You can enable experimental memory-efficient attention on PyTorch 2.5 in ComfyUI on RDNA3 and potentially other AMD GPUs using this command:

 ```TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 python main.py --use-pytorch-cross-attention```

@@ -343,13 +284,11 @@ Generate a self-signed certificate (not appropriate for shared/production use) a

 Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app will now be accessible with `https://...` instead of `http://...`.

-> Note: Windows users can use [alexisrolland/docker-openssl](https://github.com/alexisrolland/docker-openssl) or one of the [3rd party binary distributions](https://wiki.openssl.org/index.php/Binaries) to run the command example above.
+> Note: Windows users can use [alexisrolland/docker-openssl](https://github.com/alexisrolland/docker-openssl) or one of the [3rd party binary distributions](https://wiki.openssl.org/index.php/Binaries) to run the command example above. 
 <br/><br/>If you use a container, note that the volume mount `-v` can be a relative path so `... -v ".\:/openssl-certs" ...` would create the key & cert files in the current directory of your command prompt or powershell terminal.

 ## Support and dev channel

-[Discord](https://comfy.org/discord): Try the #help or #feedback channels.
-
 [Matrix space: #comfyui_space:matrix.org](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) (it's like discord but open source).

 See also: [https://www.comfy.org/](https://www.comfy.org/)
@@ -366,7 +305,7 @@ For any bugs, issues, or feature requests related to the frontend, please use th

 The new frontend is now the default for ComfyUI. However, please note:

-1. The frontend in the main ComfyUI repository is updated fortnightly.
+1. The frontend in the main ComfyUI repository is updated weekly.
 2. Daily releases are available in the separate frontend repository.

 To use the most up-to-date frontend version:
@@ -383,7 +322,7 @@ To use the most up-to-date frontend version:
   --front-end-version Comfy-Org/ComfyUI_frontend@1.2.2
   ```

-This approach allows you to easily switch between the stable fortnightly release and the cutting-edge daily updates, or even specific versions for testing purposes.
+This approach allows you to easily switch between the stable weekly release and the cutting-edge daily updates, or even specific versions for testing purposes.

 ### Accessing the Legacy Frontend

--- a/alembic.ini
+++ b/alembic.ini
@@ -1,84 +0,0 @@
-# A generic, single database configuration.
-
-[alembic]
-# path to migration scripts
-# Use forward slashes (/) also on windows to provide an os agnostic path
-script_location = alembic_db
-
-# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
-# Uncomment the line below if you want the files to be prepended with date and time
-# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
-# for all available tokens
-# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
-
-# sys.path path, will be prepended to sys.path if present.
-# defaults to the current working directory.
-prepend_sys_path = .
-
-# timezone to use when rendering the date within the migration file
-# as well as the filename.
-# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library.
-# Any required deps can installed by adding `alembic[tz]` to the pip requirements
-# string value is passed to ZoneInfo()
-# leave blank for localtime
-# timezone =
-
-# max length of characters to apply to the "slug" field
-# truncate_slug_length = 40
-
-# set to 'true' to run the environment during
-# the 'revision' command, regardless of autogenerate
-# revision_environment = false
-
-# set to 'true' to allow .pyc and .pyo files without
-# a source .py file to be detected as revisions in the
-# versions/ directory
-# sourceless = false
-
-# version location specification; This defaults
-# to alembic_db/versions.  When using multiple version
-# directories, initial revisions must be specified with --version-path.
-# The path separator used here should be the separator specified by "version_path_separator" below.
-# version_locations = %(here)s/bar:%(here)s/bat:alembic_db/versions
-
-# version path separator; As mentioned above, this is the character used to split
-# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
-# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
-# Valid values for version_path_separator are:
-#
-# version_path_separator = :
-# version_path_separator = ;
-# version_path_separator = space
-# version_path_separator = newline
-#
-# Use os.pathsep. Default configuration used for new projects.
-version_path_separator = os
-
-# set to 'true' to search source files recursively
-# in each "version_locations" directory
-# new in Alembic version 1.10
-# recursive_version_locations = false
-
-# the output encoding used when revision files
-# are written from script.py.mako
-# output_encoding = utf-8
-
-sqlalchemy.url = sqlite:///user/comfyui.db
-
-
-[post_write_hooks]
-# post_write_hooks defines scripts or Python functions that are run
-# on newly generated revision scripts.  See the documentation for further
-# detail and examples
-
-# format using "black" - use the console_scripts runner, against the "black" entrypoint
-# hooks = black
-# black.type = console_scripts
-# black.entrypoint = black
-# black.options = -l 79 REVISION_SCRIPT_FILENAME
-
-# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
-# hooks = ruff
-# ruff.type = exec
-# ruff.executable = %(here)s/.venv/bin/ruff
-# ruff.options = check --fix REVISION_SCRIPT_FILENAME
--- a/alembic_db/README.md
+++ b/alembic_db/README.md
@@ -1,4 +0,0 @@
-## Generate new revision
-
-1. Update models in `/app/database/models.py`
-2. Run `alembic revision --autogenerate -m "{your message}"`
--- a/alembic_db/env.py
+++ b/alembic_db/env.py
@@ -1,64 +0,0 @@
-from sqlalchemy import engine_from_config
-from sqlalchemy import pool
-
-from alembic import context
-
-# this is the Alembic Config object, which provides
-# access to the values within the .ini file in use.
-config = context.config
-
-
-from app.database.models import Base
-target_metadata = Base.metadata
-
-# other values from the config, defined by the needs of env.py,
-# can be acquired:
-# my_important_option = config.get_main_option("my_important_option")
-# ... etc.
-
-
-def run_migrations_offline() -> None:
-    """Run migrations in 'offline' mode.
-    This configures the context with just a URL
-    and not an Engine, though an Engine is acceptable
-    here as well.  By skipping the Engine creation
-    we don't even need a DBAPI to be available.
-    Calls to context.execute() here emit the given string to the
-    script output.
-    """
-    url = config.get_main_option("sqlalchemy.url")
-    context.configure(
-        url=url,
-        target_metadata=target_metadata,
-        literal_binds=True,
-        dialect_opts={"paramstyle": "named"},
-    )
-
-    with context.begin_transaction():
-        context.run_migrations()
-
-
-def run_migrations_online() -> None:
-    """Run migrations in 'online' mode.
-    In this scenario we need to create an Engine
-    and associate a connection with the context.
-    """
-    connectable = engine_from_config(
-        config.get_section(config.config_ini_section, {}),
-        prefix="sqlalchemy.",
-        poolclass=pool.NullPool,
-    )
-
-    with connectable.connect() as connection:
-        context.configure(
-            connection=connection, target_metadata=target_metadata
-        )
-
-        with context.begin_transaction():
-            context.run_migrations()
-
-
-if context.is_offline_mode():
-    run_migrations_offline()
-else:
-    run_migrations_online()
--- a/alembic_db/script.py.mako
+++ b/alembic_db/script.py.mako
@@ -1,28 +0,0 @@
-"""${message}
-
-Revision ID: ${up_revision}
-Revises: ${down_revision | comma,n}
-Create Date: ${create_date}
-
-"""
-from typing import Sequence, Union
-
-from alembic import op
-import sqlalchemy as sa
-${imports if imports else ""}
-
-# revision identifiers, used by Alembic.
-revision: str = ${repr(up_revision)}
-down_revision: Union[str, None] = ${repr(down_revision)}
-branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
-depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
-
-
-def upgrade() -> None:
-    """Upgrade schema."""
-    ${upgrades if upgrades else "pass"}
-
-
-def downgrade() -> None:
-    """Downgrade schema."""
-    ${downgrades if downgrades else "pass"}
--- a/api_server/routes/internal/internal_routes.py
+++ b/api_server/routes/internal/internal_routes.py
@@ -1,9 +1,9 @@
 from aiohttp import web
 from typing import Optional
-from folder_paths import folder_names_and_paths, get_directory_by_type
+from folder_paths import models_dir, user_directory, output_directory, folder_names_and_paths
+from api_server.services.file_service import FileService
 from api_server.services.terminal_service import TerminalService
 import app.logger
-import os

 class InternalRoutes:
    '''
@@ -15,10 +15,26 @@ class InternalRoutes:
    def __init__(self, prompt_server):
        self.routes: web.RouteTableDef = web.RouteTableDef()
        self._app: Optional[web.Application] = None
+        self.file_service = FileService({
+            "models": models_dir,
+            "user": user_directory,
+            "output": output_directory
+        })
        self.prompt_server = prompt_server
        self.terminal_service = TerminalService(prompt_server)

    def setup_routes(self):
+        @self.routes.get('/files')
+        async def list_files(request):
+            directory_key = request.query.get('directory', '')
+            try:
+                file_list = self.file_service.list_files(directory_key)
+                return web.json_response({"files": file_list})
+            except ValueError as e:
+                return web.json_response({"error": str(e)}, status=400)
+            except Exception as e:
+                return web.json_response({"error": str(e)}, status=500)
+
        @self.routes.get('/logs')
        async def get_logs(request):
            return web.json_response("".join([(l["t"] + " - " + l["m"]) for l in app.logger.get_logs()]))
@@ -51,20 +67,6 @@ class InternalRoutes:
                response[key] = folder_names_and_paths[key][0]
            return web.json_response(response)

-        @self.routes.get('/files/{directory_type}')
-        async def get_files(request: web.Request) -> web.Response:
-            directory_type = request.match_info['directory_type']
-            if directory_type not in ("output", "input", "temp"):
-                return web.json_response({"error": "Invalid directory type"}, status=400)
-
-            directory = get_directory_by_type(directory_type)
-            sorted_files = sorted(
-                (entry for entry in os.scandir(directory) if entry.is_file()),
-                key=lambda entry: -entry.stat().st_mtime
-            )
-            return web.json_response([entry.name for entry in sorted_files], status=200)
-
-
    def get_app(self):
        if self._app is None:
            self._app = web.Application()
--- a/api_server/services/file_service.py
+++ b/api_server/services/file_service.py
@@ -0,0 +1,13 @@
+from typing import Dict, List, Optional
+from api_server.utils.file_operations import FileSystemOperations, FileSystemItem
+
+class FileService:
+    def __init__(self, allowed_directories: Dict[str, str], file_system_ops: Optional[FileSystemOperations] = None):
+        self.allowed_directories: Dict[str, str] = allowed_directories
+        self.file_system_ops: FileSystemOperations = file_system_ops or FileSystemOperations()
+
+    def list_files(self, directory_key: str) -> List[FileSystemItem]:
+        if directory_key not in self.allowed_directories:
+            raise ValueError("Invalid directory key")
+        directory_path: str = self.allowed_directories[directory_key]
+        return self.file_system_ops.walk_directory(directory_path)
--- a/app/app_settings.py
+++ b/app/app_settings.py
@@ -9,14 +9,8 @@ class AppSettings():
        self.user_manager = user_manager

    def get_settings(self, request):
-        try:
-            file = self.user_manager.get_request_user_filepath(
-                request,
-                "comfy.settings.json"
-            )
-        except KeyError as e:
-            logging.error("User settings not found.")
-            raise web.HTTPUnauthorized() from e
+        file = self.user_manager.get_request_user_filepath(
+            request, "comfy.settings.json")
        if os.path.isfile(file):
            try:
                with open(file) as f:
--- a/app/custom_node_manager.py
+++ b/app/custom_node_manager.py
@@ -4,142 +4,31 @@ import os
 import folder_paths
 import glob
 from aiohttp import web
-import json
-import logging
-from functools import lru_cache
-
-from utils.json_util import merge_json_recursive
-
-
-# Extra locale files to load into main.json
-EXTRA_LOCALE_FILES = [
-    "nodeDefs.json",
-    "commands.json",
-    "settings.json",
-]
-
-
-def safe_load_json_file(file_path: str) -> dict:
-    if not os.path.exists(file_path):
-        return {}
-
-    try:
-        with open(file_path, "r", encoding="utf-8") as f:
-            return json.load(f)
-    except json.JSONDecodeError:
-        logging.error(f"Error loading {file_path}")
-        return {}
-

 class CustomNodeManager:
-    @lru_cache(maxsize=1)
-    def build_translations(self):
-        """Load all custom nodes translations during initialization. Translations are
-        expected to be loaded from `locales/` folder.
-
-        The folder structure is expected to be the following:
-        - custom_nodes/
-            - custom_node_1/
-                - locales/
-                    - en/
-                        - main.json
-                        - commands.json
-                        - settings.json
-
-        returned translations are expected to be in the following format:
-        {
-            "en": {
-                "nodeDefs": {...},
-                "commands": {...},
-                "settings": {...},
-                ...{other main.json keys}
-            }
-        }
-        """
-
-        translations = {}
-
-        for folder in folder_paths.get_folder_paths("custom_nodes"):
-            # Sort glob results for deterministic ordering
-            for custom_node_dir in sorted(glob.glob(os.path.join(folder, "*/"))):
-                locales_dir = os.path.join(custom_node_dir, "locales")
-                if not os.path.exists(locales_dir):
-                    continue
-
-                for lang_dir in glob.glob(os.path.join(locales_dir, "*/")):
-                    lang_code = os.path.basename(os.path.dirname(lang_dir))
-
-                    if lang_code not in translations:
-                        translations[lang_code] = {}
-
-                    # Load main.json
-                    main_file = os.path.join(lang_dir, "main.json")
-                    node_translations = safe_load_json_file(main_file)
-
-                    # Load extra locale files
-                    for extra_file in EXTRA_LOCALE_FILES:
-                        extra_file_path = os.path.join(lang_dir, extra_file)
-                        key = extra_file.split(".")[0]
-                        json_data = safe_load_json_file(extra_file_path)
-                        if json_data:
-                            node_translations[key] = json_data
-
-                    if node_translations:
-                        translations[lang_code] = merge_json_recursive(
-                            translations[lang_code], node_translations
-                        )
-
-        return translations
-
+    """
+    Placeholder to refactor the custom node management features from ComfyUI-Manager.
+    Currently it only contains the custom workflow templates feature.
+    """
    def add_routes(self, routes, webapp, loadedModules):

-        example_workflow_folder_names = ["example_workflows", "example", "examples", "workflow", "workflows"]
-
        @routes.get("/workflow_templates")
        async def get_workflow_templates(request):
            """Returns a web response that contains the map of custom_nodes names and their associated workflow templates. The ones without templates are omitted."""
-
-            files = []
-
-            for folder in folder_paths.get_folder_paths("custom_nodes"):
-                for folder_name in example_workflow_folder_names:
-                    pattern = os.path.join(folder, f"*/{folder_name}/*.json")
-                    matched_files = glob.glob(pattern)
-                    files.extend(matched_files)
-
-            workflow_templates_dict = (
-                {}
-            )  # custom_nodes folder name -> example workflow names
+            files = [
+                file
+                for folder in folder_paths.get_folder_paths("custom_nodes")
+                for file in glob.glob(os.path.join(folder, '*/example_workflows/*.json'))
+            ]
+            workflow_templates_dict = {} # custom_nodes folder name -> example workflow names
            for file in files:
-                custom_nodes_name = os.path.basename(
-                    os.path.dirname(os.path.dirname(file))
-                )
+                custom_nodes_name = os.path.basename(os.path.dirname(os.path.dirname(file)))
                workflow_name = os.path.splitext(os.path.basename(file))[0]
-                workflow_templates_dict.setdefault(custom_nodes_name, []).append(
-                    workflow_name
-                )
+                workflow_templates_dict.setdefault(custom_nodes_name, []).append(workflow_name)
            return web.json_response(workflow_templates_dict)

        # Serve workflow templates from custom nodes.
        for module_name, module_dir in loadedModules:
-            for folder_name in example_workflow_folder_names:
-                workflows_dir = os.path.join(module_dir, folder_name)
-
-                if os.path.exists(workflows_dir):
-                    if folder_name != "example_workflows":
-                        logging.debug(
-                            "Found example workflow folder '%s' for custom node '%s', consider renaming it to 'example_workflows'",
-                            folder_name, module_name)
-
-                    webapp.add_routes(
-                        [
-                            web.static(
-                                "/api/workflow_templates/" + module_name, workflows_dir
-                            )
-                        ]
-                    )
-
-        @routes.get("/i18n")
-        async def get_i18n(request):
-            """Returns translations from all custom nodes' locales folders."""
-            return web.json_response(self.build_translations())
+            workflows_dir = os.path.join(module_dir, 'example_workflows')
+            if os.path.exists(workflows_dir):
+                webapp.add_routes([web.static('/api/workflow_templates/' + module_name, workflows_dir)])
--- a/comfy_api_nodes/init.py
+++ b/comfy_api_nodes/init.py
--- a/app/database/db.py
+++ b/app/database/db.py
@@ -1,112 +1,126 @@
 import logging
 import os
-import shutil
-from app.logger import log_startup_warning
-from utils.install_util import get_missing_requirements_message
+import sqlite3
+from contextlib import contextmanager
+from queue import Queue, Empty, Full
+import threading
+from app.database.updater import DatabaseUpdater
+import folder_paths
 from comfy.cli_args import args

-_DB_AVAILABLE = False
-Session = None

-
-try:
-    from alembic import command
-    from alembic.config import Config
-    from alembic.runtime.migration import MigrationContext
-    from alembic.script import ScriptDirectory
-    from sqlalchemy import create_engine
-    from sqlalchemy.orm import sessionmaker
-
-    _DB_AVAILABLE = True
-except ImportError as e:
-    log_startup_warning(
-        f"""
------------------------------------------------------------------------
-Error importing dependencies: {e}
-{get_missing_requirements_message()}
-This error is happening because ComfyUI now uses a local sqlite database.
------------------------------------------------------------------------
-""".strip()
-    )
-
-
-def dependencies_available():
-    """
-    Temporary function to check if the dependencies are available
-    """
-    return _DB_AVAILABLE
-
-
-def can_create_session():
-    """
-    Temporary function to check if the database is available to create a session
-    During initial release there may be environmental issues (or missing dependencies) that prevent the database from being created
-    """
-    return dependencies_available() and Session is not None
-
-
-def get_alembic_config():
-    root_path = os.path.join(os.path.dirname(__file__), "../..")
-    config_path = os.path.abspath(os.path.join(root_path, "alembic.ini"))
-    scripts_path = os.path.abspath(os.path.join(root_path, "alembic_db"))
-
-    config = Config(config_path)
-    config.set_main_option("script_location", scripts_path)
-    config.set_main_option("sqlalchemy.url", args.database_url)
-
-    return config
-
-
-def get_db_path():
-    url = args.database_url
-    if url.startswith("sqlite:///"):
-        return url.split("///")[1]
-    else:
-        raise ValueError(f"Unsupported database URL '{url}'.")
-
-
-def init_db():
-    db_url = args.database_url
-    logging.debug(f"Database URL: {db_url}")
-    db_path = get_db_path()
-    db_exists = os.path.exists(db_path)
-
-    config = get_alembic_config()
-
-    # Check if we need to upgrade
-    engine = create_engine(db_url)
-    conn = engine.connect()
-
-    context = MigrationContext.configure(conn)
-    current_rev = context.get_current_revision()
-
-    script = ScriptDirectory.from_config(config)
-    target_rev = script.get_current_head()
-
-    if target_rev is None:
-        logging.warning("No target revision found.")
-    elif current_rev != target_rev:
-        # Backup the database pre upgrade
-        backup_path = db_path + ".bkp"
-        if db_exists:
-            shutil.copy(db_path, backup_path)
+class Database:
+    def __init__(self, database_path=None, pool_size=1):
+        if database_path is None:
+            self.exists = False
+            database_path = "file::memory:?cache=shared"
        else:
-            backup_path = None
+            self.exists = os.path.exists(database_path)
+
+        self.database_path = database_path
+        self.pool_size = pool_size
+        # Store connections in a pool, default to 1 as normal usage is going to be from a single thread at a time
+        self.connection_pool: Queue = Queue(maxsize=pool_size)
+        self._db_lock = threading.Lock()
+        self._initialized = False
+        self._closing = False
+        self._after_update_callbacks = []
+
+    def _setup(self):
+        if self._initialized:
+            return
+
+        with self._db_lock:
+            if not self._initialized:
+                self._make_db()
+                self._initialized = True
+
+    def _create_connection(self):
+        """Open a SQLite connection with foreign-key enforcement switched on."""
+        # TODO: Catch error for sqlite lib missing on linux
+        logging.info("Creating connection to %s", self.database_path)
+        # The URI flag is needed for the "file::memory:" style in-memory path.
+        is_uri = self.database_path.startswith("file::")
+        conn = sqlite3.connect(self.database_path, check_same_thread=False, uri=is_uri)
+        # Foreign-key enforcement is a per-connection setting in SQLite.
+        conn.execute("PRAGMA foreign_keys = ON")
+        self.exists = True
+        logging.info("Connected!")
+        return conn
+
+    def _make_db(self):
+        with self._get_connection() as con:
+            updater = DatabaseUpdater(con, self.database_path)
+            result = updater.update()
+            if result is not None:
+                old_version, new_version = result
+
+                for callback in self._after_update_callbacks:
+                    callback(old_version, new_version)
+
+    def _transform(self, row, columns):
+        return {col.name: value for value, col in zip(row, columns)}
+
+    @contextmanager
+    def _get_connection(self):
+        if self._closing:
+            raise Exception("Database is shutting down")

        try:
-            command.upgrade(config, target_rev)
-            logging.info(f"Database upgraded from {current_rev} to {target_rev}")
-        except Exception as e:
-            if backup_path:
-                # Restore the database from backup if upgrade fails
-                shutil.copy(backup_path, db_path)
-                os.remove(backup_path)
-            logging.exception("Error upgrading database: ")
-            raise e
+            # Try to get connection from pool
+            connection = self.connection_pool.get_nowait()
+        except Empty:
+            # Create new connection if pool is empty
+            connection = self._create_connection()

-    global Session
-    Session = sessionmaker(bind=engine)
+        try:
+            yield connection
+        finally:
+            try:
+                # Try to add to pool if it's empty
+                self.connection_pool.put_nowait(connection)
+            except Full:
+                # Pool is full, close the connection
+                connection.close()
+
+    @contextmanager
+    def get_connection(self):
+        # Setup the database if it's not already initialized
+        self._setup()
+        with self._get_connection() as connection:
+            yield connection
+
+    def execute(self, sql, *args):
+        """Run ``sql`` with ``args`` bound and return all rows, persisting any writes."""
+        # The inner ``connection`` context commits on success and rolls back on error.
+        with self.get_connection() as connection, connection:
+            return connection.execute(sql, args).fetchall()
+
+    def register_after_update_callback(self, callback):
+        self._after_update_callbacks.append(callback)
+
+    def close(self):
+        if self._closing:
+            return
+        # Drain and close all connections in the pool
+        self._closing = True
+        while True:
+            try:
+                conn = self.connection_pool.get_nowait()
+                conn.close()
+            except Empty:
+                break
+        self._closing = False
+
+    def __del__(self):
+        try:
+            self.close()
+        except Exception:  # not bare: let KeyboardInterrupt/SystemExit propagate
+            pass  # best-effort cleanup; the instance may be only partly initialised


-def create_session():
-    return Session()
+# Create a global instance; a path of None selects the shared in-memory database
+db_path = None
+if not args.memory_database:
+    db_path = os.path.join(folder_paths.get_user_directory(), "comfyui.db")
+db = Database(db_path)
--- a/app/database/entities.py
+++ b/app/database/entities.py
@@ -0,0 +1,343 @@
+from typing import Optional, Any, Callable
+from dataclasses import dataclass
+from functools import wraps
+from aiohttp import web
+from app.database.db import db
+
+primitives = (bool, str, int, float, type(None))
+
+
+def is_primitive(obj):
+    return isinstance(obj, primitives)
+
+
+class EntityError(Exception):
+    def __init__(
+        self, message: str, field: str = None, value: Any = None, status_code: int = 400
+    ):
+        self.message = message
+        self.field = field
+        self.value = value
+        self.status_code = status_code
+        super().__init__(self.message)
+
+    def to_json(self):
+        result = {"message": self.message}
+        if self.field is not None:
+            result["field"] = self.field
+        if self.value is not None:
+            result["value"] = self.value
+        return result
+
+    def __str__(self) -> str:
+        return f"{self.message} {self.field} {self.value}"
+
+
+class EntityCommon(dict):
+    @classmethod
+    def _get_route(cls, include_key: bool):
+        route = f"/db/{cls._table_name}"
+        if include_key:
+            route += "".join([f"/{{{k}}}" for k in cls._key_columns])
+        return route
+
+    @classmethod
+    def _register_route(cls, routes, verb: str, include_key: bool, handler: Callable):
+        route = cls._get_route(include_key)
+
+        @getattr(routes, verb)(route)
+        async def _(request):
+            try:
+                data = await handler(request)
+                if data is None:
+                    return web.json_response(status=204)
+
+                return web.json_response(data)
+            except EntityError as e:
+                return web.json_response(e.to_json(), status=e.status_code)
+
+    @classmethod
+    def _transform(cls, row: list[Any]):
+        return {col: value for col, value in zip(cls._columns, row)}
+
+    @classmethod
+    def _transform_rows(cls, rows: list[list[Any]]):
+        return [cls._transform(row) for row in rows]
+
+    @classmethod
+    def _extract_key(cls, request):
+        return {key: request.match_info.get(key, None) for key in cls._key_columns}
+
+    @classmethod
+    def _validate(cls, fields: list[str], data: dict, allow_missing: bool = False):
+        result = {}
+        # `result` collects the accepted values, coerced to each column's type.
+        if not isinstance(data, dict):
+            raise EntityError("Invalid data")
+
+        # Reject any supplied field that is not one of the allowed `fields`
+        for field in data:
+            if field not in fields:
+                raise EntityError("Unknown field", field)
+
+        for key in fields:
+            col = cls._columns[key]
+            if key not in data:
+                if col.required and not allow_missing:
+                    raise EntityError("Missing field", key)
+                else:
+                    # e.g. for updates, we allow missing fields
+                    continue
+            elif data[key] is None and col.required:
+                # Don't allow None for required fields
+                raise EntityError("Required field", key)
+
+            # Validate data type: only primitive values (see `primitives`) are accepted
+            value = data[key]
+
+            if value is not None and not is_primitive(value):
+                raise EntityError("Invalid value", key, value)
+            # Coerce to the column's declared type; a failed conversion is reported as invalid
+            try:
+                type = col.type
+                if value is not None and not isinstance(value, type):
+                    value = type(value)
+                result[key] = value
+            except Exception:
+                raise EntityError("Invalid value", key, value)
+
+        return result
+
+    @classmethod
+    def _validate_id(cls, id: dict):
+        return cls._validate(cls._key_columns, id)
+
+    @classmethod
+    def _validate_data(cls, data: dict, allow_missing: bool = False):
+        return cls._validate(cls._columns.keys(), data, allow_missing)
+
+    def __setattr__(self, name, value):
+        if name in self._columns:
+            self[name] = value
+        super().__setattr__(name, value)
+
+    def __getattr__(self, name):
+        if name in self:
+            return self[name]
+        raise AttributeError(f"'{self.__class__.__name__}' has no attribute '{name}'")
+
+
+class GetEntity(EntityCommon):
+    @classmethod
+    def get(cls, top: Optional[int] = None, where: Optional[str] = None):
+        """Return the table's rows as dicts, optionally filtered and limited."""
+        # NOTE: `where` is spliced into the SQL verbatim; pass trusted text only.
+        condition = f" WHERE {where}" if where else ""
+        limit = f" LIMIT {top}" if isinstance(top, int) else ""
+        # WHERE must precede LIMIT in a SELECT statement.
+        result = db.execute(f"SELECT * FROM {cls._table_name}{condition}{limit}")
+
+        # Turn each row into a dict keyed by column name
+        return cls._transform_rows(result)
+
+    @classmethod
+    def register_route(cls, routes):
+        async def get_handler(request):
+            top = request.rel_url.query.get("top", None)
+            if top is not None:
+                try:
+                    top = int(top)
+                except Exception:
+                    raise EntityError("Invalid top parameter", "top", top)
+            return cls.get(top)
+
+        cls._register_route(routes, "get", False, get_handler)
+
+
+class GetEntityById(EntityCommon):
+    @classmethod
+    def get_by_id(cls, id: dict):
+        id = cls._validate_id(id)
+
+        result = db.execute(
+            f"SELECT * FROM {cls._table_name} WHERE {cls._where_clause}",
+            *[id[key] for key in cls._key_columns],
+        )
+
+        return cls._transform_rows(result)
+
+    @classmethod
+    def register_route(cls, routes):
+        async def get_by_id_handler(request):
+            id = cls._extract_key(request)
+            return cls.get_by_id(id)
+
+        cls._register_route(routes, "get", True, get_by_id_handler)
+
+
+class CreateEntity(EntityCommon):
+    @classmethod
+    def create(cls, data: dict, allow_upsert: bool = False):
+        data = cls._validate_data(data)
+        values = ", ".join(["?"] * len(data))
+        on_conflict = ""
+
+        data_keys = ", ".join(list(data.keys()))
+        if allow_upsert:
+            # Remove key columns from data
+            upsert_keys = [key for key in data if key not in cls._key_columns]
+
+            set_clause = ", ".join([f"{k} = excluded.{k}" for k in upsert_keys])
+            on_conflict = f" ON CONFLICT ({', '.join(cls._key_columns)}) DO UPDATE SET {set_clause}"
+        sql = f"INSERT INTO {cls._table_name} ({data_keys}) VALUES ({values}){on_conflict} RETURNING *"
+        result = db.execute(
+            sql,
+            *[data[key] for key in data],
+        )
+
+        if len(result) == 0:
+            raise EntityError("Failed to create entity", status_code=500)
+
+        return cls._transform_rows(result)[0]
+
+    @classmethod
+    def register_route(cls, routes):
+        async def create_handler(request):
+            data = await request.json()
+            return cls.create(data)
+
+        cls._register_route(routes, "post", False, create_handler)
+
+
+class UpdateEntity(EntityCommon):
+    @classmethod
+    def update(cls, id: dict, data: dict):
+        """Update the row identified by ``id`` with ``data`` and return it as a dict.
+
+        Raises EntityError (400) for invalid or empty input and (404) when no row matches.
+        """
+        id = cls._validate_id(id)
+        data = cls._validate_data(data, allow_missing=True)
+        if not data:
+            # Nothing to SET would produce invalid SQL; report a client error instead.
+            raise EntityError("No fields to update")
+        sql = f"UPDATE {cls._table_name} SET {', '.join([f'{k} = ?' for k in data])} WHERE {cls._where_clause} RETURNING *"
+        result = db.execute(sql, *data.values(), *[id[key] for key in cls._key_columns])
+        if len(result) == 0:
+            raise EntityError("Failed to update entity", status_code=404)
+        return cls._transform_rows(result)[0]
+
+    @classmethod
+    def register_route(cls, routes):
+        async def update_handler(request):
+            id = cls._extract_key(request)
+            data = await request.json()
+            return cls.update(id, data)
+
+        cls._register_route(routes, "patch", True, update_handler)
+
+
+class UpsertEntity(CreateEntity):
+    @classmethod
+    def upsert(cls, data: dict):
+        return cls.create(data, allow_upsert=True)
+
+    @classmethod
+    def register_route(cls, routes):
+        async def upsert_handler(request):
+            data = await request.json()
+            return cls.upsert(data)
+
+        cls._register_route(routes, "put", False, upsert_handler)
+
+
+class DeleteEntity(EntityCommon):
+    @classmethod
+    def delete(cls, id: list):
+        id = cls._validate_id(id)
+        db.execute(
+            f"DELETE FROM {cls._table_name} WHERE {cls._where_clause}",
+            *[id[key] for key in cls._key_columns],
+        )
+
+    @classmethod
+    def register_route(cls, routes):
+        async def delete_handler(request):
+            id = cls._extract_key(request)
+            cls.delete(id)
+
+        cls._register_route(routes, "delete", True, delete_handler)
+
+
+class BaseEntity(GetEntity, CreateEntity, UpdateEntity, DeleteEntity, GetEntityById):
+    pass
+
+
+@dataclass
+class Column:
+    type: Any
+    required: bool = False
+    key: bool = False
+    default: Any = None
+
+
+def column(type_: Any, required: bool = False, key: bool = False, default: Any = None):
+    return Column(type_, required, key, default)
+
+
+def table(table_name: str):
+    def decorator(cls):
+        # Store table name
+        cls._table_name = table_name
+
+        # Process column definitions
+        columns: dict[str, Column] = {}
+        for attr_name, attr_value in cls.__dict__.items():
+            if isinstance(attr_value, Column):
+                columns[attr_name] = attr_value
+
+        # Store columns metadata
+        cls._columns = columns
+        cls._key_columns = [col for col in columns if columns[col].key]
+        cls._column_csv = ", ".join([col for col in columns])
+        cls._where_clause = " AND ".join([f"{col} = ?" for col in cls._key_columns])
+
+        # Add initialization
+        original_init = cls.__init__
+
+        @wraps(original_init)
+        def new_init(self, *args, **kwargs):
+            # Initialize columns with default values
+            for col_name, col_def in cls._columns.items():
+                setattr(self, col_name, col_def.default)
+            # Call original init
+            original_init(self, *args, **kwargs)
+
+        cls.__init__ = new_init
+        return cls
+
+    return decorator
+
+
+def test():
+    @table("models")
+    class Model(BaseEntity):
+        id: int = column(int, required=True, key=True)
+        path: str = column(str, required=True)
+        name: str = column(str, required=True)
+        description: Optional[str] = column(str)
+        architecture: Optional[str] = column(str)
+        type: str = column(str, required=True)
+        hash: Optional[str] = column(str)
+        source_url: Optional[str] = column(str)
+
+    return Model
+
+
+@table("test")
+class Test(GetEntity, CreateEntity):
+    id: int = column(int, required=True, key=True)
+    test: str = column(str, required=True)
+
+
+Model = test()
--- a/app/database/models.py
+++ b/app/database/models.py
@@ -1,14 +0,0 @@
-from sqlalchemy.orm import declarative_base
-
-Base = declarative_base()
-
-
-def to_dict(obj):
-    fields = obj.__table__.columns.keys()
-    return {
-        field: (val.to_dict() if hasattr(val, "to_dict") else val)
-        for field in fields
-        if (val := getattr(obj, field))
-    }
-
-# TODO: Define models here
--- a/app/database/routes.py
+++ b/app/database/routes.py
@@ -0,0 +1,32 @@
+from app.database.db import db
+from aiohttp import web
+
+def create_routes(routes, prefix, entity, get=False, get_by_id=False, post=False, delete=False):
+    """Register basic REST routes for ``entity``'s table; ``delete`` is not implemented yet."""
+    table = entity._table_name  # assumes a @table-decorated entity class -- TODO confirm
+
+    if get:
+        @routes.get(f"/{prefix}/{table}")
+        async def get_table(request):
+            return web.json_response(db.execute(f"SELECT * FROM {table}"))
+
+    if get_by_id:
+        # "{{id}}" renders the literal aiohttp path variable "{id}".
+        @routes.get(f"/{prefix}/{table}/{{id}}")
+        async def get_table_by_id(request):
+            # Bind the id as a parameter instead of formatting it into the SQL.
+            rows = db.execute(f"SELECT * FROM {table} WHERE id = ?", request.match_info["id"])
+            return web.json_response(rows[0] if rows else None)
+
+    if post:
+        @routes.post(f"/{prefix}/{table}")
+        async def post_table(request):
+            data = await request.json()
+            # Identifiers cannot be bound, so keep only columns the entity declares.
+            cols = [c for c in entity._columns if c in data] if isinstance(data, dict) else []
+            if not cols:
+                return web.json_response({"status": "error"}, status=400)
+            sql = f"INSERT INTO {table} ({', '.join(cols)}) VALUES ({', '.join('?' * len(cols))})"
+            with db.get_connection() as connection, connection:
+                connection.execute(sql, [data[c] for c in cols])
+            return web.json_response({"status": "success"})
--- a/app/database/updater.py
+++ b/app/database/updater.py
@@ -0,0 +1,79 @@
+import logging
+import os
+import sqlite3
+from app.database.versions.v1 import v1
+
+
+class DatabaseUpdater:
+    def __init__(self, connection, database_path):
+        self.connection = connection
+        self.database_path = database_path
+        self.current_version = self.get_db_version()
+        self.version_updates = {
+            1: v1,
+        }
+        self.max_version = max(self.version_updates.keys())
+        self.update_required = self.current_version < self.max_version
+        logging.info(f"Database version: {self.current_version}")
+
+    def get_db_version(self):
+        return self.connection.execute("PRAGMA user_version").fetchone()[0]
+
+    def backup(self):
+        """Copy the live database to ``<database_path>.bkp`` and return that path."""
+        bkp_path = self.database_path + ".bkp"
+        if os.path.exists(bkp_path):
+            # TODO: auto-rollback failed upgrades
+            raise Exception(f"Database backup already exists, this indicates that a previous upgrade failed. Please restore this backup before continuing. Backup location: {bkp_path}")
+        bkp = sqlite3.connect(bkp_path)
+        try:
+            self.connection.backup(bkp)
+        finally:
+            bkp.close()  # release the file handle even if the copy fails
+        logging.info("Database backup taken pre-upgrade.")
+        return bkp_path
+
+    def update(self):
+        """Apply pending version scripts in order; return ``(old, new)``, or ``None`` if current."""
+        if not self.update_required:
+            return None
+
+        bkp_version = self.current_version
+        # A database still at version 0 is not backed up before upgrading.
+        bkp_path = self.backup() if self.current_version > 0 else None
+
+        logging.info(f"Updating database: {self.current_version} -> {self.max_version}")
+
+        dirname = os.path.dirname(__file__)
+        cursor = self.connection.cursor()
+        for version in range(self.current_version + 1, self.max_version + 1):
+            filename = os.path.join(dirname, f"versions/v{version}.sql")
+            if not os.path.exists(filename):
+                raise Exception(f"Database update script for version {version} not found")
+
+            try:
+                # Pin the encoding so the script reads the same on every platform.
+                with open(filename, "r", encoding="utf-8") as file:
+                    cursor.executescript(file.read())
+            except Exception as e:
+                # Chain the cause so the original traceback is preserved.
+                raise Exception(
+                    f"Failed to execute update script for version {version}: {e}"
+                ) from e
+
+            # Optional Python step that runs after the version's SQL script.
+            method = self.version_updates.get(version)
+            if method is not None:
+                method(cursor)
+
+        cursor.execute("PRAGMA user_version = %d" % self.max_version)
+        self.connection.commit()
+        cursor.close()
+        self.current_version = self.get_db_version()
+
+        if bkp_path:
+            # Keep a copy of the backup in case something goes wrong and we need to rollback
+            os.rename(bkp_path, self.database_path + f".v{bkp_version}.bkp")
+        logging.info("Upgrade to version %s successful.", self.current_version)
+
+        return (bkp_version, self.current_version)
--- a/app/database/versions/v1.py
+++ b/app/database/versions/v1.py
@@ -0,0 +1,17 @@
+from folder_paths import folder_names_and_paths, get_filename_list, get_full_path
+
+
+def v1(cursor):
+    print("Updating to v1")
+    for folder_name in folder_names_and_paths.keys():
+        if folder_name == "custom_nodes":
+            continue
+
+        files = get_filename_list(folder_name)
+        for file in files:
+            file_path = get_full_path(folder_name, file)
+            file_without_extension = file.rsplit(".", maxsplit=1)[0]
+            cursor.execute(
+                "INSERT INTO models (path, name, type) VALUES (?, ?, ?)",
+                (file_path, file_without_extension, folder_name),
+            )
--- a/app/database/versions/v1.sql
+++ b/app/database/versions/v1.sql
@@ -0,0 +1,41 @@
+CREATE TABLE IF NOT EXISTS
+    models (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        path TEXT NOT NULL,
+        name TEXT NOT NULL,
+        description TEXT,
+        architecture TEXT,
+        type TEXT NOT NULL,
+        hash TEXT,
+        source_url TEXT
+    );
+
+CREATE TABLE IF NOT EXISTS
+    tags (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        name TEXT NOT NULL UNIQUE
+    );
+
+CREATE TABLE IF NOT EXISTS
+    model_tags (
+        model_id INTEGER NOT NULL,
+        tag_id INTEGER NOT NULL,
+        PRIMARY KEY (model_id, tag_id),
+        FOREIGN KEY (model_id) REFERENCES models (id) ON DELETE CASCADE,
+        FOREIGN KEY (tag_id) REFERENCES tags (id) ON DELETE CASCADE
+    );
+
+INSERT INTO
+    tags (name)
+VALUES
+    ('character'),
+    ('style'),
+    ('concept'),
+    ('clothing'),
+    ('poses'),
+    ('background'),
+    ('vehicle'),
+    ('buildings'),
+    ('objects'),
+    ('animal'),
+    ('action');
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@@ -3,90 +3,16 @@ import argparse
 import logging
 import os
 import re
-import sys
 import tempfile
 import zipfile
-import importlib
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
 from typing import TypedDict, Optional
-from importlib.metadata import version

 import requests
 from typing_extensions import NotRequired
-
-from utils.install_util import get_missing_requirements_message, requirements_path
-
 from comfy.cli_args import DEFAULT_VERSION_STRING
-import app.logger
-
-
-def frontend_install_warning_message():
-    return f"""
-{get_missing_requirements_message()}
-
-This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.
-""".strip()
-
-def parse_version(version: str) -> tuple[int, int, int]:
-        return tuple(map(int, version.split(".")))
-
-def is_valid_version(version: str) -> bool:
-    """Validate if a string is a valid semantic version (X.Y.Z format)."""
-    pattern = r"^(\d+)\.(\d+)\.(\d+)$"
-    return bool(re.match(pattern, version))
-
-def get_installed_frontend_version():
-    """Get the currently installed frontend package version."""
-    frontend_version_str = version("comfyui-frontend-package")
-    return frontend_version_str
-
-def get_required_frontend_version():
-    """Get the required frontend version from requirements.txt."""
-    try:
-        with open(requirements_path, "r", encoding="utf-8") as f:
-            for line in f:
-                line = line.strip()
-                if line.startswith("comfyui-frontend-package=="):
-                    version_str = line.split("==")[-1]
-                    if not is_valid_version(version_str):
-                        logging.error(f"Invalid version format in requirements.txt: {version_str}")
-                        return None
-                    return version_str
-            logging.error("comfyui-frontend-package not found in requirements.txt")
-            return None
-    except FileNotFoundError:
-        logging.error("requirements.txt not found. Cannot determine required frontend version.")
-        return None
-    except Exception as e:
-        logging.error(f"Error reading requirements.txt: {e}")
-        return None
-
-def check_frontend_version():
-    """Check if the frontend version is up to date."""
-
-    try:
-        frontend_version_str = get_installed_frontend_version()
-        frontend_version = parse_version(frontend_version_str)
-        required_frontend_str = get_required_frontend_version()
-        required_frontend = parse_version(required_frontend_str)
-        if frontend_version < required_frontend:
-            app.logger.log_startup_warning(
-                f"""
-________________________________________________________________________
-WARNING WARNING WARNING WARNING WARNING
-
-Installed frontend version {".".join(map(str, frontend_version))} is lower than the recommended version {".".join(map(str, required_frontend))}.
-
-{frontend_install_warning_message()}
-________________________________________________________________________
-""".strip()
-            )
-        else:
-            logging.info("ComfyUI frontend version: {}".format(frontend_version_str))
-    except Exception as e:
-        logging.error(f"Failed to check frontend version: {e}")


 REQUEST_TIMEOUT = 10  # seconds
@@ -142,22 +68,9 @@ class FrontEndProvider:
        response.raise_for_status()  # Raises an HTTPError if the response was an error
        return response.json()

-    @cached_property
-    def latest_prerelease(self) -> Release:
-        """Get the latest pre-release version - even if it's older than the latest release"""
-        release = [release for release in self.all_releases if release["prerelease"]]
-
-        if not release:
-            raise ValueError("No pre-releases found")
-
-        # GitHub returns releases in reverse chronological order, so first is latest
-        return release[0]
-
    def get_release(self, version: str) -> Release:
        if version == "latest":
            return self.latest_release
-        elif version == "prerelease":
-            return self.latest_prerelease
        else:
            for release in self.all_releases:
                if release["tag_name"] in [version, f"v{version}"]:
@@ -196,67 +109,9 @@ def download_release_asset_zip(release: Release, destination_path: str) -> None:


 class FrontendManager:
+    DEFAULT_FRONTEND_PATH = str(Path(__file__).parents[1] / "web")
    CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions")

-    @classmethod
-    def get_required_frontend_version(cls) -> str:
-        """Get the required frontend package version."""
-        return get_required_frontend_version()
-
-    @classmethod
-    def default_frontend_path(cls) -> str:
-        try:
-            import comfyui_frontend_package
-
-            return str(importlib.resources.files(comfyui_frontend_package) / "static")
-        except ImportError:
-            logging.error(
-                f"""
-********** ERROR ***********
-
-comfyui-frontend-package is not installed.
-
-{frontend_install_warning_message()}
-
-********** ERROR ***********
-""".strip()
-            )
-            sys.exit(-1)
-
-    @classmethod
-    def templates_path(cls) -> str:
-        try:
-            import comfyui_workflow_templates
-
-            return str(
-                importlib.resources.files(comfyui_workflow_templates) / "templates"
-            )
-        except ImportError:
-            logging.error(
-                f"""
-********** ERROR ***********
-
-comfyui-workflow-templates is not installed.
-
-{frontend_install_warning_message()}
-
-********** ERROR ***********
-""".strip()
-            )
-
-    @classmethod
-    def embedded_docs_path(cls) -> str:
-        """Get the path to embedded documentation"""
-        try:
-            import comfyui_embedded_docs
-
-            return str(
-                importlib.resources.files(comfyui_embedded_docs) / "docs"
-            )
-        except ImportError:
-            logging.info("comfyui-embedded-docs package not found")
-            return None
-
    @classmethod
    def parse_version_string(cls, value: str) -> tuple[str, str, str]:
        """
@@ -269,7 +124,7 @@ comfyui-workflow-templates is not installed.
        Raises:
            argparse.ArgumentTypeError: If the version string is invalid.
        """
-        VERSION_PATTERN = r"^([a-zA-Z0-9][a-zA-Z0-9-]{0,38})/([a-zA-Z0-9_.-]+)@(v?\d+\.\d+\.\d+[-._a-zA-Z0-9]*|latest|prerelease)$"
+        VERSION_PATTERN = r"^([a-zA-Z0-9][a-zA-Z0-9-]{0,38})/([a-zA-Z0-9_.-]+)@(v?\d+\.\d+\.\d+|latest)$"
        match_result = re.match(VERSION_PATTERN, value)
        if match_result is None:
            raise argparse.ArgumentTypeError(f"Invalid version string: {value}")
@@ -277,9 +132,7 @@ comfyui-workflow-templates is not installed.
        return match_result.group(1), match_result.group(2), match_result.group(3)

    @classmethod
-    def init_frontend_unsafe(
-        cls, version_string: str, provider: Optional[FrontEndProvider] = None
-    ) -> str:
+    def init_frontend_unsafe(cls, version_string: str, provider: Optional[FrontEndProvider] = None) -> str:
        """
        Initializes the frontend for the specified version.

@@ -295,26 +148,17 @@ comfyui-workflow-templates is not installed.
            main error source might be request timeout or invalid URL.
        """
        if version_string == DEFAULT_VERSION_STRING:
-            check_frontend_version()
-            return cls.default_frontend_path()
+            return cls.DEFAULT_FRONTEND_PATH

        repo_owner, repo_name, version = cls.parse_version_string(version_string)

        if version.startswith("v"):
-            expected_path = str(
-                Path(cls.CUSTOM_FRONTENDS_ROOT)
-                / f"{repo_owner}_{repo_name}"
-                / version.lstrip("v")
-            )
+            expected_path = str(Path(cls.CUSTOM_FRONTENDS_ROOT) / f"{repo_owner}_{repo_name}" / version.lstrip("v"))
            if os.path.exists(expected_path):
-                logging.info(
-                    f"Using existing copy of specific frontend version tag: {repo_owner}/{repo_name}@{version}"
-                )
+                logging.info(f"Using existing copy of specific frontend version tag: {repo_owner}/{repo_name}@{version}")
                return expected_path

-        logging.info(
-            f"Initializing frontend: {repo_owner}/{repo_name}@{version}, requesting version details from GitHub..."
-        )
+        logging.info(f"Initializing frontend: {repo_owner}/{repo_name}@{version}, requesting version details from GitHub...")

        provider = provider or FrontEndProvider(repo_owner, repo_name)
        release = provider.get_release(version)
@@ -357,5 +201,4 @@ comfyui-workflow-templates is not installed.
        except Exception as e:
            logging.error("Failed to initialize frontend: %s", e)
            logging.info("Falling back to the default frontend.")
-            check_frontend_version()
-            return cls.default_frontend_path()
+            return cls.DEFAULT_FRONTEND_PATH
--- a/app/logger.py
+++ b/app/logger.py
@@ -82,17 +82,3 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool
        logger.addHandler(stdout_handler)

    logger.addHandler(stream_handler)
-
-
-STARTUP_WARNINGS = []
-
-
-def log_startup_warning(msg):
-    logging.warning(msg)
-    STARTUP_WARNINGS.append(msg)
-
-
-def print_startup_warnings():
-    for s in STARTUP_WARNINGS:
-        logging.warning(s)
-    STARTUP_WARNINGS.clear()
--- a/app/model_hasher.py
+++ b/app/model_hasher.py
@@ -0,0 +1,97 @@
+import hashlib
+import logging
+import threading
+import time
+from comfy.cli_args import args
+
+
+class ModelHasher:
+    """Compute SHA-256 hashes for models on a background daemon thread.
+
+    The model entity passed to ``start`` is queried for rows whose hash is
+    still NULL; each file is hashed and the digest written back via ``update``.
+    """
+
+    # Size in bytes of the reusable buffer used when reading a file.
+    CHUNK_SIZE = 128 * 1024
+
+    def __init__(self):
+        self._thread = None
+        self._lock = threading.Lock()
+        self._model_entity = None
+        # Set by start() so a worker about to exit performs one more scan.
+        self._rescan_requested = False
+        # Paths that could not be read; skipped so that a broken file cannot
+        # keep the worker re-querying and retrying it forever.
+        self._failed_paths = set()
+
+    def start(self, model_entity):
+        """Ensure a worker is hashing the unhashed models of ``model_entity``.
+
+        Does nothing when ``--disable-model-hashing`` is set. Safe to call
+        repeatedly and from several threads.
+        """
+        if args.disable_model_hashing:
+            return
+
+        with self._lock:
+            self._model_entity = model_entity
+            self._rescan_requested = True
+            # A worker killed by an unexpected exception leaves a dead Thread
+            # object behind, so check liveness rather than only "is None".
+            if self._thread is None or not self._thread.is_alive():
+                self._failed_paths.clear()  # retry earlier failures on a fresh run
+                self._thread = threading.Thread(target=self._hash_models, daemon=True)
+                self._thread.start()
+
+    def _get_models(self):
+        """Return the models that do not have a hash yet."""
+        return self._model_entity.get("WHERE hash IS NULL")
+
+    def _hash_model(self, model_path):
+        """Return the hex SHA-256 digest of the file at ``model_path``."""
+        digest = hashlib.sha256()
+        view = memoryview(bytearray(self.CHUNK_SIZE))
+        # Unbuffered readinto() into one reusable buffer avoids per-chunk copies.
+        with open(model_path, "rb", buffering=0) as f:
+            while n := f.readinto(view):
+                digest.update(view[:n])
+        return digest.hexdigest()
+
+    def _hash_models(self):
+        """Worker loop: hash pending models until none are left."""
+        while True:
+            with self._lock:
+                self._rescan_requested = False
+
+            pending = [m for m in self._get_models() if m["path"] not in self._failed_paths]
+            for model in pending:
+                time.sleep(0)  # give other threads a chance to run between files
+                self._hash_and_store(model)
+
+            if pending:
+                continue  # new rows may have appeared while hashing
+
+            with self._lock:
+                # Decide to exit under the lock so a concurrent start() either
+                # sees the thread cleared or has its rescan request honoured.
+                if not self._rescan_requested:
+                    self._thread = None
+                    return
+
+    def _hash_and_store(self, model):
+        """Hash one model and persist the digest; record unreadable files."""
+        path = model["path"]
+        started = time.perf_counter()
+        logging.info(f"Hashing model {path}")
+        try:
+            digest = self._hash_model(path)
+        except OSError as e:
+            logging.warning(f"Unable to hash model {path}: {e}")
+            self._failed_paths.add(path)
+            return
+        logging.info(f"Hashed model {path} in {time.perf_counter() - started} seconds")
+        self._model_entity.update((model["id"],), {"hash": digest})
+
+
+model_hasher = ModelHasher()
--- a/app/model_manager.py
+++ b/app/model_manager.py
@@ -130,21 +130,10 @@ class ModelFileManager:

            for file_name in filenames:
                try:
-                    full_path = os.path.join(dirpath, file_name)
-                    relative_path = os.path.relpath(full_path, directory)
-
-                    # Get file metadata
-                    file_info = {
-                        "name": relative_path,
-                        "pathIndex": pathIndex,
-                        "modified": os.path.getmtime(full_path),  # Add modification time
-                        "created": os.path.getctime(full_path),   # Add creation time
-                        "size": os.path.getsize(full_path)        # Add file size
-                    }
-                    result.append(file_info)
-
-                except Exception as e:
-                    logging.warning(f"Warning: Unable to access {file_name}. Error: {e}. Skipping this file.")
+                    relative_path = os.path.relpath(os.path.join(dirpath, file_name), directory)
+                    result.append(relative_path)
+                except:
+                    logging.warning(f"Warning: Unable to access {file_name}. Skipping this file.")
                    continue

            for d in subdirs:
@@ -155,7 +144,7 @@ class ModelFileManager:
                    logging.warning(f"Warning: Unable to access {path}. Skipping this path.")
                    continue

-        return result, dirs, time.perf_counter()
+        return [{"name": f, "pathIndex": pathIndex} for f in result], dirs, time.perf_counter()

    def get_model_previews(self, filepath: str) -> list[str | BytesIO]:
        dirname = os.path.dirname(filepath)
--- a/app/user_manager.py
+++ b/app/user_manager.py
@@ -20,15 +20,13 @@ class FileInfo(TypedDict):
    path: str
    size: int
    modified: int
-    created: int


 def get_file_info(path: str, relative_to: str) -> FileInfo:
    return {
        "path": os.path.relpath(path, relative_to).replace(os.sep, '/'),
        "size": os.path.getsize(path),
-        "modified": os.path.getmtime(path),
-        "created": os.path.getctime(path)
+        "modified": os.path.getmtime(path)
    }


@@ -199,112 +197,6 @@ class UserManager():

            return web.json_response(results)

-        @routes.get("/v2/userdata")
-        async def list_userdata_v2(request):
-            """
-            List files and directories in a user's data directory.
-
-            This endpoint provides a structured listing of contents within a specified
-            subdirectory of the user's data storage.
-
-            Query Parameters:
-            - path (optional): The relative path within the user's data directory
-                               to list. Defaults to the root ('').
-
-            Returns:
-            - 400: If the requested path is invalid, outside the user's data directory, or is not a directory.
-            - 404: If the requested path does not exist.
-            - 403: If the user is invalid.
-            - 500: If there is an error reading the directory contents.
-            - 200: JSON response containing a list of file and directory objects.
-                   Each object includes:
-                   - name: The name of the file or directory.
-                   - type: 'file' or 'directory'.
-                   - path: The relative path from the user's data root.
-                   - size (for files): The size in bytes.
-                   - modified (for files): The last modified timestamp (Unix epoch).
-            """
-            requested_rel_path = request.rel_url.query.get('path', '')
-
-            # URL-decode the path parameter
-            try:
-                requested_rel_path = parse.unquote(requested_rel_path)
-            except Exception as e:
-                logging.warning(f"Failed to decode path parameter: {requested_rel_path}, Error: {e}")
-                return web.Response(status=400, text="Invalid characters in path parameter")
-
-
-            # Check user validity and get the absolute path for the requested directory
-            try:
-                 base_user_path = self.get_request_user_filepath(request, None, create_dir=False)
-
-                 if requested_rel_path:
-                     target_abs_path = self.get_request_user_filepath(request, requested_rel_path, create_dir=False)
-                 else:
-                     target_abs_path = base_user_path
-
-            except KeyError as e:
-                 # Invalid user detected by get_request_user_id inside get_request_user_filepath
-                 logging.warning(f"Access denied for user: {e}")
-                 return web.Response(status=403, text="Invalid user specified in request")
-
-
-            if not target_abs_path:
-                 # Path traversal or other issue detected by get_request_user_filepath
-                 return web.Response(status=400, text="Invalid path requested")
-
-            # Handle cases where the user directory or target path doesn't exist
-            if not os.path.exists(target_abs_path):
-                # Check if it's the base user directory that's missing (new user case)
-                if target_abs_path == base_user_path:
-                    # It's okay if the base user directory doesn't exist yet, return empty list
-                     return web.json_response([])
-                else:
-                    # A specific subdirectory was requested but doesn't exist
-                     return web.Response(status=404, text="Requested path not found")
-
-            if not os.path.isdir(target_abs_path):
-                 return web.Response(status=400, text="Requested path is not a directory")
-
-            results = []
-            try:
-                for root, dirs, files in os.walk(target_abs_path, topdown=True):
-                    # Process directories
-                    for dir_name in dirs:
-                        dir_path = os.path.join(root, dir_name)
-                        rel_path = os.path.relpath(dir_path, base_user_path).replace(os.sep, '/')
-                        results.append({
-                            "name": dir_name,
-                            "path": rel_path,
-                            "type": "directory"
-                        })
-
-                    # Process files
-                    for file_name in files:
-                        file_path = os.path.join(root, file_name)
-                        rel_path = os.path.relpath(file_path, base_user_path).replace(os.sep, '/')
-                        entry_info = {
-                            "name": file_name,
-                            "path": rel_path,
-                            "type": "file"
-                        }
-                        try:
-                            stats = os.stat(file_path) # Use os.stat for potentially better performance with os.walk
-                            entry_info["size"] = stats.st_size
-                            entry_info["modified"] = stats.st_mtime
-                        except OSError as stat_error:
-                            logging.warning(f"Could not stat file {file_path}: {stat_error}")
-                            pass # Include file with available info
-                        results.append(entry_info)
-            except OSError as e:
-                logging.error(f"Error listing directory {target_abs_path}: {e}")
-                return web.Response(status=500, text="Error reading directory contents")
-
-            # Sort results alphabetically, directories first then files
-            results.sort(key=lambda x: (x['type'] != 'directory', x['name'].lower()))
-
-            return web.json_response(results)
-
        def get_user_data_path(request, check_exists = False, param = "file"):
            file = request.match_info.get(param, None)
            if not file:
@@ -363,17 +255,10 @@ class UserManager():
            if not overwrite and os.path.exists(path):
                return web.Response(status=409, text="File already exists")

-            try:
-                body = await request.read()
+            body = await request.read()

-                with open(path, "wb") as f:
-                    f.write(body)
-            except OSError as e:
-                logging.warning(f"Error saving file '{path}': {e}")
-                return web.Response(
-                    status=400,
-                    reason="Invalid filename. Please avoid special characters like :\\/*?\"<>|"
-                )
+            with open(path, "wb") as f:
+                f.write(body)

            user_path = self.get_request_user_filepath(request, None)
            if full_info:
--- a/comfy/audio_encoders/audio_encoders.py
+++ b/comfy/audio_encoders/audio_encoders.py
@@ -1,42 +0,0 @@
-from .wav2vec2 import Wav2Vec2Model
-import comfy.model_management
-import comfy.ops
-import comfy.utils
-import logging
-import torchaudio
-
-
-class AudioEncoderModel():
-    def __init__(self, config):
-        self.load_device = comfy.model_management.text_encoder_device()
-        offload_device = comfy.model_management.text_encoder_offload_device()
-        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
-        self.model = Wav2Vec2Model(dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast)
-        self.model.eval()
-        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
-        self.model_sample_rate = 16000
-
-    def load_sd(self, sd):
-        return self.model.load_state_dict(sd, strict=False)
-
-    def get_sd(self):
-        return self.model.state_dict()
-
-    def encode_audio(self, audio, sample_rate):
-        comfy.model_management.load_model_gpu(self.patcher)
-        audio = torchaudio.functional.resample(audio, sample_rate, self.model_sample_rate)
-        out, all_layers = self.model(audio.to(self.load_device))
-        outputs = {}
-        outputs["encoded_audio"] = out
-        outputs["encoded_audio_all_layers"] = all_layers
-        return outputs
-
-
-def load_audio_encoder_from_sd(sd, prefix=""):
-    audio_encoder = AudioEncoderModel(None)
-    sd = comfy.utils.state_dict_prefix_replace(sd, {"wav2vec2.": ""})
-    m, u = audio_encoder.load_sd(sd)
-    if len(m) > 0:
-        logging.warning("missing audio encoder: {}".format(m))
-
-    return audio_encoder
--- a/comfy/audio_encoders/wav2vec2.py
+++ b/comfy/audio_encoders/wav2vec2.py
@@ -1,207 +0,0 @@
-import torch
-import torch.nn as nn
-from comfy.ldm.modules.attention import optimized_attention_masked
-
-
-class LayerNormConv(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype)
-        self.layer_norm = operations.LayerNorm(out_channels, elementwise_affine=True, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x = self.conv(x)
-        return torch.nn.functional.gelu(self.layer_norm(x.transpose(-2, -1)).transpose(-2, -1))
-
-
-class ConvFeatureEncoder(nn.Module):
-    def __init__(self, conv_dim, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.conv_layers = nn.ModuleList([
-            LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations),
-            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
-            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
-            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
-            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
-            LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
-            LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
-        ])
-
-    def forward(self, x):
-        x = x.unsqueeze(1)
-
-        for conv in self.conv_layers:
-            x = conv(x)
-
-        return x.transpose(1, 2)
-
-
-class FeatureProjection(nn.Module):
-    def __init__(self, conv_dim, embed_dim, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.layer_norm = operations.LayerNorm(conv_dim, eps=1e-05, device=device, dtype=dtype)
-        self.projection = operations.Linear(conv_dim, embed_dim, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x = self.layer_norm(x)
-        x = self.projection(x)
-        return x
-
-
-class PositionalConvEmbedding(nn.Module):
-    def __init__(self, embed_dim=768, kernel_size=128, groups=16):
-        super().__init__()
-        self.conv = nn.Conv1d(
-            embed_dim,
-            embed_dim,
-            kernel_size=kernel_size,
-            padding=kernel_size // 2,
-            groups=groups,
-        )
-        self.conv = torch.nn.utils.parametrizations.weight_norm(self.conv, name="weight", dim=2)
-        self.activation = nn.GELU()
-
-    def forward(self, x):
-        x = x.transpose(1, 2)
-        x = self.conv(x)[:, :, :-1]
-        x = self.activation(x)
-        x = x.transpose(1, 2)
-        return x
-
-
-class TransformerEncoder(nn.Module):
-    def __init__(
-        self,
-        embed_dim=768,
-        num_heads=12,
-        num_layers=12,
-        mlp_ratio=4.0,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-
-        self.pos_conv_embed = PositionalConvEmbedding(embed_dim=embed_dim)
-        self.layers = nn.ModuleList([
-            TransformerEncoderLayer(
-                embed_dim=embed_dim,
-                num_heads=num_heads,
-                mlp_ratio=mlp_ratio,
-                device=device, dtype=dtype, operations=operations
-            )
-            for _ in range(num_layers)
-        ])
-
-        self.layer_norm = operations.LayerNorm(embed_dim, eps=1e-05, device=device, dtype=dtype)
-
-    def forward(self, x, mask=None):
-        x = x + self.pos_conv_embed(x)
-        all_x = ()
-        for layer in self.layers:
-            all_x += (x,)
-            x = layer(x, mask)
-        x = self.layer_norm(x)
-        all_x += (x,)
-        return x, all_x
-
-
-class Attention(nn.Module):
-    def __init__(self, embed_dim, num_heads, bias=True, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.head_dim = embed_dim // num_heads
-
-        self.k_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
-        self.v_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
-        self.q_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
-        self.out_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
-
-    def forward(self, x, mask=None):
-        assert (mask is None)  # TODO?
-        q = self.q_proj(x)
-        k = self.k_proj(x)
-        v = self.v_proj(x)
-
-        out = optimized_attention_masked(q, k, v, self.num_heads)
-        return self.out_proj(out)
-
-
-class FeedForward(nn.Module):
-    def __init__(self, embed_dim, mlp_ratio, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.intermediate_dense = operations.Linear(embed_dim, int(embed_dim * mlp_ratio), device=device, dtype=dtype)
-        self.output_dense = operations.Linear(int(embed_dim * mlp_ratio), embed_dim, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x = self.intermediate_dense(x)
-        x = torch.nn.functional.gelu(x)
-        x = self.output_dense(x)
-        return x
-
-
-class TransformerEncoderLayer(nn.Module):
-    def __init__(
-        self,
-        embed_dim=768,
-        num_heads=12,
-        mlp_ratio=4.0,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-
-        self.attention = Attention(embed_dim, num_heads, device=device, dtype=dtype, operations=operations)
-
-        self.layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype)
-        self.feed_forward = FeedForward(embed_dim, mlp_ratio, device=device, dtype=dtype, operations=operations)
-        self.final_layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype)
-
-    def forward(self, x, mask=None):
-        residual = x
-        x = self.layer_norm(x)
-        x = self.attention(x, mask=mask)
-        x = residual + x
-
-        x = x + self.feed_forward(self.final_layer_norm(x))
-        return x
-
-
-class Wav2Vec2Model(nn.Module):
-    """Complete Wav2Vec 2.0 model."""
-
-    def __init__(
-        self,
-        embed_dim=1024,
-        final_dim=256,
-        num_heads=16,
-        num_layers=24,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-
-        conv_dim = 512
-        self.feature_extractor = ConvFeatureEncoder(conv_dim, device=device, dtype=dtype, operations=operations)
-        self.feature_projection = FeatureProjection(conv_dim, embed_dim, device=device, dtype=dtype, operations=operations)
-
-        self.masked_spec_embed = nn.Parameter(torch.empty(embed_dim, device=device, dtype=dtype))
-
-        self.encoder = TransformerEncoder(
-            embed_dim=embed_dim,
-            num_heads=num_heads,
-            num_layers=num_layers,
-            device=device, dtype=dtype, operations=operations
-        )
-
-    def forward(self, x, mask_time_indices=None, return_dict=False):
-
-        x = torch.mean(x, dim=1)
-
-        x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7)
-
-        features = self.feature_extractor(x)
-        features = self.feature_projection(features)
-
-        batch_size, seq_len, _ = features.shape
-
-        x, all_x = self.encoder(features)
-
-        return x, all_x
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -1,6 +1,7 @@
 import argparse
 import enum
 import os
+from typing import Optional
 import comfy.options


@@ -42,15 +43,13 @@ parser.add_argument("--tls-certfile", type=str, help="Path to TLS (SSL) certific
 parser.add_argument("--enable-cors-header", type=str, default=None, metavar="ORIGIN", nargs="?", const="*", help="Enable CORS (Cross-Origin Resource Sharing) with optional origin or allow all with default '*'.")
 parser.add_argument("--max-upload-size", type=float, default=100, help="Set the maximum upload size in MB.")

-parser.add_argument("--base-directory", type=str, default=None, help="Set the ComfyUI base directory for models, custom_nodes, input, output, temp, and user directories.")
 parser.add_argument("--extra-model-paths-config", type=str, default=None, metavar="PATH", nargs='+', action='append', help="Load one or more extra_model_paths.yaml files.")
-parser.add_argument("--output-directory", type=str, default=None, help="Set the ComfyUI output directory. Overrides --base-directory.")
-parser.add_argument("--temp-directory", type=str, default=None, help="Set the ComfyUI temp directory (default is in the ComfyUI directory). Overrides --base-directory.")
-parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
+parser.add_argument("--output-directory", type=str, default=None, help="Set the ComfyUI output directory.")
+parser.add_argument("--temp-directory", type=str, default=None, help="Set the ComfyUI temp directory (default is in the ComfyUI directory).")
+parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory.")
 parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
 parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
-parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.")
-parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.")
+parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.")
 cm_group = parser.add_mutually_exclusive_group()
 cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
 cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Disable cudaMallocAsync.")
@@ -67,7 +66,6 @@ fpunet_group.add_argument("--bf16-unet", action="store_true", help="Run the diff
 fpunet_group.add_argument("--fp16-unet", action="store_true", help="Run the diffusion model in fp16")
 fpunet_group.add_argument("--fp8_e4m3fn-unet", action="store_true", help="Store unet weights in fp8_e4m3fn.")
 fpunet_group.add_argument("--fp8_e5m2-unet", action="store_true", help="Store unet weights in fp8_e5m2.")
-fpunet_group.add_argument("--fp8_e8m0fnu-unet", action="store_true", help="Store unet weights in fp8_e8m0fnu.")

 fpvae_group = parser.add_mutually_exclusive_group()
 fpvae_group.add_argument("--fp16-vae", action="store_true", help="Run the VAE in fp16, might cause black images.")
@@ -81,7 +79,6 @@ fpte_group.add_argument("--fp8_e4m3fn-text-enc", action="store_true", help="Stor
 fpte_group.add_argument("--fp8_e5m2-text-enc", action="store_true", help="Store text encoder weights in fp8 (e5m2 variant).")
 fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
 fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
-fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.")

 parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")

@@ -89,7 +86,6 @@ parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE"

 parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
 parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.")
-parser.add_argument("--supports-fp8-compute", action="store_true", help="ComfyUI will act like if the device supports fp8 compute.")

 class LatentPreviewMethod(enum.Enum):
    NoPreviews = "none"
@@ -104,14 +100,12 @@ parser.add_argument("--preview-size", type=int, default=512, help="Sets the maxi
 cache_group = parser.add_mutually_exclusive_group()
 cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
 cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
-cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")

 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
 attn_group.add_argument("--use-quad-cross-attention", action="store_true", help="Use the sub-quadratic cross attention optimization . Ignored when xformers is used.")
 attn_group.add_argument("--use-pytorch-cross-attention", action="store_true", help="Use the new pytorch 2.0 cross attention function.")
 attn_group.add_argument("--use-sage-attention", action="store_true", help="Use sage attention.")
-attn_group.add_argument("--use-flash-attention", action="store_true", help="Use FlashAttention.")

 parser.add_argument("--disable-xformers", action="store_true", help="Disable xformers.")

@@ -130,25 +124,12 @@ vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for e

 parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")

-parser.add_argument("--async-offload", action="store_true", help="Use async weight offloading.")
-
-parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.")

 parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")

 parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
 parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
-
-class PerformanceFeature(enum.Enum):
-    Fp16Accumulation = "fp16_accumulation"
-    Fp8MatrixMultiplication = "fp8_matrix_mult"
-    CublasOps = "cublas_ops"
-    AutoTune = "autotune"
-
-parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
-
-parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
-parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.")
+parser.add_argument("--fast", action="store_true", help="Enable some untested and potentially quality deteriorating optimizations.")

 parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
 parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
@@ -156,17 +137,19 @@ parser.add_argument("--windows-standalone-build", action="store_true", help="Win

 parser.add_argument("--disable-metadata", action="store_true", help="Disable saving prompt metadata in files.")
 parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Disable loading all custom nodes.")
-parser.add_argument("--whitelist-custom-nodes", type=str, nargs='+', default=[], help="Specify custom node folders to load even when --disable-all-custom-nodes is enabled.")
-parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes.")

 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")

 parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level')
 parser.add_argument("--log-stdout", action="store_true", help="Send normal process output to stdout instead of stderr (default).")

+parser.add_argument("--memory-database", default=False, action="store_true", help="Use an in-memory database instead of a file-based one.")
+parser.add_argument("--disable-model-hashing", action="store_true", help="Disable model hashing.")
+
 # The default built-in provider hosted under web/
 DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest"

+
 parser.add_argument(
    "--front-end-version",
    type=str,
@@ -181,14 +164,13 @@ parser.add_argument(
    """,
 )

-def is_valid_directory(path: str) -> str:
-    """Validate if the given path is a directory, and check permissions."""
-    if not os.path.exists(path):
-        raise argparse.ArgumentTypeError(f"The path '{path}' does not exist.")
+def is_valid_directory(path: Optional[str]) -> Optional[str]:
+    """Validate if the given path is a directory."""
+    if path is None:
+        return None
+
    if not os.path.isdir(path):
-        raise argparse.ArgumentTypeError(f"'{path}' is not a directory.")
-    if not os.access(path, os.R_OK):
-        raise argparse.ArgumentTypeError(f"You do not have read permissions for '{path}'.")
+        raise argparse.ArgumentTypeError(f"{path} is not a valid directory.")
    return path

 parser.add_argument(
@@ -198,21 +180,7 @@ parser.add_argument(
    help="The local filesystem path to the directory where the frontend is located. Overrides --front-end-version.",
 )

-parser.add_argument("--user-directory", type=is_valid_directory, default=None, help="Set the ComfyUI user directory with an absolute path. Overrides --base-directory.")
-
-parser.add_argument("--enable-compress-response-body", action="store_true", help="Enable compressing response body.")
-
-parser.add_argument(
-    "--comfy-api-base",
-    type=str,
-    default="https://api.comfy.org",
-    help="Set the base URL for the ComfyUI API.  (default: https://api.comfy.org)",
-)
-
-database_default_path = os.path.abspath(
-    os.path.join(os.path.dirname(__file__), "..", "user", "comfyui.db")
-)
-parser.add_argument("--database-url", type=str, default=f"sqlite:///{database_default_path}", help="Specify the database URL, e.g. for an in-memory database you can use 'sqlite:///:memory:'.")
+parser.add_argument("--user-directory", type=is_valid_directory, default=None, help="Set the ComfyUI user directory with an absolute path.")

 if comfy.options.args_parsing:
    args = parser.parse_args()
@@ -224,17 +192,3 @@ if args.windows_standalone_build:

 if args.disable_auto_launch:
    args.auto_launch = False
-
-if args.force_fp16:
-    args.fp16_unet = True
-
-
-# '--fast' is not provided, use an empty set
-if args.fast is None:
-    args.fast = set()
-# '--fast' is provided with an empty list, enable all optimizations
-elif args.fast == []:
-    args.fast = set(PerformanceFeature)
-# '--fast' is provided with a list of performance features, use that list
-else:
-    args.fast = set(args.fast)
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@@ -97,19 +97,14 @@ class CLIPTextModel_(torch.nn.Module):
        self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
        self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)

-    def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32, embeds_info=[]):
-        if embeds is not None:
-            x = embeds + comfy.ops.cast_to(self.embeddings.position_embedding.weight, dtype=dtype, device=embeds.device)
-        else:
-            x = self.embeddings(input_tokens, dtype=dtype)
-
+    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
+        x = self.embeddings(input_tokens, dtype=dtype)
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
-            mask = mask.masked_fill(mask.to(torch.bool), -torch.finfo(x.dtype).max)
-
-        causal_mask = torch.full((x.shape[1], x.shape[1]), -torch.finfo(x.dtype).max, dtype=x.dtype, device=x.device).triu_(1)
+            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))

+        causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
        if mask is not None:
            mask += causal_mask
        else:
@@ -120,10 +115,7 @@ class CLIPTextModel_(torch.nn.Module):
        if i is not None and final_layer_norm_intermediate:
            i = self.final_layer_norm(i)

-        if num_tokens is not None:
-            pooled_output = x[list(range(x.shape[0])), list(map(lambda a: a - 1, num_tokens))]
-        else:
-            pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
+        pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
        return x, i, pooled_output

 class CLIPTextModel(torch.nn.Module):
@@ -211,15 +203,6 @@ class CLIPVision(torch.nn.Module):
            pooled_output = self.post_layernorm(x[:, 0, :])
        return x, i, pooled_output

-class LlavaProjector(torch.nn.Module):
-    def __init__(self, in_dim, out_dim, dtype, device, operations):
-        super().__init__()
-        self.linear_1 = operations.Linear(in_dim, out_dim, bias=True, device=device, dtype=dtype)
-        self.linear_2 = operations.Linear(out_dim, out_dim, bias=True, device=device, dtype=dtype)
-
-    def forward(self, x):
-        return self.linear_2(torch.nn.functional.gelu(self.linear_1(x[:, 1:])))
-
 class CLIPVisionModelProjection(torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
@@ -229,16 +212,7 @@ class CLIPVisionModelProjection(torch.nn.Module):
        else:
            self.visual_projection = lambda a: a

-        if "llava3" == config_dict.get("projector_type", None):
-            self.multi_modal_projector = LlavaProjector(config_dict["hidden_size"], 4096, dtype, device, operations)
-        else:
-            self.multi_modal_projector = None
-
    def forward(self, *args, **kwargs):
        x = self.vision_model(*args, **kwargs)
        out = self.visual_projection(x[2])
-        projected = None
-        if self.multi_modal_projector is not None:
-            projected = self.multi_modal_projector(x[1])
-
-        return (x[0], x[1], out, projected)
+        return (x[0], x[1], out)
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -9,7 +9,6 @@ import comfy.model_patcher
 import comfy.model_management
 import comfy.utils
 import comfy.clip_model
-import comfy.image_encoders.dino2

 class Output:
    def __getitem__(self, key):
@@ -18,7 +17,6 @@ class Output:
        setattr(self, key, item)

 def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True):
-    image = image[:, :, :, :3] if image.shape[3] > 3 else image
    mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
    std = torch.tensor(std, device=image.device, dtype=image.dtype)
    image = image.movedim(-1, 1)
@@ -36,12 +34,6 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s
    image = torch.clip((255. * image), 0, 255).round() / 255.0
    return (image - mean.view([3,1,1])) / std.view([3,1,1])

-IMAGE_ENCODERS = {
-    "clip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
-    "siglip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
-    "dinov2": comfy.image_encoders.dino2.Dinov2Model,
-}
-
 class ClipVisionModel():
    def __init__(self, json_config):
        with open(json_config) as f:
@@ -50,11 +42,10 @@ class ClipVisionModel():
        self.image_size = config.get("image_size", 224)
        self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
        self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711])
-        model_class = IMAGE_ENCODERS.get(config.get("model_type", "clip_vision_model"))
        self.load_device = comfy.model_management.text_encoder_device()
        offload_device = comfy.model_management.text_encoder_offload_device()
        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
-        self.model = model_class(config, self.dtype, offload_device, comfy.ops.manual_cast)
+        self.model = comfy.clip_model.CLIPVisionModelProjection(config, self.dtype, offload_device, comfy.ops.manual_cast)
        self.model.eval()

        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
@@ -74,7 +65,6 @@ class ClipVisionModel():
        outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
        outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
        outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
-        outputs["mm_projected"] = out[3]
        return outputs

 def convert_to_transformers(sd, prefix):
@@ -111,21 +101,12 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
    elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
        json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
-        embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
-            if embed_shape == 729:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
-            elif embed_shape == 1024:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json")
-        elif embed_shape == 577:
-            if "multi_modal_projector.linear_1.bias" in sd:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
-            else:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
+        elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
        else:
            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
-    elif "embeddings.patch_embeddings.projection.weight" in sd:
-        json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_giant.json")
    else:
        return None

--- a/comfy/clip_vision_config_vitl_336_llava.json
+++ b/comfy/clip_vision_config_vitl_336_llava.json
@@ -1,19 +0,0 @@
-{
-  "attention_dropout": 0.0,
-  "dropout": 0.0,
-  "hidden_act": "quick_gelu",
-  "hidden_size": 1024,
-  "image_size": 336,
-  "initializer_factor": 1.0,
-  "initializer_range": 0.02,
-  "intermediate_size": 4096,
-  "layer_norm_eps": 1e-5,
-  "model_type": "clip_vision_model",
-  "num_attention_heads": 16,
-  "num_channels": 3,
-  "num_hidden_layers": 24,
-  "patch_size": 14,
-  "projection_dim": 768,
-  "projector_type": "llava3",
-  "torch_dtype": "float32"
-}
--- a/comfy/clip_vision_siglip_512.json
+++ b/comfy/clip_vision_siglip_512.json
@@ -1,13 +0,0 @@
-{
-  "num_channels": 3,
-  "hidden_act": "gelu_pytorch_tanh",
-  "hidden_size": 1152,
-  "image_size": 512,
-  "intermediate_size": 4304,
-  "model_type": "siglip_vision_model",
-  "num_attention_heads": 16,
-  "num_hidden_layers": 27,
-  "patch_size": 16,
-  "image_mean": [0.5, 0.5, 0.5],
-  "image_std": [0.5, 0.5, 0.5]
-}
--- a/comfy/comfy_types/init.py
+++ b/comfy/comfy_types/init.py
@@ -1,6 +1,6 @@
 import torch
 from typing import Callable, Protocol, TypedDict, Optional, List
-from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin, FileLocator
+from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin


 class UnetApplyFunction(Protocol):
@@ -42,5 +42,4 @@ __all__ = [
    InputTypeDict.__name__,
    ComfyNodeABC.__name__,
    CheckLazyMixin.__name__,
-    FileLocator.__name__,
 ]
--- a/comfy/comfy_types/node_typing.py
+++ b/comfy/comfy_types/node_typing.py
@@ -1,8 +1,7 @@
 """Comfy-specific type hinting"""

 from __future__ import annotations
-from typing import Literal, TypedDict, Optional
-from typing_extensions import NotRequired
+from typing import Literal, TypedDict
 from abc import ABC, abstractmethod
 from enum import Enum

@@ -27,7 +26,6 @@ class IO(StrEnum):
    BOOLEAN = "BOOLEAN"
    INT = "INT"
    FLOAT = "FLOAT"
-    COMBO = "COMBO"
    CONDITIONING = "CONDITIONING"
    SAMPLER = "SAMPLER"
    SIGMAS = "SIGMAS"
@@ -37,8 +35,6 @@ class IO(StrEnum):
    CONTROL_NET = "CONTROL_NET"
    VAE = "VAE"
    MODEL = "MODEL"
-    LORA_MODEL = "LORA_MODEL"
-    LOSS_MAP = "LOSS_MAP"
    CLIP_VISION = "CLIP_VISION"
    CLIP_VISION_OUTPUT = "CLIP_VISION_OUTPUT"
    STYLE_MODEL = "STYLE_MODEL"
@@ -50,7 +46,6 @@ class IO(StrEnum):
    FACE_ANALYSIS = "FACE_ANALYSIS"
    BBOX = "BBOX"
    SEGS = "SEGS"
-    VIDEO = "VIDEO"

    ANY = "*"
    """Always matches any type, but at a price.
@@ -72,148 +67,90 @@ class IO(StrEnum):
        return not (b.issubset(a) or a.issubset(b))


-class RemoteInputOptions(TypedDict):
-    route: str
-    """The route to the remote source."""
-    refresh_button: bool
-    """Specifies whether to show a refresh button in the UI below the widget."""
-    control_after_refresh: Literal["first", "last"]
-    """Specifies the control after the refresh button is clicked. If "first", the first item will be automatically selected, and so on."""
-    timeout: int
-    """The maximum amount of time to wait for a response from the remote source in milliseconds."""
-    max_retries: int
-    """The maximum number of retries before aborting the request."""
-    refresh: int
-    """The TTL of the remote input's value in milliseconds. Specifies the interval at which the remote input's value is refreshed."""
-
-
-class MultiSelectOptions(TypedDict):
-    placeholder: NotRequired[str]
-    """The placeholder text to display in the multi-select widget when no items are selected."""
-    chip: NotRequired[bool]
-    """Specifies whether to use chips instead of comma separated values for the multi-select widget."""
-
-
 class InputTypeOptions(TypedDict):
    """Provides type hinting for the return type of the INPUT_TYPES node function.

    Due to IDE limitations with unions, for now all options are available for all types (e.g. `label_on` is hinted even when the type is not `IO.BOOLEAN`).

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/datatypes
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_datatypes
    """

-    default: NotRequired[bool | str | float | int | list | tuple]
+    default: bool | str | float | int | list | tuple
    """The default value of the widget"""
-    defaultInput: NotRequired[bool]
-    """@deprecated in v1.16 frontend. v1.16 frontend allows input socket and widget to co-exist.
-    - defaultInput on required inputs should be dropped.
-    - defaultInput on optional inputs should be replaced with forceInput.
-    Ref: https://github.com/Comfy-Org/ComfyUI_frontend/pull/3364
-    """
-    forceInput: NotRequired[bool]
-    """Forces the input to be an input slot rather than a widget even a widget is available for the input type."""
-    lazy: NotRequired[bool]
+    defaultInput: bool
+    """Defaults to an input slot rather than a widget"""
+    forceInput: bool
+    """`defaultInput` and also don't allow converting to a widget"""
+    lazy: bool
    """Declares that this input uses lazy evaluation"""
-    rawLink: NotRequired[bool]
+    rawLink: bool
    """When a link exists, rather than receiving the evaluated value, you will receive the link (i.e. `["nodeId", <outputIndex>]`). Designed for node expansion."""
-    tooltip: NotRequired[str]
+    tooltip: str
    """Tooltip for the input (or widget), shown on pointer hover"""
-    socketless: NotRequired[bool]
-    """All inputs (including widgets) have an input socket to connect links. When ``true``, if there is a widget for this input, no socket will be created.
-    Available from frontend v1.17.5
-    Ref: https://github.com/Comfy-Org/ComfyUI_frontend/pull/3548
-    """
-    widgetType: NotRequired[str]
-    """Specifies a type to be used for widget initialization if different from the input type.
-    Available from frontend v1.18.0
-    https://github.com/Comfy-Org/ComfyUI_frontend/pull/3550"""
    # class InputTypeNumber(InputTypeOptions):
    # default: float | int
-    min: NotRequired[float]
+    min: float
    """The minimum value of a number (``FLOAT`` | ``INT``)"""
-    max: NotRequired[float]
+    max: float
    """The maximum value of a number (``FLOAT`` | ``INT``)"""
-    step: NotRequired[float]
+    step: float
    """The amount to increment or decrement a widget by when stepping up/down (``FLOAT`` | ``INT``)"""
-    round: NotRequired[float]
+    round: float
    """Floats are rounded by this value (``FLOAT``)"""
    # class InputTypeBoolean(InputTypeOptions):
    # default: bool
-    label_on: NotRequired[str]
+    label_on: str
    """The label to use in the UI when the bool is True (``BOOLEAN``)"""
-    label_off: NotRequired[str]
+    label_off: str
    """The label to use in the UI when the bool is False (``BOOLEAN``)"""
    # class InputTypeString(InputTypeOptions):
    # default: str
-    multiline: NotRequired[bool]
+    multiline: bool
    """Use a multiline text box (``STRING``)"""
-    placeholder: NotRequired[str]
+    placeholder: str
    """Placeholder text to display in the UI when empty (``STRING``)"""
    # Deprecated:
    # defaultVal: str
-    dynamicPrompts: NotRequired[bool]
+    dynamicPrompts: bool
    """Causes the front-end to evaluate dynamic prompts (``STRING``)"""
-    # class InputTypeCombo(InputTypeOptions):
-    image_upload: NotRequired[bool]
-    """Specifies whether the input should have an image upload button and image preview attached to it. Requires that the input's name is `image`."""
-    image_folder: NotRequired[Literal["input", "output", "temp"]]
-    """Specifies which folder to get preview images from if the input has the ``image_upload`` flag.
-    """
-    remote: NotRequired[RemoteInputOptions]
-    """Specifies the configuration for a remote input.
-    Available after ComfyUI frontend v1.9.7
-    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2422"""
-    control_after_generate: NotRequired[bool]
-    """Specifies whether a control widget should be added to the input, adding options to automatically change the value after each prompt is queued. Currently only used for INT and COMBO types."""
-    options: NotRequired[list[str | int | float]]
-    """COMBO type only. Specifies the selectable options for the combo widget.
-    Prefer:
-    ["COMBO", {"options": ["Option 1", "Option 2", "Option 3"]}]
-    Over:
-    [["Option 1", "Option 2", "Option 3"]]
-    """
-    multi_select: NotRequired[MultiSelectOptions]
-    """COMBO type only. Specifies the configuration for a multi-select widget.
-    Available after ComfyUI frontend v1.13.4
-    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2987"""


 class HiddenInputTypeDict(TypedDict):
    """Provides type hinting for the hidden entry of node INPUT_TYPES."""

-    node_id: NotRequired[Literal["UNIQUE_ID"]]
+    node_id: Literal["UNIQUE_ID"]
    """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
-    unique_id: NotRequired[Literal["UNIQUE_ID"]]
+    unique_id: Literal["UNIQUE_ID"]
    """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
-    prompt: NotRequired[Literal["PROMPT"]]
+    prompt: Literal["PROMPT"]
    """PROMPT is the complete prompt sent by the client to the server. See the prompt object for a full description."""
-    extra_pnginfo: NotRequired[Literal["EXTRA_PNGINFO"]]
+    extra_pnginfo: Literal["EXTRA_PNGINFO"]
    """EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node)."""
-    dynprompt: NotRequired[Literal["DYNPROMPT"]]
+    dynprompt: Literal["DYNPROMPT"]
    """DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion."""


 class InputTypeDict(TypedDict):
    """Provides type hinting for node INPUT_TYPES.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_more_on_inputs
    """

-    required: NotRequired[dict[str, tuple[IO, InputTypeOptions]]]
+    required: dict[str, tuple[IO, InputTypeOptions]]
    """Describes all inputs that must be connected for the node to execute."""
-    optional: NotRequired[dict[str, tuple[IO, InputTypeOptions]]]
+    optional: dict[str, tuple[IO, InputTypeOptions]]
    """Describes inputs which do not need to be connected."""
-    hidden: NotRequired[HiddenInputTypeDict]
+    hidden: HiddenInputTypeDict
    """Offers advanced functionality and server-client communication.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_more_on_inputs#hidden-inputs
    """


 class ComfyNodeABC(ABC):
    """Abstract base class for Comfy nodes.  Includes the names and expected types of attributes.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview
    """

    DESCRIPTION: str
@@ -230,14 +167,12 @@ class ComfyNodeABC(ABC):
    CATEGORY: str
    """The category of the node, as per the "Add Node" menu.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#category
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#category
    """
    EXPERIMENTAL: bool
    """Flags a node as experimental, informing users that it may change or not work as expected."""
    DEPRECATED: bool
    """Flags a node as deprecated, indicating to users that they should find alternatives to this node."""
-    API_NODE: Optional[bool]
-    """Flags a node as an API node. See: https://docs.comfy.org/tutorials/api-nodes/overview."""

    @classmethod
    @abstractmethod
@@ -246,9 +181,9 @@ class ComfyNodeABC(ABC):

        * Must include the ``required`` key, which describes all inputs that must be connected for the node to execute.
        * The ``optional`` key can be added to describe inputs which do not need to be connected.
-        * The ``hidden`` key offers some advanced functionality.  More info at: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs
+        * The ``hidden`` key offers some advanced functionality.  More info at: https://docs.comfy.org/essentials/custom_node_more_on_inputs#hidden-inputs

-        Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#input-types
+        Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#input-types
        """
        return {"required": {}}

@@ -263,7 +198,7 @@ class ComfyNodeABC(ABC):

    By default, a node is not considered an output. Set ``OUTPUT_NODE = True`` to specify that it is.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#output-node
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#output-node
    """
    INPUT_IS_LIST: bool
    """A flag indicating if this node implements the additional code necessary to deal with OUTPUT_IS_LIST nodes.
@@ -274,9 +209,9 @@ class ComfyNodeABC(ABC):

    A node can also override the default input behaviour and receive the whole list in a single call. This is done by setting a class attribute `INPUT_IS_LIST` to ``True``.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_lists#list-processing
    """
-    OUTPUT_IS_LIST: tuple[bool, ...]
+    OUTPUT_IS_LIST: tuple[bool]
    """A tuple indicating which node outputs are lists, but will be connected to nodes that expect individual items.

    Connected nodes that do not implement `INPUT_IS_LIST` will be executed once for every item in the list.
@@ -292,29 +227,29 @@ class ComfyNodeABC(ABC):
    the node should provide a class attribute `OUTPUT_IS_LIST`, which is a ``tuple[bool]``, of the same length as `RETURN_TYPES`,
    specifying which outputs which should be so treated.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_lists#list-processing
    """

-    RETURN_TYPES: tuple[IO, ...]
+    RETURN_TYPES: tuple[IO]
    """A tuple representing the outputs of this node.

    Usage::

        RETURN_TYPES = (IO.INT, "INT", "CUSTOM_TYPE")

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-types
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#return-types
    """
-    RETURN_NAMES: tuple[str, ...]
+    RETURN_NAMES: tuple[str]
    """The output slot names for each item in `RETURN_TYPES`, e.g. ``RETURN_NAMES = ("count", "filter_string")``

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-names
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#return-names
    """
-    OUTPUT_TOOLTIPS: tuple[str, ...]
+    OUTPUT_TOOLTIPS: tuple[str]
    """A tuple of strings to use as tooltips for node outputs, one for each item in `RETURN_TYPES`."""
    FUNCTION: str
    """The name of the function to execute as a literal string, e.g. `FUNCTION = "execute"`

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#function
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#function
    """


@@ -332,19 +267,8 @@ class CheckLazyMixin:
        Params should match the nodes execution ``FUNCTION`` (self, and all inputs by name).
        Will be executed repeatedly until it returns an empty list, or all requested items were already evaluated (and sent as params).

-        Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lazy_evaluation#defining-check-lazy-status
+        Comfy Docs: https://docs.comfy.org/essentials/custom_node_lazy_evaluation#defining-check-lazy-status
        """

        need = [name for name in kwargs if kwargs[name] is None]
        return need
-
-
-class FileLocator(TypedDict):
-    """Provides type hinting for the file location"""
-
-    filename: str
-    """The filename of the file."""
-    subfolder: str
-    """The subfolder of the file."""
-    type: Literal["input", "output", "temp"]
-    """The root folder of the file."""
--- a/comfy/conds.py
+++ b/comfy/conds.py
@@ -1,9 +1,11 @@
 import torch
 import math
 import comfy.utils
-import logging


+def lcm(a, b): #TODO: eventually replace by math.lcm (added in python3.9)
+    return abs(a*b) // math.gcd(a, b)
+
 class CONDRegular:
    def __init__(self, cond):
        self.cond = cond
@@ -11,15 +13,12 @@ class CONDRegular:
    def _copy_with(self, cond):
        return self.__class__(cond)

-    def process_cond(self, batch_size, **kwargs):
-        return self._copy_with(comfy.utils.repeat_to_batch_size(self.cond, batch_size))
+    def process_cond(self, batch_size, device, **kwargs):
+        return self._copy_with(comfy.utils.repeat_to_batch_size(self.cond, batch_size).to(device))

    def can_concat(self, other):
        if self.cond.shape != other.cond.shape:
            return False
-        if self.cond.device != other.cond.device:
-            logging.warning("WARNING: conds not on same device, skipping concat.")
-            return False
        return True

    def concat(self, others):
@@ -28,19 +27,15 @@ class CONDRegular:
            conds.append(x.cond)
        return torch.cat(conds)

-    def size(self):
-        return list(self.cond.size())
-
-
 class CONDNoiseShape(CONDRegular):
-    def process_cond(self, batch_size, area, **kwargs):
+    def process_cond(self, batch_size, device, area, **kwargs):
        data = self.cond
        if area is not None:
            dims = len(area) // 2
            for i in range(dims):
                data = data.narrow(i + 2, area[i + dims], area[i])

-        return self._copy_with(comfy.utils.repeat_to_batch_size(data, batch_size))
+        return self._copy_with(comfy.utils.repeat_to_batch_size(data, batch_size).to(device))


 class CONDCrossAttn(CONDRegular):
@@ -51,13 +46,10 @@ class CONDCrossAttn(CONDRegular):
            if s1[0] != s2[0] or s1[2] != s2[2]: #these 2 cases should not happen
                return False

-            mult_min = math.lcm(s1[1], s2[1])
+            mult_min = lcm(s1[1], s2[1])
            diff = mult_min // min(s1[1], s2[1])
            if diff > 4: #arbitrary limit on the padding because it's probably going to impact performance negatively if it's too much
                return False
-        if self.cond.device != other.cond.device:
-            logging.warning("WARNING: conds not on same device: skipping concat.")
-            return False
        return True

    def concat(self, others):
@@ -65,7 +57,7 @@ class CONDCrossAttn(CONDRegular):
        crossattn_max_len = self.cond.shape[1]
        for x in others:
            c = x.cond
-            crossattn_max_len = math.lcm(crossattn_max_len, c.shape[1])
+            crossattn_max_len = lcm(crossattn_max_len, c.shape[1])
            conds.append(c)

        out = []
@@ -75,12 +67,11 @@ class CONDCrossAttn(CONDRegular):
            out.append(c)
        return torch.cat(out)

-
 class CONDConstant(CONDRegular):
    def __init__(self, cond):
        self.cond = cond

-    def process_cond(self, batch_size, **kwargs):
+    def process_cond(self, batch_size, device, **kwargs):
        return self._copy_with(self.cond)

    def can_concat(self, other):
@@ -90,48 +81,3 @@ class CONDConstant(CONDRegular):

    def concat(self, others):
        return self.cond
-
-    def size(self):
-        return [1]
-
-
-class CONDList(CONDRegular):
-    def __init__(self, cond):
-        self.cond = cond
-
-    def process_cond(self, batch_size, **kwargs):
-        out = []
-        for c in self.cond:
-            out.append(comfy.utils.repeat_to_batch_size(c, batch_size))
-
-        return self._copy_with(out)
-
-    def can_concat(self, other):
-        if len(self.cond) != len(other.cond):
-            return False
-        for i in range(len(self.cond)):
-            if self.cond[i].shape != other.cond[i].shape:
-                return False
-
-        return True
-
-    def concat(self, others):
-        out = []
-        for i in range(len(self.cond)):
-            o = [self.cond[i]]
-            for x in others:
-                o.append(x.cond[i])
-            out.append(torch.cat(o))
-
-        return out
-
-    def size(self):  # hackish implementation to make the mem estimation work
-        o = 0
-        c = 1
-        for c in self.cond:
-            size = c.size()
-            o += math.prod(size)
-            if len(size) > 1:
-                c = size[1]
-
-        return [1, c, o // c]
--- a/comfy/context_windows.py
+++ b/comfy/context_windows.py
@@ -1,540 +0,0 @@
-from __future__ import annotations
-from typing import TYPE_CHECKING, Callable
-import torch
-import numpy as np
-import collections
-from dataclasses import dataclass
-from abc import ABC, abstractmethod
-import logging
-import comfy.model_management
-import comfy.patcher_extension
-if TYPE_CHECKING:
-    from comfy.model_base import BaseModel
-    from comfy.model_patcher import ModelPatcher
-    from comfy.controlnet import ControlBase
-
-
-class ContextWindowABC(ABC):
-    def __init__(self):
-        ...
-
-    @abstractmethod
-    def get_tensor(self, full: torch.Tensor) -> torch.Tensor:
-        """
-        Get torch.Tensor applicable to current window.
-        """
-        raise NotImplementedError("Not implemented.")
-
-    @abstractmethod
-    def add_window(self, full: torch.Tensor, to_add: torch.Tensor) -> torch.Tensor:
-        """
-        Apply torch.Tensor of window to the full tensor, in place. Returns reference to updated full tensor, not a copy.
-        """
-        raise NotImplementedError("Not implemented.")
-
-class ContextHandlerABC(ABC):
-    def __init__(self):
-        ...
-
-    @abstractmethod
-    def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool:
-        raise NotImplementedError("Not implemented.")
-
-    @abstractmethod
-    def get_resized_cond(self, cond_in: list[dict], x_in: torch.Tensor, window: ContextWindowABC, device=None) -> list:
-        raise NotImplementedError("Not implemented.")
-
-    @abstractmethod
-    def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
-        raise NotImplementedError("Not implemented.")
-
-
-
-class IndexListContextWindow(ContextWindowABC):
-    def __init__(self, index_list: list[int], dim: int=0):
-        self.index_list = index_list
-        self.context_length = len(index_list)
-        self.dim = dim
-
-    def get_tensor(self, full: torch.Tensor, device=None, dim=None) -> torch.Tensor:
-        if dim is None:
-            dim = self.dim
-        if dim == 0 and full.shape[dim] == 1:
-            return full
-        idx = [slice(None)] * dim + [self.index_list]
-        return full[idx].to(device)
-
-    def add_window(self, full: torch.Tensor, to_add: torch.Tensor, dim=None) -> torch.Tensor:
-        if dim is None:
-            dim = self.dim
-        idx = [slice(None)] * dim + [self.index_list]
-        full[idx] += to_add
-        return full
-
-
-class IndexListCallbacks:
-    EVALUATE_CONTEXT_WINDOWS = "evaluate_context_windows"
-    COMBINE_CONTEXT_WINDOW_RESULTS = "combine_context_window_results"
-    EXECUTE_START = "execute_start"
-    EXECUTE_CLEANUP = "execute_cleanup"
-
-    def init_callbacks(self):
-        return {}
-
-
-@dataclass
-class ContextSchedule:
-    name: str
-    func: Callable
-
-@dataclass
-class ContextFuseMethod:
-    name: str
-    func: Callable
-
-ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window'])
-class IndexListContextHandler(ContextHandlerABC):
-    def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1, closed_loop=False, dim=0):
-        self.context_schedule = context_schedule
-        self.fuse_method = fuse_method
-        self.context_length = context_length
-        self.context_overlap = context_overlap
-        self.context_stride = context_stride
-        self.closed_loop = closed_loop
-        self.dim = dim
-        self._step = 0
-
-        self.callbacks = {}
-
-    def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool:
-        # for now, assume first dim is batch - should have stored on BaseModel in actual implementation
-        if x_in.size(self.dim) > self.context_length:
-            logging.info(f"Using context windows {self.context_length} for {x_in.size(self.dim)} frames.")
-            return True
-        return False
-
-    def prepare_control_objects(self, control: ControlBase, device=None) -> ControlBase:
-        if control.previous_controlnet is not None:
-            self.prepare_control_objects(control.previous_controlnet, device)
-        return control
-
-    def get_resized_cond(self, cond_in: list[dict], x_in: torch.Tensor, window: IndexListContextWindow, device=None) -> list:
-        if cond_in is None:
-            return None
-        # reuse or resize cond items to match context requirements
-        resized_cond = []
-        # cond object is a list containing a dict - outer list is irrelevant, so just loop through it
-        for actual_cond in cond_in:
-            resized_actual_cond = actual_cond.copy()
-            # now we are in the inner dict - "pooled_output" is a tensor, "control" is a ControlBase object, "model_conds" is dictionary
-            for key in actual_cond:
-                try:
-                    cond_item = actual_cond[key]
-                    if isinstance(cond_item, torch.Tensor):
-                        # check that tensor is the expected length - x.size(0)
-                        if self.dim < cond_item.ndim and cond_item.size(self.dim) == x_in.size(self.dim):
-                            # if so, it's subsetting time - tell controls the expected indeces so they can handle them
-                            actual_cond_item = window.get_tensor(cond_item)
-                            resized_actual_cond[key] = actual_cond_item.to(device)
-                        else:
-                            resized_actual_cond[key] = cond_item.to(device)
-                    # look for control
-                    elif key == "control":
-                        resized_actual_cond[key] = self.prepare_control_objects(cond_item, device)
-                    elif isinstance(cond_item, dict):
-                        new_cond_item = cond_item.copy()
-                        # when in dictionary, look for tensors and CONDCrossAttn [comfy/conds.py] (has cond attr that is a tensor)
-                        for cond_key, cond_value in new_cond_item.items():
-                            if isinstance(cond_value, torch.Tensor):
-                                if cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim):
-                                    new_cond_item[cond_key] = window.get_tensor(cond_value, device)
-                            # if has cond that is a Tensor, check if needs to be subset
-                            elif hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
-                                if cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim):
-                                    new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device))
-                            elif cond_key == "num_video_frames": # for SVD
-                                new_cond_item[cond_key] = cond_value._copy_with(cond_value.cond)
-                                new_cond_item[cond_key].cond = window.context_length
-                        resized_actual_cond[key] = new_cond_item
-                    else:
-                        resized_actual_cond[key] = cond_item
-                finally:
-                    del cond_item  # just in case to prevent VRAM issues
-            resized_cond.append(resized_actual_cond)
-        return resized_cond
-
-    def set_step(self, timestep: torch.Tensor, model_options: dict[str]):
-        mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep, rtol=0.0001)
-        matches = torch.nonzero(mask)
-        if torch.numel(matches) == 0:
-            raise Exception("No sample_sigmas matched current timestep; something went wrong.")
-        self._step = int(matches[0].item())
-
-    def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]:
-        full_length = x_in.size(self.dim) # TODO: choose dim based on model
-        context_windows = self.context_schedule.func(full_length, self, model_options)
-        context_windows = [IndexListContextWindow(window, dim=self.dim) for window in context_windows]
-        return context_windows
-
-    def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
-        self.set_step(timestep, model_options)
-        context_windows = self.get_context_windows(model, x_in, model_options)
-        enumerated_context_windows = list(enumerate(context_windows))
-
-        conds_final = [torch.zeros_like(x_in) for _ in conds]
-        if self.fuse_method.name == ContextFuseMethods.RELATIVE:
-            counts_final = [torch.ones(get_shape_for_dim(x_in, self.dim), device=x_in.device) for _ in conds]
-        else:
-            counts_final = [torch.zeros(get_shape_for_dim(x_in, self.dim), device=x_in.device) for _ in conds]
-        biases_final = [([0.0] * x_in.shape[self.dim]) for _ in conds]
-
-        for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EXECUTE_START, self.callbacks):
-            callback(self, model, x_in, conds, timestep, model_options)
-
-        for enum_window in enumerated_context_windows:
-            results = self.evaluate_context_windows(calc_cond_batch, model, x_in, conds, timestep, [enum_window], model_options)
-            for result in results:
-                self.combine_context_window_results(x_in, result.sub_conds_out, result.sub_conds, result.window, result.window_idx, len(enumerated_context_windows), timestep,
-                                            conds_final, counts_final, biases_final)
-        try:
-            # finalize conds
-            if self.fuse_method.name == ContextFuseMethods.RELATIVE:
-                # relative is already normalized, so return as is
-                del counts_final
-                return conds_final
-            else:
-                # normalize conds via division by context usage counts
-                for i in range(len(conds_final)):
-                    conds_final[i] /= counts_final[i]
-                del counts_final
-                return conds_final
-        finally:
-            for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EXECUTE_CLEANUP, self.callbacks):
-                callback(self, model, x_in, conds, timestep, model_options)
-
-    def evaluate_context_windows(self, calc_cond_batch: Callable, model: BaseModel, x_in: torch.Tensor, conds, timestep: torch.Tensor, enumerated_context_windows: list[tuple[int, IndexListContextWindow]],
-                                model_options, device=None, first_device=None):
-        results: list[ContextResults] = []
-        for window_idx, window in enumerated_context_windows:
-            # allow processing to end between context window executions for faster Cancel
-            comfy.model_management.throw_exception_if_processing_interrupted()
-
-            for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EVALUATE_CONTEXT_WINDOWS, self.callbacks):
-                callback(self, model, x_in, conds, timestep, model_options, window_idx, window, model_options, device, first_device)
-
-            # update exposed params
-            model_options["transformer_options"]["context_window"] = window
-            # get subsections of x, timestep, conds
-            sub_x = window.get_tensor(x_in, device)
-            sub_timestep = window.get_tensor(timestep, device, dim=0)
-            sub_conds = [self.get_resized_cond(cond, x_in, window, device) for cond in conds]
-
-            sub_conds_out = calc_cond_batch(model, sub_conds, sub_x, sub_timestep, model_options)
-            if device is not None:
-                for i in range(len(sub_conds_out)):
-                    sub_conds_out[i] = sub_conds_out[i].to(x_in.device)
-            results.append(ContextResults(window_idx, sub_conds_out, sub_conds, window))
-        return results
-
-
-    def combine_context_window_results(self, x_in: torch.Tensor, sub_conds_out, sub_conds, window: IndexListContextWindow, window_idx: int, total_windows: int, timestep: torch.Tensor,
-                                    conds_final: list[torch.Tensor], counts_final: list[torch.Tensor], biases_final: list[torch.Tensor]):
-        if self.fuse_method.name == ContextFuseMethods.RELATIVE:
-            for pos, idx in enumerate(window.index_list):
-                # bias is the influence of a specific index in relation to the whole context window
-                bias = 1 - abs(idx - (window.index_list[0] + window.index_list[-1]) / 2) / ((window.index_list[-1] - window.index_list[0] + 1e-2) / 2)
-                bias = max(1e-2, bias)
-                # take weighted average relative to total bias of current idx
-                for i in range(len(sub_conds_out)):
-                    bias_total = biases_final[i][idx]
-                    prev_weight = (bias_total / (bias_total + bias))
-                    new_weight = (bias / (bias_total + bias))
-                    # account for dims of tensors
-                    idx_window = [slice(None)] * self.dim + [idx]
-                    pos_window = [slice(None)] * self.dim + [pos]
-                    # apply new values
-                    conds_final[i][idx_window] = conds_final[i][idx_window] * prev_weight + sub_conds_out[i][pos_window] * new_weight
-                    biases_final[i][idx] = bias_total + bias
-        else:
-            # add conds and counts based on weights of fuse method
-            weights = get_context_weights(window.context_length, x_in.shape[self.dim], window.index_list, self, sigma=timestep)
-            weights_tensor = match_weights_to_dim(weights, x_in, self.dim, device=x_in.device)
-            for i in range(len(sub_conds_out)):
-                window.add_window(conds_final[i], sub_conds_out[i] * weights_tensor)
-                window.add_window(counts_final[i], weights_tensor)
-
-        for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.COMBINE_CONTEXT_WINDOW_RESULTS, self.callbacks):
-            callback(self, x_in, sub_conds_out, sub_conds, window, window_idx, total_windows, timestep, conds_final, counts_final, biases_final)
-
-
-def _prepare_sampling_wrapper(executor, model, noise_shape: torch.Tensor, *args, **kwargs):
-    # limit noise_shape length to context_length for more accurate vram use estimation
-    model_options = kwargs.get("model_options", None)
-    if model_options is None:
-        raise Exception("model_options not found in prepare_sampling_wrapper; this should never happen, something went wrong.")
-    handler: IndexListContextHandler = model_options.get("context_handler", None)
-    if handler is not None:
-        noise_shape = list(noise_shape)
-        noise_shape[handler.dim] = min(noise_shape[handler.dim], handler.context_length)
-    return executor(model, noise_shape, *args, **kwargs)
-
-
-def create_prepare_sampling_wrapper(model: ModelPatcher):
-    model.add_wrapper_with_key(
-        comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING,
-        "ContextWindows_prepare_sampling",
-        _prepare_sampling_wrapper
-    )
-
-
-def match_weights_to_dim(weights: list[float], x_in: torch.Tensor, dim: int, device=None) -> torch.Tensor:
-    total_dims = len(x_in.shape)
-    weights_tensor = torch.Tensor(weights).to(device=device)
-    for _ in range(dim):
-        weights_tensor = weights_tensor.unsqueeze(0)
-    for _ in range(total_dims - dim - 1):
-        weights_tensor = weights_tensor.unsqueeze(-1)
-    return weights_tensor
-
-def get_shape_for_dim(x_in: torch.Tensor, dim: int) -> list[int]:
-    total_dims = len(x_in.shape)
-    shape = []
-    for _ in range(dim):
-        shape.append(1)
-    shape.append(x_in.shape[dim])
-    for _ in range(total_dims - dim - 1):
-        shape.append(1)
-    return shape
-
-class ContextSchedules:
-    UNIFORM_LOOPED = "looped_uniform"
-    UNIFORM_STANDARD = "standard_uniform"
-    STATIC_STANDARD = "standard_static"
-    BATCHED = "batched"
-
-
-# from https://github.com/neggles/animatediff-cli/blob/main/src/animatediff/pipelines/context.py
-def create_windows_uniform_looped(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]):
-    windows = []
-    if num_frames < handler.context_length:
-        windows.append(list(range(num_frames)))
-        return windows
-
-    context_stride = min(handler.context_stride, int(np.ceil(np.log2(num_frames / handler.context_length))) + 1)
-    # obtain uniform windows as normal, looping and all
-    for context_step in 1 << np.arange(context_stride):
-        pad = int(round(num_frames * ordered_halving(handler._step)))
-        for j in range(
-            int(ordered_halving(handler._step) * context_step) + pad,
-            num_frames + pad + (0 if handler.closed_loop else -handler.context_overlap),
-            (handler.context_length * context_step - handler.context_overlap),
-        ):
-            windows.append([e % num_frames for e in range(j, j + handler.context_length * context_step, context_step)])
-
-    return windows
-
-def create_windows_uniform_standard(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]):
-    # unlike looped, uniform_straight does NOT allow windows that loop back to the beginning;
-    # instead, they get shifted to the corresponding end of the frames.
-    # in the case that a window (shifted or not) is identical to the previous one, it gets skipped.
-    windows = []
-    if num_frames <= handler.context_length:
-        windows.append(list(range(num_frames)))
-        return windows
-
-    context_stride = min(handler.context_stride, int(np.ceil(np.log2(num_frames / handler.context_length))) + 1)
-    # first, obtain uniform windows as normal, looping and all
-    for context_step in 1 << np.arange(context_stride):
-        pad = int(round(num_frames * ordered_halving(handler._step)))
-        for j in range(
-            int(ordered_halving(handler._step) * context_step) + pad,
-            num_frames + pad + (-handler.context_overlap),
-            (handler.context_length * context_step - handler.context_overlap),
-        ):
-            windows.append([e % num_frames for e in range(j, j + handler.context_length * context_step, context_step)])
-
-    # now that windows are created, shift any windows that loop, and delete duplicate windows
-    delete_idxs = []
-    win_i = 0
-    while win_i < len(windows):
-        # if window is rolls over itself, need to shift it
-        is_roll, roll_idx = does_window_roll_over(windows[win_i], num_frames)
-        if is_roll:
-            roll_val = windows[win_i][roll_idx]  # roll_val might not be 0 for windows of higher strides
-            shift_window_to_end(windows[win_i], num_frames=num_frames)
-            # check if next window (cyclical) is missing roll_val
-            if roll_val not in windows[(win_i+1) % len(windows)]:
-                # need to insert new window here - just insert window starting at roll_val
-                windows.insert(win_i+1, list(range(roll_val, roll_val + handler.context_length)))
-        # delete window if it's not unique
-        for pre_i in range(0, win_i):
-            if windows[win_i] == windows[pre_i]:
-                delete_idxs.append(win_i)
-                break
-        win_i += 1
-
-    # reverse delete_idxs so that they will be deleted in an order that doesn't break idx correlation
-    delete_idxs.reverse()
-    for i in delete_idxs:
-        windows.pop(i)
-
-    return windows
-
-
-def create_windows_static_standard(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]):
-    windows = []
-    if num_frames <= handler.context_length:
-        windows.append(list(range(num_frames)))
-        return windows
-    # always return the same set of windows
-    delta = handler.context_length - handler.context_overlap
-    for start_idx in range(0, num_frames, delta):
-        # if past the end of frames, move start_idx back to allow same context_length
-        ending = start_idx + handler.context_length
-        if ending >= num_frames:
-            final_delta = ending - num_frames
-            final_start_idx = start_idx - final_delta
-            windows.append(list(range(final_start_idx, final_start_idx + handler.context_length)))
-            break
-        windows.append(list(range(start_idx, start_idx + handler.context_length)))
-    return windows
-
-
-def create_windows_batched(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]):
-    windows = []
-    if num_frames <= handler.context_length:
-        windows.append(list(range(num_frames)))
-        return windows
-    # always return the same set of windows;
-    # no overlap, just cut up based on context_length;
-    # last window size will be different if num_frames % opts.context_length != 0
-    for start_idx in range(0, num_frames, handler.context_length):
-        windows.append(list(range(start_idx, min(start_idx + handler.context_length, num_frames))))
-    return windows
-
-
-def create_windows_default(num_frames: int, handler: IndexListContextHandler):
-    return [list(range(num_frames))]
-
-
-CONTEXT_MAPPING = {
-    ContextSchedules.UNIFORM_LOOPED: create_windows_uniform_looped,
-    ContextSchedules.UNIFORM_STANDARD: create_windows_uniform_standard,
-    ContextSchedules.STATIC_STANDARD: create_windows_static_standard,
-    ContextSchedules.BATCHED: create_windows_batched,
-}
-
-
-def get_matching_context_schedule(context_schedule: str) -> ContextSchedule:
-    func = CONTEXT_MAPPING.get(context_schedule, None)
-    if func is None:
-        raise ValueError(f"Unknown context_schedule '{context_schedule}'.")
-    return ContextSchedule(context_schedule, func)
-
-
-def get_context_weights(length: int, full_length: int, idxs: list[int], handler: IndexListContextHandler, sigma: torch.Tensor=None):
-    return handler.fuse_method.func(length, sigma=sigma, handler=handler, full_length=full_length, idxs=idxs)
-
-
-def create_weights_flat(length: int, **kwargs) -> list[float]:
-    # weight is the same for all
-    return [1.0] * length
-
-def create_weights_pyramid(length: int, **kwargs) -> list[float]:
-    # weight is based on the distance away from the edge of the context window;
-    # based on weighted average concept in FreeNoise paper
-    if length % 2 == 0:
-        max_weight = length // 2
-        weight_sequence = list(range(1, max_weight + 1, 1)) + list(range(max_weight, 0, -1))
-    else:
-        max_weight = (length + 1) // 2
-        weight_sequence = list(range(1, max_weight, 1)) + [max_weight] + list(range(max_weight - 1, 0, -1))
-    return weight_sequence
-
-def create_weights_overlap_linear(length: int, full_length: int, idxs: list[int], handler: IndexListContextHandler, **kwargs):
-    # based on code in Kijai's WanVideoWrapper: https://github.com/kijai/ComfyUI-WanVideoWrapper/blob/dbb2523b37e4ccdf45127e5ae33e31362f755c8e/nodes.py#L1302
-    # only expected overlap is given different weights
-    weights_torch = torch.ones((length))
-    # blend left-side on all except first window
-    if min(idxs) > 0:
-        ramp_up = torch.linspace(1e-37, 1, handler.context_overlap)
-        weights_torch[:handler.context_overlap] = ramp_up
-    # blend right-side on all except last window
-    if max(idxs) < full_length-1:
-        ramp_down = torch.linspace(1, 1e-37, handler.context_overlap)
-        weights_torch[-handler.context_overlap:] = ramp_down
-    return weights_torch
-
-class ContextFuseMethods:
-    FLAT = "flat"
-    PYRAMID = "pyramid"
-    RELATIVE = "relative"
-    OVERLAP_LINEAR = "overlap-linear"
-
-    LIST = [PYRAMID, FLAT, OVERLAP_LINEAR]
-    LIST_STATIC = [PYRAMID, RELATIVE, FLAT, OVERLAP_LINEAR]
-
-
-FUSE_MAPPING = {
-    ContextFuseMethods.FLAT: create_weights_flat,
-    ContextFuseMethods.PYRAMID: create_weights_pyramid,
-    ContextFuseMethods.RELATIVE: create_weights_pyramid,
-    ContextFuseMethods.OVERLAP_LINEAR: create_weights_overlap_linear,
-}
-
-def get_matching_fuse_method(fuse_method: str) -> ContextFuseMethod:
-    func = FUSE_MAPPING.get(fuse_method, None)
-    if func is None:
-        raise ValueError(f"Unknown fuse_method '{fuse_method}'.")
-    return ContextFuseMethod(fuse_method, func)
-
-# Returns fraction that has denominator that is a power of 2
-def ordered_halving(val):
-    # get binary value, padded with 0s for 64 bits
-    bin_str = f"{val:064b}"
-    # flip binary value, padding included
-    bin_flip = bin_str[::-1]
-    # convert binary to int
-    as_int = int(bin_flip, 2)
-    # divide by 1 << 64, equivalent to 2**64, or 18446744073709551616,
-    # or b10000000000000000000000000000000000000000000000000000000000000000 (1 with 64 zero's)
-    return as_int / (1 << 64)
-
-
-def get_missing_indexes(windows: list[list[int]], num_frames: int) -> list[int]:
-    all_indexes = list(range(num_frames))
-    for w in windows:
-        for val in w:
-            try:
-                all_indexes.remove(val)
-            except ValueError:
-                pass
-    return all_indexes
-
-
-def does_window_roll_over(window: list[int], num_frames: int) -> tuple[bool, int]:
-    prev_val = -1
-    for i, val in enumerate(window):
-        val = val % num_frames
-        if val < prev_val:
-            return True, i
-        prev_val = val
-    return False, -1
-
-
-def shift_window_to_start(window: list[int], num_frames: int):
-    start_val = window[0]
-    for i in range(len(window)):
-        # 1) subtract each element by start_val to move vals relative to the start of all frames
-        # 2) add num_frames and take modulus to get adjusted vals
-        window[i] = ((window[i] - start_val) + num_frames) % num_frames
-
-
-def shift_window_to_end(window: list[int], num_frames: int):
-    # 1) shift window to start
-    shift_window_to_start(window, num_frames)
-    end_val = window[-1]
-    end_delta = num_frames - end_val - 1
-    for i in range(len(window)):
-        # 2) add end_delta to each val to slide windows to end
-        window[i] = window[i] + end_delta
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@@ -28,7 +28,6 @@ import comfy.model_detection
 import comfy.model_patcher
 import comfy.ops
 import comfy.latent_formats
-import comfy.model_base

 import comfy.cldm.cldm
 import comfy.t2i_adapter.adapter
@@ -36,7 +35,6 @@ import comfy.ldm.cascade.controlnet
 import comfy.cldm.mmdit
 import comfy.ldm.hydit.controlnet
 import comfy.ldm.flux.controlnet
-import comfy.ldm.qwen_image.controlnet
 import comfy.cldm.dit_embedder
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
@@ -45,6 +43,7 @@ if TYPE_CHECKING:

 def broadcast_image_to(tensor, target_batch_size, batched_number):
    current_batch_size = tensor.shape[0]
+    #print(current_batch_size, target_batch_size)
    if current_batch_size == 1:
        return tensor

@@ -237,11 +236,11 @@ class ControlNet(ControlBase):
            self.cond_hint = None
            compression_ratio = self.compression_ratio
            if self.vae is not None:
-                compression_ratio *= self.vae.spacial_compression_encode()
+                compression_ratio *= self.vae.downscale_ratio
            else:
                if self.latent_format is not None:
                    raise ValueError("This Controlnet needs a VAE but none was provided, please use a ControlNetApply node with a VAE input and connect it.")
-            self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[-1] * compression_ratio, x_noisy.shape[-2] * compression_ratio, self.upscale_algorithm, "center")
+            self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * compression_ratio, x_noisy.shape[2] * compression_ratio, self.upscale_algorithm, "center")
            self.cond_hint = self.preprocess_image(self.cond_hint)
            if self.vae is not None:
                loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
@@ -266,12 +265,12 @@ class ControlNet(ControlBase):
        for c in self.extra_conds:
            temp = cond.get(c, None)
            if temp is not None:
-                extra[c] = comfy.model_base.convert_tensor(temp, dtype, x_noisy.device)
+                extra[c] = temp.to(dtype)

        timestep = self.model_sampling_current.timestep(t)
        x_noisy = self.model_sampling_current.calculate_input(t, x_noisy)

-        control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.to(dtype), context=comfy.model_management.cast_to_device(context, x_noisy.device, dtype), **extra)
+        control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.to(dtype), context=context.to(dtype), **extra)
        return self.control_merge(control, control_prev, output_dtype=None)

    def copy(self):
@@ -391,9 +390,8 @@ class ControlLora(ControlNet):
                pass

        for k in self.control_weights:
-            if (k not in {"lora_controlnet"}):
-                if (k.endswith(".up") or k.endswith(".down") or k.endswith(".weight") or k.endswith(".bias")) and ("__" not in k):
-                    comfy.utils.set_attr_param(self.control_model, k, self.control_weights[k].to(dtype).to(comfy.model_management.get_torch_device()))
+            if k not in {"lora_controlnet"}:
+                comfy.utils.set_attr_param(self.control_model, k, self.control_weights[k].to(dtype).to(comfy.model_management.get_torch_device()))

    def copy(self):
        c = ControlLora(self.control_weights, global_average_pooling=self.global_average_pooling)
@@ -420,7 +418,10 @@ def controlnet_config(sd, model_options={}):
        weight_dtype = comfy.utils.weight_dtype(sd)

        supported_inference_dtypes = list(model_config.supported_inference_dtypes)
-        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes, weight_dtype=weight_dtype)
+        if weight_dtype is not None:
+            supported_inference_dtypes.append(weight_dtype)
+
+        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes)

    load_device = comfy.model_management.get_torch_device()
    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
@@ -583,15 +584,6 @@ def load_controlnet_flux_instantx(sd, model_options={}):
    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
    return control

-def load_controlnet_qwen_instantx(sd, model_options={}):
-    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options)
-    control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
-    control_model = controlnet_load_state_dict(control_model, sd)
-    latent_format = comfy.latent_formats.Wan21()
-    extra_conds = []
-    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
-    return control
-
 def convert_mistoline(sd):
    return comfy.utils.state_dict_prefix_replace(sd, {"single_controlnet_blocks.": "controlnet_single_blocks."})

@@ -665,11 +657,8 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
                return load_controlnet_sd35(controlnet_data, model_options=model_options) #Stability sd3.5 format
            else:
                return load_controlnet_mmdit(controlnet_data, model_options=model_options) #SD3 diffusers controlnet
-        elif "transformer_blocks.0.img_mlp.net.0.proj.weight" in controlnet_data:
-            return load_controlnet_qwen_instantx(controlnet_data, model_options=model_options)
        elif "controlnet_x_embedder.weight" in controlnet_data:
            return load_controlnet_flux_instantx(controlnet_data, model_options=model_options)
-
    elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux
        return load_controlnet_flux_xlabs_mistoline(convert_mistoline(controlnet_data), mistoline=True, model_options=model_options)

@@ -700,7 +689,10 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
        if supported_inference_dtypes is None:
            supported_inference_dtypes = [comfy.model_management.unet_dtype()]

-        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes, weight_dtype=weight_dtype)
+        if weight_dtype is not None:
+            supported_inference_dtypes.append(weight_dtype)
+
+        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes)

    load_device = comfy.model_management.get_torch_device()

@@ -750,7 +742,6 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
    return control

 def load_controlnet(ckpt_path, model=None, model_options={}):
-    model_options = model_options.copy()
    if "global_average_pooling" not in model_options:
        filename = os.path.splitext(ckpt_path)[0]
        if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling
--- a/comfy/diffusers_convert.py
+++ b/comfy/diffusers_convert.py
@@ -4,6 +4,105 @@ import logging

 # conversion code from https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py

+# =================#
+# UNet Conversion #
+# =================#
+
+unet_conversion_map = [
+    # (stable-diffusion key, HF Diffusers key): full parameter names, matched exactly
+    ("time_embed.0.weight", "time_embedding.linear_1.weight"),
+    ("time_embed.0.bias", "time_embedding.linear_1.bias"),
+    ("time_embed.2.weight", "time_embedding.linear_2.weight"),
+    ("time_embed.2.bias", "time_embedding.linear_2.bias"),
+    ("input_blocks.0.0.weight", "conv_in.weight"),
+    ("input_blocks.0.0.bias", "conv_in.bias"),
+    ("out.0.weight", "conv_norm_out.weight"),
+    ("out.0.bias", "conv_norm_out.bias"),
+    ("out.2.weight", "conv_out.weight"),
+    ("out.2.bias", "conv_out.bias"),
+]
+
+unet_conversion_map_resnet = [
+    # (stable-diffusion part, HF Diffusers part): substrings replaced in keys containing "resnets"
+    ("in_layers.0", "norm1"),
+    ("in_layers.2", "conv1"),
+    ("out_layers.0", "norm2"),
+    ("out_layers.3", "conv2"),
+    ("emb_layers.1", "time_emb_proj"),
+    ("skip_connection", "conv_shortcut"),
+]
+
+unet_conversion_map_layer = []
+# hardcoded layout: 4 down/up blocks, 2 resnets per down block, 3 per up block;
+# would need smarter logic for other networks.
+for i in range(4):
+    # loop over downblocks/upblocks
+
+    for j in range(2):
+        # loop over resnets/attentions for downblocks
+        hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
+        sd_down_res_prefix = f"input_blocks.{3 * i + j + 1}.0."
+        unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
+
+        if i < 3:
+            # no attention layers in down_blocks.3
+            hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
+            sd_down_atn_prefix = f"input_blocks.{3 * i + j + 1}.1."
+            unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
+
+    for j in range(3):
+        # loop over resnets/attentions for upblocks
+        hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
+        sd_up_res_prefix = f"output_blocks.{3 * i + j}.0."
+        unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
+
+        if i > 0:
+            # no attention layers in up_blocks.0
+            hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
+            sd_up_atn_prefix = f"output_blocks.{3 * i + j}.1."
+            unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
+
+    if i < 3:
+        # no downsample in down_blocks.3
+        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
+        sd_downsample_prefix = f"input_blocks.{3 * (i + 1)}.0.op."
+        unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
+
+        # no upsample in up_blocks.3
+        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+        sd_upsample_prefix = f"output_blocks.{3 * i + 2}.{1 if i == 0 else 2}."
+        unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
+
+hf_mid_atn_prefix = "mid_block.attentions.0."
+sd_mid_atn_prefix = "middle_block.1."
+unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
+
+for j in range(2):
+    hf_mid_res_prefix = f"mid_block.resnets.{j}."
+    sd_mid_res_prefix = f"middle_block.{2 * j}."  # resnets land on middle_block.0 and .2, around the attention at .1
+    unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
+
+
+def convert_unet_state_dict(unet_state_dict):
+    # Returns a new dict holding the same values, with each HF Diffusers key renamed to
+    # its stable-diffusion name. Brittle: the three passes below (exact names, resnet
+    # sub-parts, layer prefixes) must run in exactly the order they are written.
+    mapping = {k: k for k in unet_state_dict.keys()}
+    for sd_name, hf_name in unet_conversion_map:
+        mapping[hf_name] = sd_name
+    for k, v in mapping.items():  # only values are reassigned, so the dict size never changes mid-iteration
+        if "resnets" in k:
+            for sd_part, hf_part in unet_conversion_map_resnet:
+                v = v.replace(hf_part, sd_part)
+            mapping[k] = v
+    for k, v in mapping.items():
+        for sd_part, hf_part in unet_conversion_map_layer:
+            v = v.replace(hf_part, sd_part)
+        mapping[k] = v
+    new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}  # KeyError if a unet_conversion_map HF name is absent from the input
+    return new_state_dict
+
+
 # ================#
 # VAE Conversion #
 # ================#
@@ -114,7 +213,6 @@ textenc_pattern = re.compile("|".join(protected.keys()))
 # Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp
 code2idx = {"q": 0, "k": 1, "v": 2}

-
 # This function exists because at the time of writing torch.cat can't do fp8 with cuda
 def cat_tensors(tensors):
    x = 0
@@ -131,7 +229,6 @@ def cat_tensors(tensors):

    return out

-
 def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""):
    new_state_dict = {}
    capture_qkv_weight = {}
@@ -187,3 +284,5 @@ def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""):

 def convert_text_enc_state_dict(text_enc_dict):
    return text_enc_dict
+
+
--- a/comfy/extra_samplers/uni_pc.py
+++ b/comfy/extra_samplers/uni_pc.py
@@ -661,7 +661,7 @@ class UniPC:

            if x_t is None:
                if use_predictor:
-                    pred_res = torch.tensordot(D1s, rhos_p, dims=([1], [0]))  # torch.einsum('k,bkchw->bchw', rhos_p, D1s)
+                    pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
                else:
                    pred_res = 0
                x_t = x_t_ - expand_dims(alpha_t * B_h, dims) * pred_res
@@ -669,7 +669,7 @@ class UniPC:
            if use_corrector:
                model_t = self.model_fn(x_t, t)
                if D1s is not None:
-                    corr_res = torch.tensordot(D1s, rhos_c[:-1], dims=([1], [0]))  # torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
+                    corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
                else:
                    corr_res = 0
                D1_t = (model_t - model_prev_0)
--- a/comfy/gligen.py
+++ b/comfy/gligen.py
@@ -1,10 +1,55 @@
 import math
 import torch
 from torch import nn
-from .ldm.modules.attention import CrossAttention, FeedForward
+from .ldm.modules.attention import CrossAttention
+from inspect import isfunction
 import comfy.ops
 ops = comfy.ops.manual_cast

+def exists(val):  # True for anything except None, so falsy values such as 0 or "" still "exist"
+    return val is not None
+
+
+def uniq(arr):  # distinct elements of arr as a dict keys view, first-seen order preserved
+    return dict.fromkeys(arr).keys()
+
+
+def default(val, d):  # val unless it is None; otherwise d, calling d first when inspect.isfunction(d)
+    if val is None:
+        return d() if isfunction(d) else d
+    return val
+
+
+# feedforward
+class GEGLU(nn.Module):  # projects to 2 * dim_out features, then multiplies one half by GELU of the other
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self.proj = ops.Linear(dim_in, dim_out * 2)
+
+    def forward(self, x):
+        value, gate = torch.chunk(self.proj(x), 2, dim=-1)  # split the projection along the last dim
+        return value * nn.functional.gelu(gate)
+
+
+class FeedForward(nn.Module):
+    # MLP: input projection (Linear + GELU, or GEGLU when glu=True), dropout,
+    # then a Linear from the inner width int(dim * mult) to dim_out.
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
+        super().__init__()
+        hidden = int(dim * mult)
+        if glu:
+            in_proj = GEGLU(dim, hidden)
+        else:
+            in_proj = nn.Sequential(ops.Linear(dim, hidden), nn.GELU())
+        self.net = nn.Sequential(
+            in_proj,
+            nn.Dropout(dropout),
+            ops.Linear(hidden, default(dim_out, dim)),  # dim_out falls back to dim when None
+        )
+
+    def forward(self, x):
+        return self.net(x)
+

 class GatedCrossAttentionDense(nn.Module):
    def __init__(self, query_dim, context_dim, n_heads, d_head):
--- a/comfy/image_encoders/dino2.py
+++ b/comfy/image_encoders/dino2.py
@@ -1,141 +0,0 @@
-import torch
-from comfy.text_encoders.bert import BertAttention
-import comfy.model_management
-from comfy.ldm.modules.attention import optimized_attention_for_device
-
-
-class Dino2AttentionOutput(torch.nn.Module):
-    def __init__(self, input_dim, output_dim, layer_norm_eps, dtype, device, operations):
-        super().__init__()
-        self.dense = operations.Linear(input_dim, output_dim, dtype=dtype, device=device)
-
-    def forward(self, x):
-        return self.dense(x)
-
-
-class Dino2AttentionBlock(torch.nn.Module):
-    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
-        super().__init__()
-        self.attention = BertAttention(embed_dim, heads, dtype, device, operations)
-        self.output = Dino2AttentionOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
-
-    def forward(self, x, mask, optimized_attention):
-        return self.output(self.attention(x, mask, optimized_attention))
-
-
-class LayerScale(torch.nn.Module):
-    def __init__(self, dim, dtype, device, operations):
-        super().__init__()
-        self.lambda1 = torch.nn.Parameter(torch.empty(dim, device=device, dtype=dtype))
-
-    def forward(self, x):
-        return x * comfy.model_management.cast_to_device(self.lambda1, x.device, x.dtype)
-
-
-class SwiGLUFFN(torch.nn.Module):
-    def __init__(self, dim, dtype, device, operations):
-        super().__init__()
-        in_features = out_features = dim
-        hidden_features = int(dim * 4)
-        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
-
-        self.weights_in = operations.Linear(in_features, 2 * hidden_features, bias=True, device=device, dtype=dtype)
-        self.weights_out = operations.Linear(hidden_features, out_features, bias=True, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x = self.weights_in(x)
-        x1, x2 = x.chunk(2, dim=-1)
-        x = torch.nn.functional.silu(x1) * x2
-        return self.weights_out(x)
-
-
-class Dino2Block(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations):
-        super().__init__()
-        self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
-        self.layer_scale1 = LayerScale(dim, dtype, device, operations)
-        self.layer_scale2 = LayerScale(dim, dtype, device, operations)
-        self.mlp = SwiGLUFFN(dim, dtype, device, operations)
-        self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
-        self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
-
-    def forward(self, x, optimized_attention):
-        x = x + self.layer_scale1(self.attention(self.norm1(x), None, optimized_attention))
-        x = x + self.layer_scale2(self.mlp(self.norm2(x)))
-        return x
-
-
-class Dino2Encoder(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations):
-        super().__init__()
-        self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations) for _ in range(num_layers)])
-
-    def forward(self, x, intermediate_output=None):
-        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
-
-        if intermediate_output is not None:
-            if intermediate_output < 0:
-                intermediate_output = len(self.layer) + intermediate_output
-
-        intermediate = None
-        for i, l in enumerate(self.layer):
-            x = l(x, optimized_attention)
-            if i == intermediate_output:
-                intermediate = x.clone()
-        return x, intermediate
-
-
-class Dino2PatchEmbeddings(torch.nn.Module):
-    def __init__(self, dim, num_channels=3, patch_size=14, image_size=518, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.projection = operations.Conv2d(
-            in_channels=num_channels,
-            out_channels=dim,
-            kernel_size=patch_size,
-            stride=patch_size,
-            bias=True,
-            dtype=dtype,
-            device=device
-        )
-
-    def forward(self, pixel_values):
-        return self.projection(pixel_values).flatten(2).transpose(1, 2)
-
-
-class Dino2Embeddings(torch.nn.Module):
-    def __init__(self, dim, dtype, device, operations):
-        super().__init__()
-        patch_size = 14
-        image_size = 518
-
-        self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
-        self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
-        self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device))
-        self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
-
-    def forward(self, pixel_values):
-        x = self.patch_embeddings(pixel_values)
-        # TODO: mask_token?
-        x = torch.cat((self.cls_token.to(device=x.device, dtype=x.dtype).expand(x.shape[0], -1, -1), x), dim=1)
-        x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
-        return x
-
-
-class Dinov2Model(torch.nn.Module):
-    def __init__(self, config_dict, dtype, device, operations):
-        super().__init__()
-        num_layers = config_dict["num_hidden_layers"]
-        dim = config_dict["hidden_size"]
-        heads = config_dict["num_attention_heads"]
-        layer_norm_eps = config_dict["layer_norm_eps"]
-
-        self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
-        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations)
-        self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
-
-    def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
-        x = self.embeddings(pixel_values)
-        x, i = self.encoder(x, intermediate_output=intermediate_output)
-        x = self.layernorm(x)
-        pooled_output = x[:, 0, :]
-        return x, i, pooled_output, None
--- a/comfy/image_encoders/dino2_giant.json
+++ b/comfy/image_encoders/dino2_giant.json
@@ -1,21 +0,0 @@
-{
-  "attention_probs_dropout_prob": 0.0,
-  "drop_path_rate": 0.0,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.0,
-  "hidden_size": 1536,
-  "image_size": 518,
-  "initializer_range": 0.02,
-  "layer_norm_eps": 1e-06,
-  "layerscale_value": 1.0,
-  "mlp_ratio": 4,
-  "model_type": "dinov2",
-  "num_attention_heads": 24,
-  "num_channels": 3,
-  "num_hidden_layers": 40,
-  "patch_size": 14,
-  "qkv_bias": true,
-  "use_swiglu_ffn": true,
-  "image_mean": [0.485, 0.456, 0.406],
-  "image_std": [0.229, 0.224, 0.225]
-}
--- a/comfy/k_diffusion/sa_solver.py
+++ b/comfy/k_diffusion/sa_solver.py
@@ -1,121 +0,0 @@
-# SA-Solver: Stochastic Adams Solver (NeurIPS 2023, arXiv:2309.05019)
-# Conference: https://proceedings.neurips.cc/paper_files/paper/2023/file/f4a6806490d31216a3ba667eb240c897-Paper-Conference.pdf
-# Codebase ref: https://github.com/scxue/SA-Solver
-
-import math
-from typing import Union, Callable
-import torch
-
-
-def compute_exponential_coeffs(s: torch.Tensor, t: torch.Tensor, solver_order: int, tau_t: float) -> torch.Tensor:
-    """Compute (1 + tau^2) * integral of exp((1 + tau^2) * x) * x^p dx from s to t with exp((1 + tau^2) * t) factored out, using integration by parts.
-
-    Integral of exp((1 + tau^2) * x) * x^p dx
-        = product_terms[p] - (p / (1 + tau^2)) * integral of exp((1 + tau^2) * x) * x^(p-1) dx,
-    with base case p=0 where integral equals product_terms[0].
-
-    where
-        product_terms[p] = x^p * exp((1 + tau^2) * x) / (1 + tau^2).
-
-    Construct a recursive coefficient matrix following the above recursive relation to compute all integral terms up to p = (solver_order - 1).
-    Return coefficients used by the SA-Solver in data prediction mode.
-
-    Args:
-        s: Start time s.
-        t: End time t.
-        solver_order: Current order of the solver.
-        tau_t: Stochastic strength parameter in the SDE.
-
-    Returns:
-        Exponential coefficients used in data prediction, with exp((1 + tau^2) * t) factored out, ordered from p=0 to p=solver_order−1, shape (solver_order,).
-    """
-    tau_mul = 1 + tau_t ** 2
-    h = t - s
-    p = torch.arange(solver_order, dtype=s.dtype, device=s.device)
-
-    # product_terms after factoring out exp((1 + tau^2) * t)
-    # Includes (1 + tau^2) factor from outside the integral
-    product_terms_factored = (t ** p - s ** p * (-tau_mul * h).exp())
-
-    # Lower triangular recursive coefficient matrix
-    # Accumulates recursive coefficients based on p / (1 + tau^2)
-    recursive_depth_mat = p.unsqueeze(1) - p.unsqueeze(0)
-    log_factorial = (p + 1).lgamma()
-    recursive_coeff_mat = log_factorial.unsqueeze(1) - log_factorial.unsqueeze(0)
-    if tau_t > 0:
-        recursive_coeff_mat = recursive_coeff_mat - (recursive_depth_mat * math.log(tau_mul))
-    signs = torch.where(recursive_depth_mat % 2 == 0, 1.0, -1.0)
-    recursive_coeff_mat = (recursive_coeff_mat.exp() * signs).tril()
-
-    return recursive_coeff_mat @ product_terms_factored
-
-
-def compute_simple_stochastic_adams_b_coeffs(sigma_next: torch.Tensor, curr_lambdas: torch.Tensor, lambda_s: torch.Tensor, lambda_t: torch.Tensor, tau_t: float, is_corrector_step: bool = False) -> torch.Tensor:
-    """Compute simple order-2 b coefficients from SA-Solver paper (Appendix D. Implementation Details)."""
-    tau_mul = 1 + tau_t ** 2
-    h = lambda_t - lambda_s
-    alpha_t = sigma_next * lambda_t.exp()
-    if is_corrector_step:
-        # Simplified 1-step (order-2) corrector
-        b_1 = alpha_t * (0.5 * tau_mul * h)
-        b_2 = alpha_t * (-h * tau_mul).expm1().neg() - b_1
-    else:
-        # Simplified 2-step predictor
-        b_2 = alpha_t * (0.5 * tau_mul * h ** 2) / (curr_lambdas[-2] - lambda_s)
-        b_1 = alpha_t * (-h * tau_mul).expm1().neg() - b_2
-    return torch.stack([b_2, b_1])
-
-
-def compute_stochastic_adams_b_coeffs(sigma_next: torch.Tensor, curr_lambdas: torch.Tensor, lambda_s: torch.Tensor, lambda_t: torch.Tensor, tau_t: float, simple_order_2: bool = False, is_corrector_step: bool = False) -> torch.Tensor:
-    """Compute b_i coefficients for the SA-Solver (see eqs. 15 and 18).
-
-    The solver order corresponds to the number of input lambdas (half-logSNR points).
-
-    Args:
-        sigma_next: Sigma at end time t.
-        curr_lambdas: Lambda time points used to construct the Lagrange basis, shape (N,).
-        lambda_s: Lambda at start time s.
-        lambda_t: Lambda at end time t.
-        tau_t: Stochastic strength parameter in the SDE.
-        simple_order_2: Whether to enable the simple order-2 scheme.
-        is_corrector_step: Flag for corrector step in simple order-2 mode.
-
-    Returns:
-        b_i coefficients for the SA-Solver, shape (N,), where N is the solver order.
-    """
-    num_timesteps = curr_lambdas.shape[0]
-
-    if simple_order_2 and num_timesteps == 2:
-        return compute_simple_stochastic_adams_b_coeffs(sigma_next, curr_lambdas, lambda_s, lambda_t, tau_t, is_corrector_step)
-
-    # Compute coefficients by solving a linear system from Lagrange basis interpolation
-    exp_integral_coeffs = compute_exponential_coeffs(lambda_s, lambda_t, num_timesteps, tau_t)
-    vandermonde_matrix_T = torch.vander(curr_lambdas, num_timesteps, increasing=True).T
-    lagrange_integrals = torch.linalg.solve(vandermonde_matrix_T, exp_integral_coeffs)
-
-    # (sigma_t * exp(-tau^2 * lambda_t)) * exp((1 + tau^2) * lambda_t)
-    # = sigma_t * exp(lambda_t) = alpha_t
-    # exp((1 + tau^2) * lambda_t) is extracted from the integral
-    alpha_t = sigma_next * lambda_t.exp()
-    return alpha_t * lagrange_integrals
-
-
-def get_tau_interval_func(start_sigma: float, end_sigma: float, eta: float = 1.0) -> Callable[[Union[torch.Tensor, float]], float]:
-    """Return a function that controls the stochasticity of SA-Solver.
-
-    When eta = 0, SA-Solver runs as ODE. The official approach uses
-    time t to determine the SDE interval, while here we use sigma instead.
-
-    See:
-        https://github.com/scxue/SA-Solver/blob/main/README.md
-    """
-
-    def tau_func(sigma: Union[torch.Tensor, float]) -> float:
-        if eta <= 0:
-            return 0.0  # ODE
-
-        if isinstance(sigma, torch.Tensor):
-            sigma = sigma.item()
-        return eta if start_sigma >= sigma >= end_sigma else 0.0
-
-    return tau_func
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@@ -1,5 +1,4 @@
 import math
-from functools import partial

 from scipy import integrate
 import torch
@@ -9,7 +8,6 @@ from tqdm.auto import trange, tqdm

 from . import utils
 from . import deis
-from . import sa_solver
 import comfy.model_patcher
 import comfy.model_sampling

@@ -42,7 +40,7 @@ def get_sigmas_polyexponential(n, sigma_min, sigma_max, rho=1., device='cpu'):
 def get_sigmas_vp(n, beta_d=19.9, beta_min=0.1, eps_s=1e-3, device='cpu'):
    """Constructs a continuous VP noise schedule."""
    t = torch.linspace(1, eps_s, n, device=device)
-    sigmas = torch.sqrt(torch.special.expm1(beta_d * t ** 2 / 2 + beta_min * t))
+    sigmas = torch.sqrt(torch.exp(beta_d * t ** 2 / 2 + beta_min * t) - 1)
    return append_zero(sigmas)


@@ -144,43 +142,6 @@ class BrownianTreeNoiseSampler:
        return self.tree(t0, t1) / (t1 - t0).abs().sqrt()


-def sigma_to_half_log_snr(sigma, model_sampling):
-    """Convert sigma to half-logSNR log(alpha_t / sigma_t)."""
-    if isinstance(model_sampling, comfy.model_sampling.CONST):
-        # log((1 - t) / t) = log((1 - sigma) / sigma)
-        return sigma.logit().neg()
-    return sigma.log().neg()
-
-
-def half_log_snr_to_sigma(half_log_snr, model_sampling):
-    """Convert half-logSNR log(alpha_t / sigma_t) to sigma."""
-    if isinstance(model_sampling, comfy.model_sampling.CONST):
-        # 1 / (1 + exp(half_log_snr))
-        return half_log_snr.neg().sigmoid()
-    return half_log_snr.neg().exp()
-
-
-def offset_first_sigma_for_snr(sigmas, model_sampling, percent_offset=1e-4):
-    """Adjust the first sigma to avoid invalid logSNR."""
-    if len(sigmas) <= 1:
-        return sigmas
-    if isinstance(model_sampling, comfy.model_sampling.CONST):
-        if sigmas[0] >= 1:
-            sigmas = sigmas.clone()
-            sigmas[0] = model_sampling.percent_to_sigma(percent_offset)
-    return sigmas
-
-
-def ei_h_phi_1(h: torch.Tensor) -> torch.Tensor:
-    """Compute the result of h*phi_1(h) in exponential integrator methods."""
-    return torch.expm1(h)
-
-
-def ei_h_phi_2(h: torch.Tensor) -> torch.Tensor:
-    """Compute the result of h*phi_2(h) in exponential integrator methods."""
-    return (torch.expm1(h) - h) / h
-
-
@torch.no_grad()
 def sample_euler(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
    """Implements Algorithm 2 (Euler steps) from Karras et al. (2022)."""
@@ -423,13 +384,9 @@ def sample_lms(model, x, sigmas, extra_args=None, callback=None, disable=None, o
            ds.pop(0)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        if sigmas[i + 1] == 0:
-            # Denoising step
-            x = denoised
-        else:
-            cur_order = min(i + 1, order)
-            coeffs = [linear_multistep_coeff(cur_order, sigmas_cpu, i, j) for j in range(cur_order)]
-            x = x + sum(coeff * d for coeff, d in zip(coeffs, reversed(ds)))
+        cur_order = min(i + 1, order)
+        coeffs = [linear_multistep_coeff(cur_order, sigmas_cpu, i, j) for j in range(cur_order)]
+        x = x + sum(coeff * d for coeff, d in zip(coeffs, reversed(ds)))
    return x


@@ -725,61 +682,49 @@ def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=Non
        # logged_x = torch.cat((logged_x, x.unsqueeze(0)), dim=0)
    return x

-
@torch.no_grad()
 def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    """DPM-Solver++ (stochastic)."""
    if len(sigmas) <= 1:
        return x

    extra_args = {} if extra_args is None else extra_args
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    seed = extra_args.get("seed", None)
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
-
-    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
-    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
-    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
-    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
+    sigma_fn = lambda t: t.neg().exp()
+    t_fn = lambda sigma: sigma.log().neg()

    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        if sigmas[i + 1] == 0:
-            # Denoising step
-            x = denoised
+            # Next sigma is 0: take a plain Euler step instead of the SDE update
+            d = to_d(x, sigmas[i], denoised)
+            dt = sigmas[i + 1] - sigmas[i]
+            x = x + d * dt
        else:
            # DPM-Solver++
-            lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1])
-            h = lambda_t - lambda_s
-            lambda_s_1 = lambda_s + r * h
+            t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1])
+            h = t_next - t
+            s = t + h * r
            fac = 1 / (2 * r)

-            sigma_s_1 = sigma_fn(lambda_s_1)
-
-            alpha_s = sigmas[i] * lambda_s.exp()
-            alpha_s_1 = sigma_s_1 * lambda_s_1.exp()
-            alpha_t = sigmas[i + 1] * lambda_t.exp()
-
            # Step 1
-            sd, su = get_ancestral_step(lambda_s.neg().exp(), lambda_s_1.neg().exp(), eta)
-            lambda_s_1_ = sd.log().neg()
-            h_ = lambda_s_1_ - lambda_s
-            x_2 = (alpha_s_1 / alpha_s) * (-h_).exp() * x - alpha_s_1 * (-h_).expm1() * denoised
-            if eta > 0 and s_noise > 0:
-                x_2 = x_2 + alpha_s_1 * noise_sampler(sigmas[i], sigma_s_1) * s_noise * su
-            denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
+            sd, su = get_ancestral_step(sigma_fn(t), sigma_fn(s), eta)
+            s_ = t_fn(sd)
+            x_2 = (sigma_fn(s_) / sigma_fn(t)) * x - (t - s_).expm1() * denoised
+            x_2 = x_2 + noise_sampler(sigma_fn(t), sigma_fn(s)) * s_noise * su
+            denoised_2 = model(x_2, sigma_fn(s) * s_in, **extra_args)

            # Step 2
-            sd, su = get_ancestral_step(lambda_s.neg().exp(), lambda_t.neg().exp(), eta)
-            lambda_t_ = sd.log().neg()
-            h_ = lambda_t_ - lambda_s
+            sd, su = get_ancestral_step(sigma_fn(t), sigma_fn(t_next), eta)
+            t_next_ = t_fn(sd)
            denoised_d = (1 - fac) * denoised + fac * denoised_2
-            x = (alpha_t / alpha_s) * (-h_).exp() * x - alpha_t * (-h_).expm1() * denoised_d
-            if eta > 0 and s_noise > 0:
-                x = x + alpha_t * noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * su
+            x = (sigma_fn(t_next_) / sigma_fn(t)) * x - (t - t_next_).expm1() * denoised_d
+            x = x + noise_sampler(sigma_fn(t), sigma_fn(t_next)) * s_noise * su
    return x


@@ -808,7 +753,6 @@ def sample_dpmpp_2m(model, x, sigmas, extra_args=None, callback=None, disable=No
        old_denoised = denoised
    return x

-
@torch.no_grad()
 def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
    """DPM-Solver++(2M) SDE."""
@@ -818,18 +762,15 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    if solver_type not in {'heun', 'midpoint'}:
        raise ValueError('solver_type must be \'heun\' or \'midpoint\'')

    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])

-    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
-    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
-    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
-
    old_denoised = None
-    h, h_last = None, None
+    h_last = None  # step size of the previous iteration, read when forming r = h_last / h
+    h = None

    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@@ -840,34 +781,26 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
            x = denoised
        else:
            # DPM-Solver++(2M) SDE
-            lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1])
-            h = lambda_t - lambda_s
-            h_eta = h * (eta + 1)
+            t, s = -sigmas[i].log(), -sigmas[i + 1].log()
+            h = s - t
+            eta_h = eta * h

-            alpha_t = sigmas[i + 1] * lambda_t.exp()
-
-            x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x + alpha_t * (-h_eta).expm1().neg() * denoised
+            x = sigmas[i + 1] / sigmas[i] * (-eta_h).exp() * x + (-h - eta_h).expm1().neg() * denoised

            if old_denoised is not None:
                r = h_last / h
                if solver_type == 'heun':
-                    x = x + alpha_t * ((-h_eta).expm1().neg() / (-h_eta) + 1) * (1 / r) * (denoised - old_denoised)
+                    x = x + ((-h - eta_h).expm1().neg() / (-h - eta_h) + 1) * (1 / r) * (denoised - old_denoised)
                elif solver_type == 'midpoint':
-                    x = x + 0.5 * alpha_t * (-h_eta).expm1().neg() * (1 / r) * (denoised - old_denoised)
+                    x = x + 0.5 * (-h - eta_h).expm1().neg() * (1 / r) * (denoised - old_denoised)

-            if eta > 0 and s_noise > 0:
-                x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * h * eta).expm1().neg().sqrt() * s_noise
+            if eta:
+                x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * eta_h).expm1().neg().sqrt() * s_noise

        old_denoised = denoised
        h_last = h
    return x

-
-@torch.no_grad()
-def sample_dpmpp_2m_sde_heun(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='heun'):
-    return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
-
-
@torch.no_grad()
 def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """DPM-Solver++(3M) SDE."""
@@ -875,16 +808,12 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    if len(sigmas) <= 1:
        return x

-    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
+    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])

-    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
-    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
-    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
-
    denoised_1, denoised_2 = None, None
    h, h_1, h_2 = None, None, None

@@ -896,16 +825,13 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
            # Denoising step
            x = denoised
        else:
-            lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1])
-            h = lambda_t - lambda_s
+            t, s = -sigmas[i].log(), -sigmas[i + 1].log()
+            h = s - t
            h_eta = h * (eta + 1)

-            alpha_t = sigmas[i + 1] * lambda_t.exp()
-
-            x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x + alpha_t * (-h_eta).expm1().neg() * denoised
+            x = torch.exp(-h_eta) * x + (-h_eta).expm1().neg() * denoised

            if h_2 is not None:
-                # DPM-Solver++(3M) SDE
                r0 = h_1 / h
                r1 = h_2 / h
                d1_0 = (denoised - denoised_1) / r0
@@ -914,57 +840,43 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
                d2 = (d1_0 - d1_1) / (r0 + r1)
                phi_2 = h_eta.neg().expm1() / h_eta + 1
                phi_3 = phi_2 / h_eta - 0.5
-                x = x + (alpha_t * phi_2) * d1 - (alpha_t * phi_3) * d2
+                x = x + phi_2 * d1 - phi_3 * d2
            elif h_1 is not None:
-                # DPM-Solver++(2M) SDE
                r = h_1 / h
                d = (denoised - denoised_1) / r
                phi_2 = h_eta.neg().expm1() / h_eta + 1
-                x = x + (alpha_t * phi_2) * d
+                x = x + phi_2 * d

-            if eta > 0 and s_noise > 0:
+            if eta:
                x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * h * eta).expm1().neg().sqrt() * s_noise

        denoised_1, denoised_2 = denoised, denoised_1
        h_1, h_2 = h, h_1
    return x

-
@torch.no_grad()
 def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler)

-
-@torch.no_grad()
-def sample_dpmpp_2m_sde_heun_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='heun'):
-    if len(sigmas) <= 1:
-        return x
-    extra_args = {} if extra_args is None else extra_args
-    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
-    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
-    return sample_dpmpp_2m_sde_heun(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
-
-
@torch.no_grad()
 def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)

-
@torch.no_grad()
 def sample_dpmpp_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, r=r)
@@ -1097,9 +1009,7 @@ def sample_ipndm(model, x, sigmas, extra_args=None, callback=None, disable=None,
        d_cur = (x_cur - denoised) / t_cur

        order = min(max_order, i+1)
-        if t_next == 0:     # Denoising step
-            x_next = denoised
-        elif order == 1:    # First Euler step.
+        if order == 1:      # First Euler step.
            x_next = x_cur + (t_next - t_cur) * d_cur
        elif order == 2:    # Use one history point.
            x_next = x_cur + (t_next - t_cur) * (3 * d_cur - buffer_model[-1]) / 2
@@ -1117,7 +1027,6 @@ def sample_ipndm(model, x, sigmas, extra_args=None, callback=None, disable=None,

    return x_next

-
 #From https://github.com/zju-pi/diff-sampler/blob/main/diff-solvers-main/solvers.py
 #under Apache 2 license
 def sample_ipndm_v(model, x, sigmas, extra_args=None, callback=None, disable=None, max_order=4):
@@ -1141,9 +1050,7 @@ def sample_ipndm_v(model, x, sigmas, extra_args=None, callback=None, disable=Non
        d_cur = (x_cur - denoised) / t_cur

        order = min(max_order, i+1)
-        if t_next == 0:     # Denoising step
-            x_next = denoised
-        elif order == 1:    # First Euler step.
+        if order == 1:      # First Euler step.
            x_next = x_cur + (t_next - t_cur) * d_cur
        elif order == 2:    # Use one history point.
            h_n = (t_next - t_cur)
@@ -1183,7 +1090,6 @@ def sample_ipndm_v(model, x, sigmas, extra_args=None, callback=None, disable=Non

    return x_next

-
 #From https://github.com/zju-pi/diff-sampler/blob/main/diff-solvers-main/solvers.py
 #under Apache 2 license
@torch.no_grad()
@@ -1234,22 +1140,39 @@ def sample_deis(model, x, sigmas, extra_args=None, callback=None, disable=None,

    return x_next

+@torch.no_grad()
+def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None):
+    extra_args = {} if extra_args is None else extra_args
+
+    temp = [0]
+    def post_cfg_function(args):
+        temp[0] = args["uncond_denoised"]
+        return args["denoised"]
+
+    model_options = extra_args.get("model_options", {}).copy()
+    extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
+
+    s_in = x.new_ones([x.shape[0]])
+    for i in trange(len(sigmas) - 1, disable=disable):
+        sigma_hat = sigmas[i]
+        denoised = model(x, sigma_hat * s_in, **extra_args)
+        d = to_d(x, sigma_hat, temp[0])
+        if callback is not None:
+            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
+        # Euler method
+        x = denoised + d * sigmas[i + 1]
+    return x

@torch.no_grad()
 def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    """Ancestral sampling with Euler method steps (CFG++)."""
+    """Ancestral sampling with Euler method steps."""
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler

-    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
-    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
-
-    uncond_denoised = None
-
+    temp = [0]
    def post_cfg_function(args):
-        nonlocal uncond_denoised
-        uncond_denoised = args["uncond_denoised"]
+        temp[0] = args["uncond_denoised"]
        return args["denoised"]

    model_options = extra_args.get("model_options", {}).copy()
@@ -1258,33 +1181,15 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
+        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        if sigmas[i + 1] == 0:
-            # Denoising step
-            x = denoised
-        else:
-            alpha_s = sigmas[i] * lambda_fn(sigmas[i]).exp()
-            alpha_t = sigmas[i + 1] * lambda_fn(sigmas[i + 1]).exp()
-            d = to_d(x, sigmas[i], alpha_s * uncond_denoised)   # to noise
-
-            # DDIM stochastic sampling
-            sigma_down, sigma_up = get_ancestral_step(sigmas[i] / alpha_s, sigmas[i + 1] / alpha_t, eta=eta)
-            sigma_down = alpha_t * sigma_down
-
-            # Euler method
-            x = alpha_t * denoised + sigma_down * d
-            if eta > 0 and s_noise > 0:
-                x = x + alpha_t * noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
+        d = to_d(x, sigmas[i], temp[0])
+        # Euler method
+        x = denoised + d * sigma_down
+        if sigmas[i + 1] > 0:
+            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
    return x
-
-
-@torch.no_grad()
-def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None):
-    """Euler method steps (CFG++)."""
-    return sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=0.0, s_noise=0.0, noise_sampler=None)
-
-
@torch.no_grad()
 def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
@@ -1362,7 +1267,7 @@ def sample_dpmpp_2m_cfg_pp(model, x, sigmas, extra_args=None, callback=None, dis
    return x

@torch.no_grad()
-def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None, eta=1., cfg_pp=False):
+def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1., noise_sampler=None, cfg_pp=False):
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
@@ -1372,7 +1277,6 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
    phi1_fn = lambda t: torch.expm1(t) / t
    phi2_fn = lambda t: (phi1_fn(t) - 1.0) / t

-    old_sigma_down = None
    old_denoised = None
    uncond_denoised = None
    def post_cfg_function(args):
@@ -1385,403 +1289,50 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
        extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)

    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
+        if s_churn > 0:
+            gamma = min(s_churn / (len(sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.0
+            sigma_hat = sigmas[i] * (gamma + 1)
+        else:
+            gamma = 0
+            sigma_hat = sigmas[i]
+
+        if gamma > 0:
+            eps = torch.randn_like(x) * s_noise
+            x = x + eps * (sigma_hat**2 - sigmas[i] ** 2) ** 0.5
+        denoised = model(x, sigma_hat * s_in, **extra_args)
        if callback is not None:
-            callback({"x": x, "i": i, "sigma": sigmas[i], "sigma_hat": sigmas[i], "denoised": denoised})
-        if sigma_down == 0 or old_denoised is None:
+            callback({"x": x, "i": i, "sigma": sigmas[i], "sigma_hat": sigma_hat, "denoised": denoised})
+        if sigmas[i + 1] == 0 or old_denoised is None:
            # Euler method
            if cfg_pp:
-                d = to_d(x, sigmas[i], uncond_denoised)
-                x = denoised + d * sigma_down
+                d = to_d(x, sigma_hat, uncond_denoised)
+                x = denoised + d * sigmas[i + 1]
            else:
-                d = to_d(x, sigmas[i], denoised)
-                dt = sigma_down - sigmas[i]
+                d = to_d(x, sigma_hat, denoised)
+                dt = sigmas[i + 1] - sigma_hat
                x = x + d * dt
        else:
            # Second order multistep method in https://arxiv.org/pdf/2308.02157
-            t, t_old, t_next, t_prev = t_fn(sigmas[i]), t_fn(old_sigma_down), t_fn(sigma_down), t_fn(sigmas[i - 1])
+            t, t_next, t_prev = t_fn(sigmas[i]), t_fn(sigmas[i + 1]), t_fn(sigmas[i - 1])
            h = t_next - t
-            c2 = (t_prev - t_old) / h
+            c2 = (t_prev - t) / h

            phi1_val, phi2_val = phi1_fn(-h), phi2_fn(-h)
-            b1 = torch.nan_to_num(phi1_val - phi2_val / c2, nan=0.0)
-            b2 = torch.nan_to_num(phi2_val / c2, nan=0.0)
+            b1 = torch.nan_to_num(phi1_val - 1.0 / c2 * phi2_val, nan=0.0)
+            b2 = torch.nan_to_num(1.0 / c2 * phi2_val, nan=0.0)

            if cfg_pp:
                x = x + (denoised - uncond_denoised)
-                x = sigma_fn(h) * x + h * (b1 * uncond_denoised + b2 * old_denoised)
-            else:
-                x = sigma_fn(h) * x + h * (b1 * denoised + b2 * old_denoised)

-        # Noise addition
-        if sigmas[i + 1] > 0:
-            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
+            x = (sigma_fn(t_next) / sigma_fn(t)) * x + h * (b1 * denoised + b2 * old_denoised)

-        if cfg_pp:
-            old_denoised = uncond_denoised
-        else:
-            old_denoised = denoised
-        old_sigma_down = sigma_down
-    return x
-
-@torch.no_grad()
-def sample_res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=0., cfg_pp=False)
-
-@torch.no_grad()
-def sample_res_multistep_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=0., cfg_pp=True)
-
-@torch.no_grad()
-def sample_res_multistep_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=False)
-
-@torch.no_grad()
-def sample_res_multistep_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=True)
-
-
-@torch.no_grad()
-def sample_gradient_estimation(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2., cfg_pp=False):
-    """Gradient-estimation sampler. Paper: https://openreview.net/pdf?id=o2ND9v0CeK"""
-    extra_args = {} if extra_args is None else extra_args
-    s_in = x.new_ones([x.shape[0]])
-    old_d = None
-
-    uncond_denoised = None
-    def post_cfg_function(args):
-        nonlocal uncond_denoised
-        uncond_denoised = args["uncond_denoised"]
-        return args["denoised"]
-
-    if cfg_pp:
-        model_options = extra_args.get("model_options", {}).copy()
-        extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if cfg_pp:
-            d = to_d(x, sigmas[i], uncond_denoised)
-        else:
-            d = to_d(x, sigmas[i], denoised)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        dt = sigmas[i + 1] - sigmas[i]
-        if sigmas[i + 1] == 0:
-            # Denoising step
-            x = denoised
-        else:
-            # Euler method
-            if cfg_pp:
-                x = denoised + d * sigmas[i + 1]
-            else:
-                x = x + d * dt
-
-            if i >= 1:
-                # Gradient estimation
-                d_bar = (ge_gamma - 1) * (d - old_d)
-                x = x + d_bar * dt
-        old_d = d
-    return x
-
-
-@torch.no_grad()
-def sample_gradient_estimation_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2.):
-    return sample_gradient_estimation(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, ge_gamma=ge_gamma, cfg_pp=True)
-
-
-@torch.no_grad()
-def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1.0, noise_sampler=None, noise_scaler=None, max_stage=3):
-    """Extended Reverse-Time SDE solver (VP ER-SDE-Solver-3). arXiv: https://arxiv.org/abs/2309.06169.
-    Code reference: https://github.com/QinpengCui/ER-SDE-Solver/blob/main/er_sde_solver.py.
-    """
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-
-    def default_er_sde_noise_scaler(x):
-        return x * ((x ** 0.3).exp() + 10.0)
-
-    noise_scaler = default_er_sde_noise_scaler if noise_scaler is None else noise_scaler
-    num_integration_points = 200.0
-    point_indice = torch.arange(0, num_integration_points, dtype=torch.float32, device=x.device)
-
-    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
-    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
-    half_log_snrs = sigma_to_half_log_snr(sigmas, model_sampling)
-    er_lambdas = half_log_snrs.neg().exp()  # er_lambda_t = sigma_t / alpha_t
-
-    old_denoised = None
-    old_denoised_d = None
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        stage_used = min(max_stage, i + 1)
-        if sigmas[i + 1] == 0:
-            x = denoised
-        else:
-            er_lambda_s, er_lambda_t = er_lambdas[i], er_lambdas[i + 1]
-            alpha_s = sigmas[i] / er_lambda_s
-            alpha_t = sigmas[i + 1] / er_lambda_t
-            r_alpha = alpha_t / alpha_s
-            r = noise_scaler(er_lambda_t) / noise_scaler(er_lambda_s)
-
-            # Stage 1 Euler
-            x = r_alpha * r * x + alpha_t * (1 - r) * denoised
-
-            if stage_used >= 2:
-                dt = er_lambda_t - er_lambda_s
-                lambda_step_size = -dt / num_integration_points
-                lambda_pos = er_lambda_t + point_indice * lambda_step_size
-                scaled_pos = noise_scaler(lambda_pos)
-
-                # Stage 2
-                s = torch.sum(1 / scaled_pos) * lambda_step_size
-                denoised_d = (denoised - old_denoised) / (er_lambda_s - er_lambdas[i - 1])
-                x = x + alpha_t * (dt + s * noise_scaler(er_lambda_t)) * denoised_d
-
-                if stage_used >= 3:
-                    # Stage 3
-                    s_u = torch.sum((lambda_pos - er_lambda_s) / scaled_pos) * lambda_step_size
-                    denoised_u = (denoised_d - old_denoised_d) / ((er_lambda_s - er_lambdas[i - 2]) / 2)
-                    x = x + alpha_t * ((dt ** 2) / 2 + s_u * noise_scaler(er_lambda_t)) * denoised_u
-                old_denoised_d = denoised_d
-
-            if s_noise > 0:
-                x = x + alpha_t * noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * (er_lambda_t ** 2 - er_lambda_s ** 2 * r ** 2).sqrt().nan_to_num(nan=0.0)
        old_denoised = denoised
    return x

+@torch.no_grad()
+def sample_res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_churn=s_churn, s_tmin=s_tmin, s_tmax=s_tmax, s_noise=s_noise, noise_sampler=noise_sampler, cfg_pp=False)

@torch.no_grad()
-def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5):
-    """SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 2.
-    arXiv: https://arxiv.org/abs/2305.14267 (NeurIPS 2023)
-    """
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-    inject_noise = eta > 0 and s_noise > 0
-
-    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
-    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
-    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
-    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
-
-    fac = 1 / (2 * r)
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-
-        if sigmas[i + 1] == 0:
-            x = denoised
-            continue
-
-        lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1])
-        h = lambda_t - lambda_s
-        h_eta = h * (eta + 1)
-        lambda_s_1 = torch.lerp(lambda_s, lambda_t, r)
-        sigma_s_1 = sigma_fn(lambda_s_1)
-
-        alpha_s_1 = sigma_s_1 * lambda_s_1.exp()
-        alpha_t = sigmas[i + 1] * lambda_t.exp()
-
-        # Step 1
-        x_2 = sigma_s_1 / sigmas[i] * (-r * h * eta).exp() * x - alpha_s_1 * ei_h_phi_1(-r * h_eta) * denoised
-        if inject_noise:
-            sde_noise = (-2 * r * h * eta).expm1().neg().sqrt() * noise_sampler(sigmas[i], sigma_s_1)
-            x_2 = x_2 + sde_noise * sigma_s_1 * s_noise
-        denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
-
-        # Step 2
-        denoised_d = torch.lerp(denoised, denoised_2, fac)
-        x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d
-        if inject_noise:
-            segment_factor = (r - 1) * h * eta
-            sde_noise = sde_noise * segment_factor.exp()
-            sde_noise = sde_noise + segment_factor.mul(2).expm1().neg().sqrt() * noise_sampler(sigma_s_1, sigmas[i + 1])
-            x = x + sde_noise * sigmas[i + 1] * s_noise
-    return x
-
-
-@torch.no_grad()
-def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3):
-    """SEEDS-3 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 3.
-    arXiv: https://arxiv.org/abs/2305.14267 (NeurIPS 2023)
-    """
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-    inject_noise = eta > 0 and s_noise > 0
-
-    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
-    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
-    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
-    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-
-        if sigmas[i + 1] == 0:
-            x = denoised
-            continue
-
-        lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1])
-        h = lambda_t - lambda_s
-        h_eta = h * (eta + 1)
-        lambda_s_1 = torch.lerp(lambda_s, lambda_t, r_1)
-        lambda_s_2 = torch.lerp(lambda_s, lambda_t, r_2)
-        sigma_s_1, sigma_s_2 = sigma_fn(lambda_s_1), sigma_fn(lambda_s_2)
-
-        alpha_s_1 = sigma_s_1 * lambda_s_1.exp()
-        alpha_s_2 = sigma_s_2 * lambda_s_2.exp()
-        alpha_t = sigmas[i + 1] * lambda_t.exp()
-
-        # Step 1
-        x_2 = sigma_s_1 / sigmas[i] * (-r_1 * h * eta).exp() * x - alpha_s_1 * ei_h_phi_1(-r_1 * h_eta) * denoised
-        if inject_noise:
-            sde_noise = (-2 * r_1 * h * eta).expm1().neg().sqrt() * noise_sampler(sigmas[i], sigma_s_1)
-            x_2 = x_2 + sde_noise * sigma_s_1 * s_noise
-        denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
-
-        # Step 2
-        a3_2 = r_2 / r_1 * ei_h_phi_2(-r_2 * h_eta)
-        a3_1 = ei_h_phi_1(-r_2 * h_eta) - a3_2
-        x_3 = sigma_s_2 / sigmas[i] * (-r_2 * h * eta).exp() * x - alpha_s_2 * (a3_1 * denoised + a3_2 * denoised_2)
-        if inject_noise:
-            segment_factor = (r_1 - r_2) * h * eta
-            sde_noise = sde_noise * segment_factor.exp()
-            sde_noise = sde_noise + segment_factor.mul(2).expm1().neg().sqrt() * noise_sampler(sigma_s_1, sigma_s_2)
-            x_3 = x_3 + sde_noise * sigma_s_2 * s_noise
-        denoised_3 = model(x_3, sigma_s_2 * s_in, **extra_args)
-
-        # Step 3
-        b3 = ei_h_phi_2(-h_eta) / r_2
-        b1 = ei_h_phi_1(-h_eta) - b3
-        x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * (b1 * denoised + b3 * denoised_3)
-        if inject_noise:
-            segment_factor = (r_2 - 1) * h * eta
-            sde_noise = sde_noise * segment_factor.exp()
-            sde_noise = sde_noise + segment_factor.mul(2).expm1().neg().sqrt() * noise_sampler(sigma_s_2, sigmas[i + 1])
-            x = x + sde_noise * sigmas[i + 1] * s_noise
-    return x
-
-
-@torch.no_grad()
-def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, use_pece=False, simple_order_2=False):
-    """Stochastic Adams Solver with predictor-corrector method (NeurIPS 2023)."""
-    if len(sigmas) <= 1:
-        return x
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-
-    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
-    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
-    lambdas = sigma_to_half_log_snr(sigmas, model_sampling=model_sampling)
-
-    if tau_func is None:
-        # Use default interval for stochastic sampling
-        start_sigma = model_sampling.percent_to_sigma(0.2)
-        end_sigma = model_sampling.percent_to_sigma(0.8)
-        tau_func = sa_solver.get_tau_interval_func(start_sigma, end_sigma, eta=1.0)
-
-    max_used_order = max(predictor_order, corrector_order)
-    x_pred = x  # x: current state, x_pred: predicted next state
-
-    h = 0.0
-    tau_t = 0.0
-    noise = 0.0
-    pred_list = []
-
-    # Lower order near the end to improve stability
-    lower_order_to_end = sigmas[-1].item() == 0
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        # Evaluation
-        denoised = model(x_pred, sigmas[i] * s_in, **extra_args)
-        if callback is not None:
-            callback({"x": x_pred, "i": i, "sigma": sigmas[i], "sigma_hat": sigmas[i], "denoised": denoised})
-        pred_list.append(denoised)
-        pred_list = pred_list[-max_used_order:]
-
-        predictor_order_used = min(predictor_order, len(pred_list))
-        if i == 0 or (sigmas[i + 1] == 0 and not use_pece):
-            corrector_order_used = 0
-        else:
-            corrector_order_used = min(corrector_order, len(pred_list))
-
-        if lower_order_to_end:
-            predictor_order_used = min(predictor_order_used, len(sigmas) - 2 - i)
-            corrector_order_used = min(corrector_order_used, len(sigmas) - 1 - i)
-
-        # Corrector
-        if corrector_order_used == 0:
-            # Update by the predicted state
-            x = x_pred
-        else:
-            curr_lambdas = lambdas[i - corrector_order_used + 1:i + 1]
-            b_coeffs = sa_solver.compute_stochastic_adams_b_coeffs(
-                sigmas[i],
-                curr_lambdas,
-                lambdas[i - 1],
-                lambdas[i],
-                tau_t,
-                simple_order_2,
-                is_corrector_step=True,
-            )
-            pred_mat = torch.stack(pred_list[-corrector_order_used:], dim=1)    # (B, K, ...)
-            corr_res = torch.tensordot(pred_mat, b_coeffs, dims=([1], [0]))  # (B, ...)
-            x = sigmas[i] / sigmas[i - 1] * (-(tau_t ** 2) * h).exp() * x + corr_res
-
-            if tau_t > 0 and s_noise > 0:
-                # The noise from the previous predictor step
-                x = x + noise
-
-            if use_pece:
-                # Evaluate the corrected state
-                denoised = model(x, sigmas[i] * s_in, **extra_args)
-                pred_list[-1] = denoised
-
-        # Predictor
-        if sigmas[i + 1] == 0:
-            # Denoising step
-            x = denoised
-        else:
-            tau_t = tau_func(sigmas[i + 1])
-            curr_lambdas = lambdas[i - predictor_order_used + 1:i + 1]
-            b_coeffs = sa_solver.compute_stochastic_adams_b_coeffs(
-                sigmas[i + 1],
-                curr_lambdas,
-                lambdas[i],
-                lambdas[i + 1],
-                tau_t,
-                simple_order_2,
-                is_corrector_step=False,
-            )
-            pred_mat = torch.stack(pred_list[-predictor_order_used:], dim=1)    # (B, K, ...)
-            pred_res = torch.tensordot(pred_mat, b_coeffs, dims=([1], [0]))  # (B, ...)
-            h = lambdas[i + 1] - lambdas[i]
-            x_pred = sigmas[i + 1] / sigmas[i] * (-(tau_t ** 2) * h).exp() * x + pred_res
-
-            if tau_t > 0 and s_noise > 0:
-                noise = noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * tau_t ** 2 * h).expm1().neg().sqrt() * s_noise
-                x_pred = x_pred + noise
-    return x
-
-
-@torch.no_grad()
-def sample_sa_solver_pece(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, simple_order_2=False):
-    """Stochastic Adams Solver with PECE (Predict–Evaluate–Correct–Evaluate) mode (NeurIPS 2023)."""
-    return sample_sa_solver(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, tau_func=tau_func, s_noise=s_noise, noise_sampler=noise_sampler, predictor_order=predictor_order, corrector_order=corrector_order, use_pece=True, simple_order_2=simple_order_2)
+def sample_res_multistep_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_churn=s_churn, s_tmin=s_tmin, s_tmax=s_tmax, s_noise=s_noise, noise_sampler=noise_sampler, cfg_pp=True)
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -407,142 +407,3 @@ class Cosmos1CV8x8x8(LatentFormat):
    ]

    latent_rgb_factors_bias = [-0.1223, -0.1889, -0.1976]
-
-class Wan21(LatentFormat):
-    latent_channels = 16
-    latent_dimensions = 3
-
-    latent_rgb_factors = [
-            [-0.1299, -0.1692,  0.2932],
-            [ 0.0671,  0.0406,  0.0442],
-            [ 0.3568,  0.2548,  0.1747],
-            [ 0.0372,  0.2344,  0.1420],
-            [ 0.0313,  0.0189, -0.0328],
-            [ 0.0296, -0.0956, -0.0665],
-            [-0.3477, -0.4059, -0.2925],
-            [ 0.0166,  0.1902,  0.1975],
-            [-0.0412,  0.0267, -0.1364],
-            [-0.1293,  0.0740,  0.1636],
-            [ 0.0680,  0.3019,  0.1128],
-            [ 0.0032,  0.0581,  0.0639],
-            [-0.1251,  0.0927,  0.1699],
-            [ 0.0060, -0.0633,  0.0005],
-            [ 0.3477,  0.2275,  0.2950],
-            [ 0.1984,  0.0913,  0.1861]
-        ]
-
-    latent_rgb_factors_bias = [-0.1835, -0.0868, -0.3360]
-
-    def __init__(self):
-        self.scale_factor = 1.0
-        self.latents_mean = torch.tensor([
-            -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
-            0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
-        ]).view(1, self.latent_channels, 1, 1, 1)
-        self.latents_std = torch.tensor([
-            2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
-            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
-        ]).view(1, self.latent_channels, 1, 1, 1)
-
-
-        self.taesd_decoder_name = None #TODO
-
-    def process_in(self, latent):
-        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
-        latents_std = self.latents_std.to(latent.device, latent.dtype)
-        return (latent - latents_mean) * self.scale_factor / latents_std
-
-    def process_out(self, latent):
-        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
-        latents_std = self.latents_std.to(latent.device, latent.dtype)
-        return latent * latents_std / self.scale_factor + latents_mean
-
-class Wan22(Wan21):
-    latent_channels = 48
-    latent_dimensions = 3
-
-    latent_rgb_factors = [
-            [ 0.0119,  0.0103,  0.0046],
-            [-0.1062, -0.0504,  0.0165],
-            [ 0.0140,  0.0409,  0.0491],
-            [-0.0813, -0.0677,  0.0607],
-            [ 0.0656,  0.0851,  0.0808],
-            [ 0.0264,  0.0463,  0.0912],
-            [ 0.0295,  0.0326,  0.0590],
-            [-0.0244, -0.0270,  0.0025],
-            [ 0.0443, -0.0102,  0.0288],
-            [-0.0465, -0.0090, -0.0205],
-            [ 0.0359,  0.0236,  0.0082],
-            [-0.0776,  0.0854,  0.1048],
-            [ 0.0564,  0.0264,  0.0561],
-            [ 0.0006,  0.0594,  0.0418],
-            [-0.0319, -0.0542, -0.0637],
-            [-0.0268,  0.0024,  0.0260],
-            [ 0.0539,  0.0265,  0.0358],
-            [-0.0359, -0.0312, -0.0287],
-            [-0.0285, -0.1032, -0.1237],
-            [ 0.1041,  0.0537,  0.0622],
-            [-0.0086, -0.0374, -0.0051],
-            [ 0.0390,  0.0670,  0.2863],
-            [ 0.0069,  0.0144,  0.0082],
-            [ 0.0006, -0.0167,  0.0079],
-            [ 0.0313, -0.0574, -0.0232],
-            [-0.1454, -0.0902, -0.0481],
-            [ 0.0714,  0.0827,  0.0447],
-            [-0.0304, -0.0574, -0.0196],
-            [ 0.0401,  0.0384,  0.0204],
-            [-0.0758, -0.0297, -0.0014],
-            [ 0.0568,  0.1307,  0.1372],
-            [-0.0055, -0.0310, -0.0380],
-            [ 0.0239, -0.0305,  0.0325],
-            [-0.0663, -0.0673, -0.0140],
-            [-0.0416, -0.0047, -0.0023],
-            [ 0.0166,  0.0112, -0.0093],
-            [-0.0211,  0.0011,  0.0331],
-            [ 0.1833,  0.1466,  0.2250],
-            [-0.0368,  0.0370,  0.0295],
-            [-0.3441, -0.3543, -0.2008],
-            [-0.0479, -0.0489, -0.0420],
-            [-0.0660, -0.0153,  0.0800],
-            [-0.0101,  0.0068,  0.0156],
-            [-0.0690, -0.0452, -0.0927],
-            [-0.0145,  0.0041,  0.0015],
-            [ 0.0421,  0.0451,  0.0373],
-            [ 0.0504, -0.0483, -0.0356],
-            [-0.0837,  0.0168,  0.0055]
-        ]
-
-    latent_rgb_factors_bias = [0.0317, -0.0878, -0.1388]
-
-    def __init__(self):
-        self.scale_factor = 1.0
-        self.latents_mean = torch.tensor([
-                -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
-                -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
-                -0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502,
-                -0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.1230,
-                -0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.0520, 0.3748,
-                0.0152, 0.1957, 0.1433, -0.2944, 0.3573, -0.0548, -0.1681, -0.0667,
-            ]).view(1, self.latent_channels, 1, 1, 1)
-        self.latents_std = torch.tensor([
-                0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.4990, 0.4818, 0.5013,
-                0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978,
-                0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659,
-                0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093,
-                0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887,
-                0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744
-            ]).view(1, self.latent_channels, 1, 1, 1)
-
-class Hunyuan3Dv2(LatentFormat):
-    latent_channels = 64
-    latent_dimensions = 1
-    scale_factor = 0.9990943042622529
-
-class Hunyuan3Dv2mini(LatentFormat):
-    latent_channels = 64
-    latent_dimensions = 1
-    scale_factor = 1.0188137142395404
-
-class ACEAudio(LatentFormat):
-    latent_channels = 8
-    latent_dimensions = 2
--- a/comfy/ldm/ace/attention.py
+++ b/comfy/ldm/ace/attention.py
@@ -1,761 +0,0 @@
-# Original from: https://github.com/ace-step/ACE-Step/blob/main/models/attention.py
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Tuple, Union, Optional
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-import comfy.model_management
-from comfy.ldm.modules.attention import optimized_attention
-
-class Attention(nn.Module):
-    def __init__(
-        self,
-        query_dim: int,
-        cross_attention_dim: Optional[int] = None,
-        heads: int = 8,
-        kv_heads: Optional[int] = None,
-        dim_head: int = 64,
-        dropout: float = 0.0,
-        bias: bool = False,
-        qk_norm: Optional[str] = None,
-        added_kv_proj_dim: Optional[int] = None,
-        added_proj_bias: Optional[bool] = True,
-        out_bias: bool = True,
-        scale_qk: bool = True,
-        only_cross_attention: bool = False,
-        eps: float = 1e-5,
-        rescale_output_factor: float = 1.0,
-        residual_connection: bool = False,
-        processor=None,
-        out_dim: int = None,
-        out_context_dim: int = None,
-        context_pre_only=None,
-        pre_only=False,
-        elementwise_affine: bool = True,
-        is_causal: bool = False,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-
-        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
-        self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads
-        self.query_dim = query_dim
-        self.use_bias = bias
-        self.is_cross_attention = cross_attention_dim is not None
-        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
-        self.rescale_output_factor = rescale_output_factor
-        self.residual_connection = residual_connection
-        self.dropout = dropout
-        self.fused_projections = False
-        self.out_dim = out_dim if out_dim is not None else query_dim
-        self.out_context_dim = out_context_dim if out_context_dim is not None else query_dim
-        self.context_pre_only = context_pre_only
-        self.pre_only = pre_only
-        self.is_causal = is_causal
-
-        self.scale_qk = scale_qk
-        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
-
-        self.heads = out_dim // dim_head if out_dim is not None else heads
-        # for slice_size > 0 the attention score computation
-        # is split across the batch axis to save memory
-        # You can set slice_size with `set_attention_slice`
-        self.sliceable_head_dim = heads
-
-        self.added_kv_proj_dim = added_kv_proj_dim
-        self.only_cross_attention = only_cross_attention
-
-        if self.added_kv_proj_dim is None and self.only_cross_attention:
-            raise ValueError(
-                "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
-            )
-
-        self.group_norm = None
-        self.spatial_norm = None
-
-        self.norm_q = None
-        self.norm_k = None
-
-        self.norm_cross = None
-        self.to_q = operations.Linear(query_dim, self.inner_dim, bias=bias, dtype=dtype, device=device)
-
-        if not self.only_cross_attention:
-            # only relevant for the `AddedKVProcessor` classes
-            self.to_k = operations.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
-            self.to_v = operations.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
-        else:
-            self.to_k = None
-            self.to_v = None
-
-        self.added_proj_bias = added_proj_bias
-        if self.added_kv_proj_dim is not None:
-            self.add_k_proj = operations.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias, dtype=dtype, device=device)
-            self.add_v_proj = operations.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias, dtype=dtype, device=device)
-            if self.context_pre_only is not None:
-                self.add_q_proj = operations.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias, dtype=dtype, device=device)
-        else:
-            self.add_q_proj = None
-            self.add_k_proj = None
-            self.add_v_proj = None
-
-        if not self.pre_only:
-            self.to_out = nn.ModuleList([])
-            self.to_out.append(operations.Linear(self.inner_dim, self.out_dim, bias=out_bias, dtype=dtype, device=device))
-            self.to_out.append(nn.Dropout(dropout))
-        else:
-            self.to_out = None
-
-        if self.context_pre_only is not None and not self.context_pre_only:
-            self.to_add_out = operations.Linear(self.inner_dim, self.out_context_dim, bias=out_bias, dtype=dtype, device=device)
-        else:
-            self.to_add_out = None
-
-        self.norm_added_q = None
-        self.norm_added_k = None
-        self.processor = processor
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        **cross_attention_kwargs,
-    ) -> torch.Tensor:
-        return self.processor(
-            self,
-            hidden_states,
-            encoder_hidden_states=encoder_hidden_states,
-            attention_mask=attention_mask,
-            **cross_attention_kwargs,
-        )
-
-
-class CustomLiteLAProcessor2_0:
-    """Attention processor used typically in processing the SD3-like self-attention projections. add rms norm for query and key and apply RoPE"""
-
-    def __init__(self):
-        self.kernel_func = nn.ReLU(inplace=False)
-        self.eps = 1e-15
-        self.pad_val = 1.0
-
-    def apply_rotary_emb(
-        self,
-        x: torch.Tensor,
-        freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
-        to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
-        reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
-        tensors contain rotary embeddings and are returned as real tensors.
-
-        Args:
-            x (`torch.Tensor`):
-                Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
-            freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
-
-        Returns:
-            Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
-        """
-        cos, sin = freqs_cis  # [S, D]
-        cos = cos[None, None]
-        sin = sin[None, None]
-        cos, sin = cos.to(x.device), sin.to(x.device)
-
-        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
-        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
-
-        return out
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: torch.FloatTensor = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        *args,
-        **kwargs,
-    ) -> torch.FloatTensor:
-        hidden_states_len = hidden_states.shape[1]
-
-        input_ndim = hidden_states.ndim
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-        if encoder_hidden_states is not None:
-            context_input_ndim = encoder_hidden_states.ndim
-            if context_input_ndim == 4:
-                batch_size, channel, height, width = encoder_hidden_states.shape
-                encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size = hidden_states.shape[0]
-
-        # `sample` projections.
-        dtype = hidden_states.dtype
-        query = attn.to_q(hidden_states)
-        key = attn.to_k(hidden_states)
-        value = attn.to_v(hidden_states)
-
-        # `context` projections.
-        has_encoder_hidden_state_proj = hasattr(attn, "add_q_proj") and hasattr(attn, "add_k_proj") and hasattr(attn, "add_v_proj")
-        if encoder_hidden_states is not None and has_encoder_hidden_state_proj:
-            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
-            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
-            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
-
-            # attention
-            if not attn.is_cross_attention:
-                query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
-                key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
-                value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
-            else:
-                query = hidden_states
-                key = encoder_hidden_states
-                value = encoder_hidden_states
-
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        query = query.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
-        key = key.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1).transpose(-1, -2)
-        value = value.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
-
-        # RoPE需要 [B, H, S, D] 输入
-        # 此时 query是 [B, H, D, S], 需要转成 [B, H, S, D] 才能应用RoPE
-        query = query.permute(0, 1, 3, 2)  # [B, H, S, D]  (从 [B, H, D, S])
-
-        # Apply query and key normalization if needed
-        if attn.norm_q is not None:
-            query = attn.norm_q(query)
-        if attn.norm_k is not None:
-            key = attn.norm_k(key)
-
-        # Apply RoPE if needed
-        if rotary_freqs_cis is not None:
-            query = self.apply_rotary_emb(query, rotary_freqs_cis)
-            if not attn.is_cross_attention:
-                key = self.apply_rotary_emb(key, rotary_freqs_cis)
-            elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj:
-                key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)
-
-        # 此时 query是 [B, H, S, D]，需要还原成 [B, H, D, S]
-        query = query.permute(0, 1, 3, 2)  # [B, H, D, S]
-
-        if attention_mask is not None:
-            # attention_mask: [B, S] -> [B, 1, S, 1]
-            attention_mask = attention_mask[:, None, :, None].to(key.dtype)  # [B, 1, S, 1]
-            query = query * attention_mask.permute(0, 1, 3, 2)  # [B, H, S, D] * [B, 1, S, 1]
-            if not attn.is_cross_attention:
-                key = key * attention_mask  # key: [B, h, S, D] 与 mask [B, 1, S, 1] 相乘
-                value = value * attention_mask.permute(0, 1, 3, 2)  # 如果 value 是 [B, h, D, S]，那么需调整mask以匹配S维度
-
-        if attn.is_cross_attention and encoder_attention_mask is not None and has_encoder_hidden_state_proj:
-            encoder_attention_mask = encoder_attention_mask[:, None, :, None].to(key.dtype)  # [B, 1, S_enc, 1]
-            # 此时 key: [B, h, S_enc, D], value: [B, h, D, S_enc]
-            key = key * encoder_attention_mask  # [B, h, S_enc, D] * [B, 1, S_enc, 1]
-            value = value * encoder_attention_mask.permute(0, 1, 3, 2)  # [B, h, D, S_enc] * [B, 1, 1, S_enc]
-
-        query = self.kernel_func(query)
-        key = self.kernel_func(key)
-
-        query, key, value = query.float(), key.float(), value.float()
-
-        value = F.pad(value, (0, 0, 0, 1), mode="constant", value=self.pad_val)
-
-        vk = torch.matmul(value, key)
-
-        hidden_states = torch.matmul(vk, query)
-
-        if hidden_states.dtype in [torch.float16, torch.bfloat16]:
-            hidden_states = hidden_states.float()
-
-        hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps)
-
-        hidden_states = hidden_states.view(batch_size, attn.heads * head_dim, -1).permute(0, 2, 1)
-
-        hidden_states = hidden_states.to(dtype)
-        if encoder_hidden_states is not None:
-            encoder_hidden_states = encoder_hidden_states.to(dtype)
-
-        # Split the attention outputs.
-        if encoder_hidden_states is not None and not attn.is_cross_attention and has_encoder_hidden_state_proj:
-            hidden_states, encoder_hidden_states = (
-                hidden_states[:, : hidden_states_len],
-                hidden_states[:, hidden_states_len:],
-            )
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-        if encoder_hidden_states is not None and not attn.context_pre_only and not attn.is_cross_attention and hasattr(attn, "to_add_out"):
-            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-        if encoder_hidden_states is not None and context_input_ndim == 4:
-            encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if torch.get_autocast_gpu_dtype() == torch.float16:
-            hidden_states = hidden_states.clip(-65504, 65504)
-            if encoder_hidden_states is not None:
-                encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
-
-        return hidden_states, encoder_hidden_states
-
-
-class CustomerAttnProcessor2_0:
-    r"""
-    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
-    """
-
-    def apply_rotary_emb(
-        self,
-        x: torch.Tensor,
-        freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
-        to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
-        reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
-        tensors contain rotary embeddings and are returned as real tensors.
-
-        Args:
-            x (`torch.Tensor`):
-                Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
-            freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
-
-        Returns:
-            Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
-        """
-        cos, sin = freqs_cis  # [S, D]
-        cos = cos[None, None]
-        sin = sin[None, None]
-        cos, sin = cos.to(x.device), sin.to(x.device)
-
-        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
-        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
-
-        return out
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: torch.FloatTensor = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        *args,
-        **kwargs,
-    ) -> torch.Tensor:
-
-        residual = hidden_states
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-
-        has_encoder_hidden_state_proj = hasattr(attn, "add_q_proj") and hasattr(attn, "add_k_proj") and hasattr(attn, "add_v_proj")
-
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-        if attn.norm_q is not None:
-            query = attn.norm_q(query)
-        if attn.norm_k is not None:
-            key = attn.norm_k(key)
-
-        # Apply RoPE if needed
-        if rotary_freqs_cis is not None:
-            query = self.apply_rotary_emb(query, rotary_freqs_cis)
-            if not attn.is_cross_attention:
-                key = self.apply_rotary_emb(key, rotary_freqs_cis)
-            elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj:
-                key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)
-
-        if attn.is_cross_attention and encoder_attention_mask is not None and has_encoder_hidden_state_proj:
-            # attention_mask: N x S1
-            # encoder_attention_mask: N x S2
-            # cross attention 整合attention_mask和encoder_attention_mask
-            combined_mask = attention_mask[:, :, None] * encoder_attention_mask[:, None, :]
-            attention_mask = torch.where(combined_mask == 1, 0.0, -torch.inf)
-            attention_mask = attention_mask[:, None, :, :].expand(-1, attn.heads, -1, -1).to(query.dtype)
-
-        elif not attn.is_cross_attention and attention_mask is not None:
-            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            # scaled_dot_product_attention expects attention_mask shape to be
-            # (batch, heads, source_length, target_length)
-            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
-
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        hidden_states = optimized_attention(
-            query, key, value, heads=query.shape[1], mask=attention_mask, skip_reshape=True,
-        ).to(query.dtype)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-def val2list(x: list or tuple or any, repeat_time=1) -> list:  # type: ignore
-    """Repeat `val` for `repeat_time` times and return the list or val if list/tuple."""
-    if isinstance(x, (list, tuple)):
-        return list(x)
-    return [x for _ in range(repeat_time)]
-
-
-def val2tuple(x: list or tuple or any, min_len: int = 1, idx_repeat: int = -1) -> tuple:  # type: ignore
-    """Return tuple with min_len by repeating element at idx_repeat."""
-    # convert to list first
-    x = val2list(x)
-
-    # repeat elements if necessary
-    if len(x) > 0:
-        x[idx_repeat:idx_repeat] = [x[idx_repeat] for _ in range(min_len - len(x))]
-
-    return tuple(x)
-
-
-def t2i_modulate(x, shift, scale):
-    return x * (1 + scale) + shift
-
-
-def get_same_padding(kernel_size: Union[int, Tuple[int, ...]]) -> Union[int, Tuple[int, ...]]:
-    if isinstance(kernel_size, tuple):
-        return tuple([get_same_padding(ks) for ks in kernel_size])
-    else:
-        assert kernel_size % 2 > 0, f"kernel size {kernel_size} should be odd number"
-        return kernel_size // 2
-
-class ConvLayer(nn.Module):
-    def __init__(
-        self,
-        in_dim: int,
-        out_dim: int,
-        kernel_size=3,
-        stride=1,
-        dilation=1,
-        groups=1,
-        padding: Union[int, None] = None,
-        use_bias=False,
-        norm=None,
-        act=None,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        if padding is None:
-            padding = get_same_padding(kernel_size)
-            padding *= dilation
-
-        self.in_dim = in_dim
-        self.out_dim = out_dim
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.dilation = dilation
-        self.groups = groups
-        self.padding = padding
-        self.use_bias = use_bias
-
-        self.conv = operations.Conv1d(
-            in_dim,
-            out_dim,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            dilation=dilation,
-            groups=groups,
-            bias=use_bias,
-            device=device,
-            dtype=dtype
-        )
-        if norm is not None:
-            self.norm = operations.RMSNorm(out_dim, elementwise_affine=False, dtype=dtype, device=device)
-        else:
-            self.norm = None
-        if act is not None:
-            self.act = nn.SiLU(inplace=True)
-        else:
-            self.act = None
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.conv(x)
-        if self.norm:
-            x = self.norm(x)
-        if self.act:
-            x = self.act(x)
-        return x
-
-
-class GLUMBConv(nn.Module):
-    def __init__(
-        self,
-        in_features: int,
-        hidden_features: int,
-        out_feature=None,
-        kernel_size=3,
-        stride=1,
-        padding: Union[int, None] = None,
-        use_bias=False,
-        norm=(None, None, None),
-        act=("silu", "silu", None),
-        dilation=1,
-        dtype=None, device=None, operations=None
-    ):
-        out_feature = out_feature or in_features
-        super().__init__()
-        use_bias = val2tuple(use_bias, 3)
-        norm = val2tuple(norm, 3)
-        act = val2tuple(act, 3)
-
-        self.glu_act = nn.SiLU(inplace=False)
-        self.inverted_conv = ConvLayer(
-            in_features,
-            hidden_features * 2,
-            1,
-            use_bias=use_bias[0],
-            norm=norm[0],
-            act=act[0],
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-        self.depth_conv = ConvLayer(
-            hidden_features * 2,
-            hidden_features * 2,
-            kernel_size,
-            stride=stride,
-            groups=hidden_features * 2,
-            padding=padding,
-            use_bias=use_bias[1],
-            norm=norm[1],
-            act=None,
-            dilation=dilation,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-        self.point_conv = ConvLayer(
-            hidden_features,
-            out_feature,
-            1,
-            use_bias=use_bias[2],
-            norm=norm[2],
-            act=act[2],
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = x.transpose(1, 2)
-        x = self.inverted_conv(x)
-        x = self.depth_conv(x)
-
-        x, gate = torch.chunk(x, 2, dim=1)
-        gate = self.glu_act(gate)
-        x = x * gate
-
-        x = self.point_conv(x)
-        x = x.transpose(1, 2)
-
-        return x
-
-
-class LinearTransformerBlock(nn.Module):
-    """
-    A Sana block with global shared adaptive layer norm (adaLN-single) conditioning.
-    """
-    def __init__(
-        self,
-        dim,
-        num_attention_heads,
-        attention_head_dim,
-        use_adaln_single=True,
-        cross_attention_dim=None,
-        added_kv_proj_dim=None,
-        context_pre_only=False,
-        mlp_ratio=4.0,
-        add_cross_attention=False,
-        add_cross_attention_dim=None,
-        qk_norm=None,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-
-        self.norm1 = operations.RMSNorm(dim, elementwise_affine=False, eps=1e-6)
-        self.attn = Attention(
-            query_dim=dim,
-            cross_attention_dim=cross_attention_dim,
-            added_kv_proj_dim=added_kv_proj_dim,
-            dim_head=attention_head_dim,
-            heads=num_attention_heads,
-            out_dim=dim,
-            bias=True,
-            qk_norm=qk_norm,
-            processor=CustomLiteLAProcessor2_0(),
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-        self.add_cross_attention = add_cross_attention
-        self.context_pre_only = context_pre_only
-
-        if add_cross_attention and add_cross_attention_dim is not None:
-            self.cross_attn = Attention(
-                query_dim=dim,
-                cross_attention_dim=add_cross_attention_dim,
-                added_kv_proj_dim=add_cross_attention_dim,
-                dim_head=attention_head_dim,
-                heads=num_attention_heads,
-                out_dim=dim,
-                context_pre_only=context_pre_only,
-                bias=True,
-                qk_norm=qk_norm,
-                processor=CustomerAttnProcessor2_0(),
-                dtype=dtype,
-                device=device,
-                operations=operations,
-            )
-
-        self.norm2 = operations.RMSNorm(dim, 1e-06, elementwise_affine=False)
-
-        self.ff = GLUMBConv(
-            in_features=dim,
-            hidden_features=int(dim * mlp_ratio),
-            use_bias=(True, True, False),
-            norm=(None, None, None),
-            act=("silu", "silu", None),
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-        self.use_adaln_single = use_adaln_single
-        if use_adaln_single:
-            self.scale_shift_table = nn.Parameter(torch.empty(6, dim, dtype=dtype, device=device))
-
-    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: torch.FloatTensor = None,
-        attention_mask: torch.FloatTensor = None,
-        encoder_attention_mask: torch.FloatTensor = None,
-        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        temb: torch.FloatTensor = None,
-    ):
-
-        N = hidden_states.shape[0]
-
-        # step 1: AdaLN single
-        if self.use_adaln_single:
-            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
-                comfy.model_management.cast_to(self.scale_shift_table[None], dtype=temb.dtype, device=temb.device) + temb.reshape(N, 6, -1)
-            ).chunk(6, dim=1)
-
-        norm_hidden_states = self.norm1(hidden_states)
-        if self.use_adaln_single:
-            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
-
-        # step 2: attention
-        if not self.add_cross_attention:
-            attn_output, encoder_hidden_states = self.attn(
-                hidden_states=norm_hidden_states,
-                attention_mask=attention_mask,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
-                rotary_freqs_cis=rotary_freqs_cis,
-                rotary_freqs_cis_cross=rotary_freqs_cis_cross,
-            )
-        else:
-            attn_output, _ = self.attn(
-                hidden_states=norm_hidden_states,
-                attention_mask=attention_mask,
-                encoder_hidden_states=None,
-                encoder_attention_mask=None,
-                rotary_freqs_cis=rotary_freqs_cis,
-                rotary_freqs_cis_cross=None,
-            )
-
-        if self.use_adaln_single:
-            attn_output = gate_msa * attn_output
-        hidden_states = attn_output + hidden_states
-
-        if self.add_cross_attention:
-            attn_output = self.cross_attn(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
-                rotary_freqs_cis=rotary_freqs_cis,
-                rotary_freqs_cis_cross=rotary_freqs_cis_cross,
-            )
-            hidden_states = attn_output + hidden_states
-
-        # step 3: add norm
-        norm_hidden_states = self.norm2(hidden_states)
-        if self.use_adaln_single:
-            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
-
-        # step 4: feed forward
-        ff_output = self.ff(norm_hidden_states)
-        if self.use_adaln_single:
-            ff_output = gate_mlp * ff_output
-
-        hidden_states = hidden_states + ff_output
-
-        return hidden_states
--- a/comfy/ldm/ace/lyric_encoder.py
+++ b/comfy/ldm/ace/lyric_encoder.py
--- a/comfy/ldm/ace/model.py
+++ b/comfy/ldm/ace/model.py
@@ -1,407 +0,0 @@
-# Original from: https://github.com/ace-step/ACE-Step/blob/main/models/ace_step_transformer.py
-
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Optional, List, Union
-
-import torch
-from torch import nn
-
-import comfy.model_management
-import comfy.patcher_extension
-
-from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
-from .attention import LinearTransformerBlock, t2i_modulate
-from .lyric_encoder import ConformerEncoder as LyricEncoder
-
-
-def cross_norm(hidden_states, controlnet_input):
-    # input N x T x c
-    mean_hidden_states, std_hidden_states = hidden_states.mean(dim=(1,2), keepdim=True), hidden_states.std(dim=(1,2), keepdim=True)
-    mean_controlnet_input, std_controlnet_input = controlnet_input.mean(dim=(1,2), keepdim=True), controlnet_input.std(dim=(1,2), keepdim=True)
-    controlnet_input = (controlnet_input - mean_controlnet_input) * (std_hidden_states / (std_controlnet_input + 1e-12)) + mean_hidden_states
-    return controlnet_input
-
-
-# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2
-class Qwen2RotaryEmbedding(nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, dtype=None, device=None):
-        super().__init__()
-
-        self.dim = dim
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=device).float() / self.dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-        # Build here to make `torch.jit.trace` work.
-        self._set_cos_sin_cache(
-            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.float32
-        )
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
-
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-
-    def forward(self, x, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        if seq_len > self.max_seq_len_cached:
-            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
-
-        return (
-            self.cos_cached[:seq_len].to(dtype=x.dtype),
-            self.sin_cached[:seq_len].to(dtype=x.dtype),
-        )
-
-
-class T2IFinalLayer(nn.Module):
-    """
-    The final layer of Sana.
-    """
-
-    def __init__(self, hidden_size, patch_size=[16, 1], out_channels=256, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.norm_final = operations.RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.linear = operations.Linear(hidden_size, patch_size[0] * patch_size[1] * out_channels, bias=True, dtype=dtype, device=device)
-        self.scale_shift_table = nn.Parameter(torch.empty(2, hidden_size, dtype=dtype, device=device))
-        self.out_channels = out_channels
-        self.patch_size = patch_size
-
-    def unpatchfy(
-        self,
-        hidden_states: torch.Tensor,
-        width: int,
-    ):
-        # 4 unpatchify
-        new_height, new_width = 1, hidden_states.size(1)
-        hidden_states = hidden_states.reshape(
-            shape=(hidden_states.shape[0], new_height, new_width, self.patch_size[0], self.patch_size[1], self.out_channels)
-        ).contiguous()
-        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
-        output = hidden_states.reshape(
-            shape=(hidden_states.shape[0], self.out_channels, new_height * self.patch_size[0], new_width * self.patch_size[1])
-        ).contiguous()
-        if width > new_width:
-            output = torch.nn.functional.pad(output, (0, width - new_width, 0, 0), 'constant', 0)
-        elif width < new_width:
-            output = output[:, :, :, :width]
-        return output
-
-    def forward(self, x, t, output_length):
-        shift, scale = (comfy.model_management.cast_to(self.scale_shift_table[None], device=t.device, dtype=t.dtype) + t[:, None]).chunk(2, dim=1)
-        x = t2i_modulate(self.norm_final(x), shift, scale)
-        x = self.linear(x)
-        # unpatchify
-        output = self.unpatchfy(x, output_length)
-        return output
-
-
-class PatchEmbed(nn.Module):
-    """2D Image to Patch Embedding"""
-
-    def __init__(
-        self,
-        height=16,
-        width=4096,
-        patch_size=(16, 1),
-        in_channels=8,
-        embed_dim=1152,
-        bias=True,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        patch_size_h, patch_size_w = patch_size
-        self.early_conv_layers = nn.Sequential(
-            operations.Conv2d(in_channels, in_channels*256, kernel_size=patch_size, stride=patch_size, padding=0, bias=bias, dtype=dtype, device=device),
-            operations.GroupNorm(num_groups=32, num_channels=in_channels*256, eps=1e-6, affine=True, dtype=dtype, device=device),
-            operations.Conv2d(in_channels*256, embed_dim, kernel_size=1, stride=1, padding=0, bias=bias, dtype=dtype, device=device)
-        )
-        self.patch_size = patch_size
-        self.height, self.width = height // patch_size_h, width // patch_size_w
-        self.base_size = self.width
-
-    def forward(self, latent):
-        # early convolutions, N x C x H x W -> N x 256 * sqrt(patch_size) x H/patch_size x W/patch_size
-        latent = self.early_conv_layers(latent)
-        latent = latent.flatten(2).transpose(1, 2)  # BCHW -> BNC
-        return latent
-
-
-class ACEStepTransformer2DModel(nn.Module):
-    # _supports_gradient_checkpointing = True
-
-    def __init__(
-        self,
-        in_channels: Optional[int] = 8,
-        num_layers: int = 28,
-        inner_dim: int = 1536,
-        attention_head_dim: int = 64,
-        num_attention_heads: int = 24,
-        mlp_ratio: float = 4.0,
-        out_channels: int = 8,
-        max_position: int = 32768,
-        rope_theta: float = 1000000.0,
-        speaker_embedding_dim: int = 512,
-        text_embedding_dim: int = 768,
-        ssl_encoder_depths: List[int] = [9, 9],
-        ssl_names: List[str] = ["mert", "m-hubert"],
-        ssl_latent_dims: List[int] = [1024, 768],
-        lyric_encoder_vocab_size: int = 6681,
-        lyric_hidden_size: int = 1024,
-        patch_size: List[int] = [16, 1],
-        max_height: int = 16,
-        max_width: int = 4096,
-        audio_model=None,
-        dtype=None, device=None, operations=None
-
-    ):
-        super().__init__()
-
-        self.dtype = dtype
-        self.num_attention_heads = num_attention_heads
-        self.attention_head_dim = attention_head_dim
-        inner_dim = num_attention_heads * attention_head_dim
-        self.inner_dim = inner_dim
-        self.out_channels = out_channels
-        self.max_position = max_position
-        self.patch_size = patch_size
-
-        self.rope_theta = rope_theta
-
-        self.rotary_emb = Qwen2RotaryEmbedding(
-            dim=self.attention_head_dim,
-            max_position_embeddings=self.max_position,
-            base=self.rope_theta,
-            dtype=dtype,
-            device=device,
-        )
-
-        # 2. Define input layers
-        self.in_channels = in_channels
-
-        self.num_layers = num_layers
-        # 3. Define transformers blocks
-        self.transformer_blocks = nn.ModuleList(
-            [
-                LinearTransformerBlock(
-                    dim=self.inner_dim,
-                    num_attention_heads=self.num_attention_heads,
-                    attention_head_dim=attention_head_dim,
-                    mlp_ratio=mlp_ratio,
-                    add_cross_attention=True,
-                    add_cross_attention_dim=self.inner_dim,
-                    dtype=dtype,
-                    device=device,
-                    operations=operations,
-                )
-                for i in range(self.num_layers)
-            ]
-        )
-
-        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=self.inner_dim, dtype=dtype, device=device, operations=operations)
-        self.t_block = nn.Sequential(nn.SiLU(), operations.Linear(self.inner_dim, 6 * self.inner_dim, bias=True, dtype=dtype, device=device))
-
-        # speaker
-        self.speaker_embedder = operations.Linear(speaker_embedding_dim, self.inner_dim, dtype=dtype, device=device)
-
-        # genre
-        self.genre_embedder = operations.Linear(text_embedding_dim, self.inner_dim, dtype=dtype, device=device)
-
-        # lyric
-        self.lyric_embs = operations.Embedding(lyric_encoder_vocab_size, lyric_hidden_size, dtype=dtype, device=device)
-        self.lyric_encoder = LyricEncoder(input_size=lyric_hidden_size, static_chunk_size=0, dtype=dtype, device=device, operations=operations)
-        self.lyric_proj = operations.Linear(lyric_hidden_size, self.inner_dim, dtype=dtype, device=device)
-
-        projector_dim = 2 * self.inner_dim
-
-        self.projectors = nn.ModuleList([
-            nn.Sequential(
-                operations.Linear(self.inner_dim, projector_dim, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Linear(projector_dim, projector_dim, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Linear(projector_dim, ssl_dim, dtype=dtype, device=device),
-            ) for ssl_dim in ssl_latent_dims
-        ])
-
-        self.proj_in = PatchEmbed(
-            height=max_height,
-            width=max_width,
-            patch_size=patch_size,
-            embed_dim=self.inner_dim,
-            bias=True,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-        self.final_layer = T2IFinalLayer(self.inner_dim, patch_size=patch_size, out_channels=out_channels, dtype=dtype, device=device, operations=operations)
-
-    def forward_lyric_encoder(
-        self,
-        lyric_token_idx: Optional[torch.LongTensor] = None,
-        lyric_mask: Optional[torch.LongTensor] = None,
-        out_dtype=None,
-    ):
-        # N x T x D
-        lyric_embs = self.lyric_embs(lyric_token_idx, out_dtype=out_dtype)
-        prompt_prenet_out, _mask = self.lyric_encoder(lyric_embs, lyric_mask, decoding_chunk_size=1, num_decoding_left_chunks=-1)
-        prompt_prenet_out = self.lyric_proj(prompt_prenet_out)
-        return prompt_prenet_out
-
-    def encode(
-        self,
-        encoder_text_hidden_states: Optional[torch.Tensor] = None,
-        text_attention_mask: Optional[torch.LongTensor] = None,
-        speaker_embeds: Optional[torch.FloatTensor] = None,
-        lyric_token_idx: Optional[torch.LongTensor] = None,
-        lyric_mask: Optional[torch.LongTensor] = None,
-        lyrics_strength=1.0,
-    ):
-
-        bs = encoder_text_hidden_states.shape[0]
-        device = encoder_text_hidden_states.device
-
-        # speaker embedding
-        encoder_spk_hidden_states = self.speaker_embedder(speaker_embeds).unsqueeze(1)
-
-        # genre embedding
-        encoder_text_hidden_states = self.genre_embedder(encoder_text_hidden_states)
-
-        # lyric
-        encoder_lyric_hidden_states = self.forward_lyric_encoder(
-            lyric_token_idx=lyric_token_idx,
-            lyric_mask=lyric_mask,
-            out_dtype=encoder_text_hidden_states.dtype,
-        )
-
-        encoder_lyric_hidden_states *= lyrics_strength
-
-        encoder_hidden_states = torch.cat([encoder_spk_hidden_states, encoder_text_hidden_states, encoder_lyric_hidden_states], dim=1)
-
-        encoder_hidden_mask = None
-        if text_attention_mask is not None:
-            speaker_mask = torch.ones(bs, 1, device=device)
-            encoder_hidden_mask = torch.cat([speaker_mask, text_attention_mask, lyric_mask], dim=1)
-
-        return encoder_hidden_states, encoder_hidden_mask
-
-    def decode(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-        encoder_hidden_mask: torch.Tensor,
-        timestep: Optional[torch.Tensor],
-        output_length: int = 0,
-        block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
-        controlnet_scale: Union[float, torch.Tensor] = 1.0,
-    ):
-        embedded_timestep = self.timestep_embedder(self.time_proj(timestep).to(dtype=hidden_states.dtype))
-        temb = self.t_block(embedded_timestep)
-
-        hidden_states = self.proj_in(hidden_states)
-
-        # controlnet logic
-        if block_controlnet_hidden_states is not None:
-            control_condi = cross_norm(hidden_states, block_controlnet_hidden_states)
-            hidden_states = hidden_states + control_condi * controlnet_scale
-
-        # inner_hidden_states = []
-
-        rotary_freqs_cis = self.rotary_emb(hidden_states, seq_len=hidden_states.shape[1])
-        encoder_rotary_freqs_cis = self.rotary_emb(encoder_hidden_states, seq_len=encoder_hidden_states.shape[1])
-
-        for index_block, block in enumerate(self.transformer_blocks):
-            hidden_states = block(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_hidden_mask,
-                rotary_freqs_cis=rotary_freqs_cis,
-                rotary_freqs_cis_cross=encoder_rotary_freqs_cis,
-                temb=temb,
-            )
-
-        output = self.final_layer(hidden_states, embedded_timestep, output_length)
-        return output
-
-    def forward(self,
-        x,
-        timestep,
-        attention_mask=None,
-        context: Optional[torch.Tensor] = None,
-        text_attention_mask: Optional[torch.LongTensor] = None,
-        speaker_embeds: Optional[torch.FloatTensor] = None,
-        lyric_token_idx: Optional[torch.LongTensor] = None,
-        lyric_mask: Optional[torch.LongTensor] = None,
-        block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
-        controlnet_scale: Union[float, torch.Tensor] = 1.0,
-        lyrics_strength=1.0,
-        **kwargs
-    ):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, kwargs.get("transformer_options", {}))
-        ).execute(x, timestep, attention_mask, context, text_attention_mask, speaker_embeds, lyric_token_idx, lyric_mask, block_controlnet_hidden_states,
-                  controlnet_scale, lyrics_strength, **kwargs)
-
-    def _forward(
-        self,
-        x,
-        timestep,
-        attention_mask=None,
-        context: Optional[torch.Tensor] = None,
-        text_attention_mask: Optional[torch.LongTensor] = None,
-        speaker_embeds: Optional[torch.FloatTensor] = None,
-        lyric_token_idx: Optional[torch.LongTensor] = None,
-        lyric_mask: Optional[torch.LongTensor] = None,
-        block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
-        controlnet_scale: Union[float, torch.Tensor] = 1.0,
-        lyrics_strength=1.0,
-        **kwargs
-    ):
-        hidden_states = x
-        encoder_text_hidden_states = context
-        encoder_hidden_states, encoder_hidden_mask = self.encode(
-            encoder_text_hidden_states=encoder_text_hidden_states,
-            text_attention_mask=text_attention_mask,
-            speaker_embeds=speaker_embeds,
-            lyric_token_idx=lyric_token_idx,
-            lyric_mask=lyric_mask,
-            lyrics_strength=lyrics_strength,
-        )
-
-        output_length = hidden_states.shape[-1]
-
-        output = self.decode(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_hidden_mask=encoder_hidden_mask,
-            timestep=timestep,
-            output_length=output_length,
-            block_controlnet_hidden_states=block_controlnet_hidden_states,
-            controlnet_scale=controlnet_scale,
-        )
-
-        return output
--- a/comfy/ldm/ace/vae/autoencoder_dc.py
+++ b/comfy/ldm/ace/vae/autoencoder_dc.py
@@ -1,644 +0,0 @@
-# Rewritten from diffusers
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from typing import Tuple, Union
-
-import comfy.model_management
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-
-class RMSNorm(ops.RMSNorm):
-    def __init__(self, dim, eps=1e-5, elementwise_affine=True, bias=False):
-        super().__init__(dim, eps=eps, elementwise_affine=elementwise_affine)
-        if elementwise_affine:
-            self.bias = nn.Parameter(torch.empty(dim)) if bias else None
-
-    def forward(self, x):
-        x = super().forward(x)
-        if self.elementwise_affine:
-            if self.bias is not None:
-                x = x + comfy.model_management.cast_to(self.bias, dtype=x.dtype, device=x.device)
-        return x
-
-
-def get_normalization(norm_type, num_features, num_groups=32, eps=1e-5):
-    if norm_type == "batch_norm":
-        return nn.BatchNorm2d(num_features)
-    elif norm_type == "group_norm":
-        return ops.GroupNorm(num_groups, num_features)
-    elif norm_type == "layer_norm":
-        return ops.LayerNorm(num_features)
-    elif norm_type == "rms_norm":
-        return RMSNorm(num_features, eps=eps, elementwise_affine=True, bias=True)
-    else:
-        raise ValueError(f"Unknown normalization type: {norm_type}")
-
-
-def get_activation(activation_type):
-    if activation_type == "relu":
-        return nn.ReLU()
-    elif activation_type == "relu6":
-        return nn.ReLU6()
-    elif activation_type == "silu":
-        return nn.SiLU()
-    elif activation_type == "leaky_relu":
-        return nn.LeakyReLU(0.2)
-    else:
-        raise ValueError(f"Unknown activation type: {activation_type}")
-
-
-class ResBlock(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        norm_type: str = "batch_norm",
-        act_fn: str = "relu6",
-    ) -> None:
-        super().__init__()
-
-        self.norm_type = norm_type
-        self.nonlinearity = get_activation(act_fn) if act_fn is not None else nn.Identity()
-        self.conv1 = ops.Conv2d(in_channels, in_channels, 3, 1, 1)
-        self.conv2 = ops.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False)
-        self.norm = get_normalization(norm_type, out_channels)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        residual = hidden_states
-        hidden_states = self.conv1(hidden_states)
-        hidden_states = self.nonlinearity(hidden_states)
-        hidden_states = self.conv2(hidden_states)
-
-        if self.norm_type == "rms_norm":
-            # move channel to the last dimension so we apply RMSnorm across channel dimension
-            hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
-        else:
-            hidden_states = self.norm(hidden_states)
-
-        return hidden_states + residual
-
-class SanaMultiscaleAttentionProjection(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        num_attention_heads: int,
-        kernel_size: int,
-    ) -> None:
-        super().__init__()
-
-        channels = 3 * in_channels
-        self.proj_in = ops.Conv2d(
-            channels,
-            channels,
-            kernel_size,
-            padding=kernel_size // 2,
-            groups=channels,
-            bias=False,
-        )
-        self.proj_out = ops.Conv2d(channels, channels, 1, 1, 0, groups=3 * num_attention_heads, bias=False)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.proj_in(hidden_states)
-        hidden_states = self.proj_out(hidden_states)
-        return hidden_states
-
-class SanaMultiscaleLinearAttention(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        num_attention_heads: int = None,
-        attention_head_dim: int = 8,
-        mult: float = 1.0,
-        norm_type: str = "batch_norm",
-        kernel_sizes: tuple = (5,),
-        eps: float = 1e-15,
-        residual_connection: bool = False,
-    ):
-        super().__init__()
-
-        self.eps = eps
-        self.attention_head_dim = attention_head_dim
-        self.norm_type = norm_type
-        self.residual_connection = residual_connection
-
-        num_attention_heads = (
-            int(in_channels // attention_head_dim * mult)
-            if num_attention_heads is None
-            else num_attention_heads
-        )
-        inner_dim = num_attention_heads * attention_head_dim
-
-        self.to_q = ops.Linear(in_channels, inner_dim, bias=False)
-        self.to_k = ops.Linear(in_channels, inner_dim, bias=False)
-        self.to_v = ops.Linear(in_channels, inner_dim, bias=False)
-
-        self.to_qkv_multiscale = nn.ModuleList()
-        for kernel_size in kernel_sizes:
-            self.to_qkv_multiscale.append(
-                SanaMultiscaleAttentionProjection(inner_dim, num_attention_heads, kernel_size)
-            )
-
-        self.nonlinearity = nn.ReLU()
-        self.to_out = ops.Linear(inner_dim * (1 + len(kernel_sizes)), out_channels, bias=False)
-        self.norm_out = get_normalization(norm_type, out_channels)
-
-    def apply_linear_attention(self, query, key, value):
-        value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1)
-        scores = torch.matmul(value, key.transpose(-1, -2))
-        hidden_states = torch.matmul(scores, query)
-
-        hidden_states = hidden_states.to(dtype=torch.float32)
-        hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps)
-        return hidden_states
-
-    def apply_quadratic_attention(self, query, key, value):
-        scores = torch.matmul(key.transpose(-1, -2), query)
-        scores = scores.to(dtype=torch.float32)
-        scores = scores / (torch.sum(scores, dim=2, keepdim=True) + self.eps)
-        hidden_states = torch.matmul(value, scores.to(value.dtype))
-        return hidden_states
-
-    def forward(self, hidden_states):
-        height, width = hidden_states.shape[-2:]
-        if height * width > self.attention_head_dim:
-            use_linear_attention = True
-        else:
-            use_linear_attention = False
-
-        residual = hidden_states
-
-        batch_size, _, height, width = list(hidden_states.size())
-        original_dtype = hidden_states.dtype
-
-        hidden_states = hidden_states.movedim(1, -1)
-        query = self.to_q(hidden_states)
-        key = self.to_k(hidden_states)
-        value = self.to_v(hidden_states)
-        hidden_states = torch.cat([query, key, value], dim=3)
-        hidden_states = hidden_states.movedim(-1, 1)
-
-        multi_scale_qkv = [hidden_states]
-        for block in self.to_qkv_multiscale:
-            multi_scale_qkv.append(block(hidden_states))
-
-        hidden_states = torch.cat(multi_scale_qkv, dim=1)
-
-        if use_linear_attention:
-            # for linear attention upcast hidden_states to float32
-            hidden_states = hidden_states.to(dtype=torch.float32)
-
-        hidden_states = hidden_states.reshape(batch_size, -1, 3 * self.attention_head_dim, height * width)
-
-        query, key, value = hidden_states.chunk(3, dim=2)
-        query = self.nonlinearity(query)
-        key = self.nonlinearity(key)
-
-        if use_linear_attention:
-            hidden_states = self.apply_linear_attention(query, key, value)
-            hidden_states = hidden_states.to(dtype=original_dtype)
-        else:
-            hidden_states = self.apply_quadratic_attention(query, key, value)
-
-        hidden_states = torch.reshape(hidden_states, (batch_size, -1, height, width))
-        hidden_states = self.to_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
-
-        if self.norm_type == "rms_norm":
-            hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
-        else:
-            hidden_states = self.norm_out(hidden_states)
-
-        if self.residual_connection:
-            hidden_states = hidden_states + residual
-
-        return hidden_states
-
-
-class EfficientViTBlock(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        mult: float = 1.0,
-        attention_head_dim: int = 32,
-        qkv_multiscales: tuple = (5,),
-        norm_type: str = "batch_norm",
-    ) -> None:
-        super().__init__()
-
-        self.attn = SanaMultiscaleLinearAttention(
-            in_channels=in_channels,
-            out_channels=in_channels,
-            mult=mult,
-            attention_head_dim=attention_head_dim,
-            norm_type=norm_type,
-            kernel_sizes=qkv_multiscales,
-            residual_connection=True,
-        )
-
-        self.conv_out = GLUMBConv(
-            in_channels=in_channels,
-            out_channels=in_channels,
-            norm_type="rms_norm",
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.attn(x)
-        x = self.conv_out(x)
-        return x
-
-
-class GLUMBConv(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        expand_ratio: float = 4,
-        norm_type: str = None,
-        residual_connection: bool = True,
-    ) -> None:
-        super().__init__()
-
-        hidden_channels = int(expand_ratio * in_channels)
-        self.norm_type = norm_type
-        self.residual_connection = residual_connection
-
-        self.nonlinearity = nn.SiLU()
-        self.conv_inverted = ops.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0)
-        self.conv_depth = ops.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2)
-        self.conv_point = ops.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False)
-
-        self.norm = None
-        if norm_type == "rms_norm":
-            self.norm = RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if self.residual_connection:
-            residual = hidden_states
-
-        hidden_states = self.conv_inverted(hidden_states)
-        hidden_states = self.nonlinearity(hidden_states)
-
-        hidden_states = self.conv_depth(hidden_states)
-        hidden_states, gate = torch.chunk(hidden_states, 2, dim=1)
-        hidden_states = hidden_states * self.nonlinearity(gate)
-
-        hidden_states = self.conv_point(hidden_states)
-
-        if self.norm_type == "rms_norm":
-            # move channel to the last dimension so we apply RMSnorm across channel dimension
-            hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
-
-        if self.residual_connection:
-            hidden_states = hidden_states + residual
-
-        return hidden_states
-
-
-def get_block(
-    block_type: str,
-    in_channels: int,
-    out_channels: int,
-    attention_head_dim: int,
-    norm_type: str,
-    act_fn: str,
-    qkv_mutliscales: tuple = (),
-):
-    if block_type == "ResBlock":
-        block = ResBlock(in_channels, out_channels, norm_type, act_fn)
-    elif block_type == "EfficientViTBlock":
-        block = EfficientViTBlock(
-            in_channels,
-            attention_head_dim=attention_head_dim,
-            norm_type=norm_type,
-            qkv_multiscales=qkv_mutliscales
-        )
-    else:
-        raise ValueError(f"Block with {block_type=} is not supported.")
-
-    return block
-
-
-class DCDownBlock2d(nn.Module):
-    def __init__(self, in_channels: int, out_channels: int, downsample: bool = False, shortcut: bool = True) -> None:
-        super().__init__()
-
-        self.downsample = downsample
-        self.factor = 2
-        self.stride = 1 if downsample else 2
-        self.group_size = in_channels * self.factor**2 // out_channels
-        self.shortcut = shortcut
-
-        out_ratio = self.factor**2
-        if downsample:
-            assert out_channels % out_ratio == 0
-            out_channels = out_channels // out_ratio
-
-        self.conv = ops.Conv2d(
-            in_channels,
-            out_channels,
-            kernel_size=3,
-            stride=self.stride,
-            padding=1,
-        )
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        x = self.conv(hidden_states)
-        if self.downsample:
-            x = F.pixel_unshuffle(x, self.factor)
-
-        if self.shortcut:
-            y = F.pixel_unshuffle(hidden_states, self.factor)
-            y = y.unflatten(1, (-1, self.group_size))
-            y = y.mean(dim=2)
-            hidden_states = x + y
-        else:
-            hidden_states = x
-
-        return hidden_states
-
-
-class DCUpBlock2d(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        interpolate: bool = False,
-        shortcut: bool = True,
-        interpolation_mode: str = "nearest",
-    ) -> None:
-        super().__init__()
-
-        self.interpolate = interpolate
-        self.interpolation_mode = interpolation_mode
-        self.shortcut = shortcut
-        self.factor = 2
-        self.repeats = out_channels * self.factor**2 // in_channels
-
-        out_ratio = self.factor**2
-        if not interpolate:
-            out_channels = out_channels * out_ratio
-
-        self.conv = ops.Conv2d(in_channels, out_channels, 3, 1, 1)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if self.interpolate:
-            x = F.interpolate(hidden_states, scale_factor=self.factor, mode=self.interpolation_mode)
-            x = self.conv(x)
-        else:
-            x = self.conv(hidden_states)
-            x = F.pixel_shuffle(x, self.factor)
-
-        if self.shortcut:
-            y = hidden_states.repeat_interleave(self.repeats, dim=1, output_size=hidden_states.shape[1] * self.repeats)
-            y = F.pixel_shuffle(y, self.factor)
-            hidden_states = x + y
-        else:
-            hidden_states = x
-
-        return hidden_states
-
-
-class Encoder(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        latent_channels: int,
-        attention_head_dim: int = 32,
-        block_type: str or tuple = "ResBlock",
-        block_out_channels: tuple = (128, 256, 512, 512, 1024, 1024),
-        layers_per_block: tuple = (2, 2, 2, 2, 2, 2),
-        qkv_multiscales: tuple = ((), (), (), (5,), (5,), (5,)),
-        downsample_block_type: str = "pixel_unshuffle",
-        out_shortcut: bool = True,
-    ):
-        super().__init__()
-
-        num_blocks = len(block_out_channels)
-
-        if isinstance(block_type, str):
-            block_type = (block_type,) * num_blocks
-
-        if layers_per_block[0] > 0:
-            self.conv_in = ops.Conv2d(
-                in_channels,
-                block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1],
-                kernel_size=3,
-                stride=1,
-                padding=1,
-            )
-        else:
-            self.conv_in = DCDownBlock2d(
-                in_channels=in_channels,
-                out_channels=block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1],
-                downsample=downsample_block_type == "pixel_unshuffle",
-                shortcut=False,
-            )
-
-        down_blocks = []
-        for i, (out_channel, num_layers) in enumerate(zip(block_out_channels, layers_per_block)):
-            down_block_list = []
-
-            for _ in range(num_layers):
-                block = get_block(
-                    block_type[i],
-                    out_channel,
-                    out_channel,
-                    attention_head_dim=attention_head_dim,
-                    norm_type="rms_norm",
-                    act_fn="silu",
-                    qkv_mutliscales=qkv_multiscales[i],
-                )
-                down_block_list.append(block)
-
-            if i < num_blocks - 1 and num_layers > 0:
-                downsample_block = DCDownBlock2d(
-                    in_channels=out_channel,
-                    out_channels=block_out_channels[i + 1],
-                    downsample=downsample_block_type == "pixel_unshuffle",
-                    shortcut=True,
-                )
-                down_block_list.append(downsample_block)
-
-            down_blocks.append(nn.Sequential(*down_block_list))
-
-        self.down_blocks = nn.ModuleList(down_blocks)
-
-        self.conv_out = ops.Conv2d(block_out_channels[-1], latent_channels, 3, 1, 1)
-
-        self.out_shortcut = out_shortcut
-        if out_shortcut:
-            self.out_shortcut_average_group_size = block_out_channels[-1] // latent_channels
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.conv_in(hidden_states)
-        for down_block in self.down_blocks:
-            hidden_states = down_block(hidden_states)
-
-        if self.out_shortcut:
-            x = hidden_states.unflatten(1, (-1, self.out_shortcut_average_group_size))
-            x = x.mean(dim=2)
-            hidden_states = self.conv_out(hidden_states) + x
-        else:
-            hidden_states = self.conv_out(hidden_states)
-
-        return hidden_states
-
-
-class Decoder(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        latent_channels: int,
-        attention_head_dim: int = 32,
-        block_type: str or tuple = "ResBlock",
-        block_out_channels: tuple = (128, 256, 512, 512, 1024, 1024),
-        layers_per_block: tuple = (2, 2, 2, 2, 2, 2),
-        qkv_multiscales: tuple = ((), (), (), (5,), (5,), (5,)),
-        norm_type: str or tuple = "rms_norm",
-        act_fn: str or tuple = "silu",
-        upsample_block_type: str = "pixel_shuffle",
-        in_shortcut: bool = True,
-    ):
-        super().__init__()
-
-        num_blocks = len(block_out_channels)
-
-        if isinstance(block_type, str):
-            block_type = (block_type,) * num_blocks
-        if isinstance(norm_type, str):
-            norm_type = (norm_type,) * num_blocks
-        if isinstance(act_fn, str):
-            act_fn = (act_fn,) * num_blocks
-
-        self.conv_in = ops.Conv2d(latent_channels, block_out_channels[-1], 3, 1, 1)
-
-        self.in_shortcut = in_shortcut
-        if in_shortcut:
-            self.in_shortcut_repeats = block_out_channels[-1] // latent_channels
-
-        up_blocks = []
-        for i, (out_channel, num_layers) in reversed(list(enumerate(zip(block_out_channels, layers_per_block)))):
-            up_block_list = []
-
-            if i < num_blocks - 1 and num_layers > 0:
-                upsample_block = DCUpBlock2d(
-                    block_out_channels[i + 1],
-                    out_channel,
-                    interpolate=upsample_block_type == "interpolate",
-                    shortcut=True,
-                )
-                up_block_list.append(upsample_block)
-
-            for _ in range(num_layers):
-                block = get_block(
-                    block_type[i],
-                    out_channel,
-                    out_channel,
-                    attention_head_dim=attention_head_dim,
-                    norm_type=norm_type[i],
-                    act_fn=act_fn[i],
-                    qkv_mutliscales=qkv_multiscales[i],
-                )
-                up_block_list.append(block)
-
-            up_blocks.insert(0, nn.Sequential(*up_block_list))
-
-        self.up_blocks = nn.ModuleList(up_blocks)
-
-        channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1]
-
-        self.norm_out = RMSNorm(channels, 1e-5, elementwise_affine=True, bias=True)
-        self.conv_act = nn.ReLU()
-        self.conv_out = None
-
-        if layers_per_block[0] > 0:
-            self.conv_out = ops.Conv2d(channels, in_channels, 3, 1, 1)
-        else:
-            self.conv_out = DCUpBlock2d(
-                channels, in_channels, interpolate=upsample_block_type == "interpolate", shortcut=False
-            )
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if self.in_shortcut:
-            x = hidden_states.repeat_interleave(
-                self.in_shortcut_repeats, dim=1, output_size=hidden_states.shape[1] * self.in_shortcut_repeats
-            )
-            hidden_states = self.conv_in(hidden_states) + x
-        else:
-            hidden_states = self.conv_in(hidden_states)
-
-        for up_block in reversed(self.up_blocks):
-            hidden_states = up_block(hidden_states)
-
-        hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
-        hidden_states = self.conv_act(hidden_states)
-        hidden_states = self.conv_out(hidden_states)
-        return hidden_states
-
-
-class AutoencoderDC(nn.Module):
-    def __init__(
-        self,
-        in_channels: int = 2,
-        latent_channels: int = 8,
-        attention_head_dim: int = 32,
-        encoder_block_types: Union[str, Tuple[str]] = ["ResBlock", "ResBlock", "ResBlock", "EfficientViTBlock"],
-        decoder_block_types: Union[str, Tuple[str]] = ["ResBlock", "ResBlock", "ResBlock", "EfficientViTBlock"],
-        encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024),
-        decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024),
-        encoder_layers_per_block: Tuple[int] = (2, 2, 3, 3),
-        decoder_layers_per_block: Tuple[int] = (3, 3, 3, 3),
-        encoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (5,), (5,)),
-        decoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (5,), (5,)),
-        upsample_block_type: str = "interpolate",
-        downsample_block_type: str = "Conv",
-        decoder_norm_types: Union[str, Tuple[str]] = "rms_norm",
-        decoder_act_fns: Union[str, Tuple[str]] = "silu",
-        scaling_factor: float = 0.41407,
-    ) -> None:
-        super().__init__()
-
-        self.encoder = Encoder(
-            in_channels=in_channels,
-            latent_channels=latent_channels,
-            attention_head_dim=attention_head_dim,
-            block_type=encoder_block_types,
-            block_out_channels=encoder_block_out_channels,
-            layers_per_block=encoder_layers_per_block,
-            qkv_multiscales=encoder_qkv_multiscales,
-            downsample_block_type=downsample_block_type,
-        )
-
-        self.decoder = Decoder(
-            in_channels=in_channels,
-            latent_channels=latent_channels,
-            attention_head_dim=attention_head_dim,
-            block_type=decoder_block_types,
-            block_out_channels=decoder_block_out_channels,
-            layers_per_block=decoder_layers_per_block,
-            qkv_multiscales=decoder_qkv_multiscales,
-            norm_type=decoder_norm_types,
-            act_fn=decoder_act_fns,
-            upsample_block_type=upsample_block_type,
-        )
-
-        self.scaling_factor = scaling_factor
-        self.spatial_compression_ratio = 2 ** (len(encoder_block_out_channels) - 1)
-
-    def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """Internal encoding function."""
-        encoded = self.encoder(x)
-        return encoded * self.scaling_factor
-
-    def decode(self, z: torch.Tensor) -> torch.Tensor:
-        # Scale the latents back
-        z = z / self.scaling_factor
-        decoded = self.decoder(z)
-        return decoded
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        z = self.encode(x)
-        return self.decode(z)
-
--- a/comfy/ldm/ace/vae/music_dcae_pipeline.py
+++ b/comfy/ldm/ace/vae/music_dcae_pipeline.py
@@ -1,109 +0,0 @@
-# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_dcae_pipeline.py
-import torch
-from .autoencoder_dc import AutoencoderDC
-import logging
-try:
-    import torchaudio
-except:
-    logging.warning("torchaudio missing, ACE model will be broken")
-
-import torchvision.transforms as transforms
-from .music_vocoder import ADaMoSHiFiGANV1
-
-
-class MusicDCAE(torch.nn.Module):
-    def __init__(self, source_sample_rate=None, dcae_config={}, vocoder_config={}):
-        super(MusicDCAE, self).__init__()
-
-        self.dcae = AutoencoderDC(**dcae_config)
-        self.vocoder = ADaMoSHiFiGANV1(**vocoder_config)
-
-        if source_sample_rate is None:
-            self.source_sample_rate = 48000
-        else:
-            self.source_sample_rate = source_sample_rate
-
-        # self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100)
-
-        self.transform = transforms.Compose([
-            transforms.Normalize(0.5, 0.5),
-        ])
-        self.min_mel_value = -11.0
-        self.max_mel_value = 3.0
-        self.audio_chunk_size = int(round((1024 * 512 / 44100 * 48000)))
-        self.mel_chunk_size = 1024
-        self.time_dimention_multiple = 8
-        self.latent_chunk_size = self.mel_chunk_size // self.time_dimention_multiple
-        self.scale_factor = 0.1786
-        self.shift_factor = -1.9091
-
-    def load_audio(self, audio_path):
-        audio, sr = torchaudio.load(audio_path)
-        return audio, sr
-
-    def forward_mel(self, audios):
-        mels = []
-        for i in range(len(audios)):
-            image = self.vocoder.mel_transform(audios[i])
-            mels.append(image)
-        mels = torch.stack(mels)
-        return mels
-
-    @torch.no_grad()
-    def encode(self, audios, audio_lengths=None, sr=None):
-        if audio_lengths is None:
-            audio_lengths = torch.tensor([audios.shape[2]] * audios.shape[0])
-            audio_lengths = audio_lengths.to(audios.device)
-
-        if sr is None:
-            sr = self.source_sample_rate
-
-        if sr != 44100:
-            audios = torchaudio.functional.resample(audios, sr, 44100)
-
-        max_audio_len = audios.shape[-1]
-        if max_audio_len % (8 * 512) != 0:
-            audios = torch.nn.functional.pad(audios, (0, 8 * 512 - max_audio_len % (8 * 512)))
-
-        mels = self.forward_mel(audios)
-        mels = (mels - self.min_mel_value) / (self.max_mel_value - self.min_mel_value)
-        mels = self.transform(mels)
-        latents = []
-        for mel in mels:
-            latent = self.dcae.encoder(mel.unsqueeze(0))
-            latents.append(latent)
-        latents = torch.cat(latents, dim=0)
-        # latent_lengths = (audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple).long()
-        latents = (latents - self.shift_factor) * self.scale_factor
-        return latents
-        # return latents, latent_lengths
-
-    @torch.no_grad()
-    def decode(self, latents, audio_lengths=None, sr=None):
-        latents = latents / self.scale_factor + self.shift_factor
-
-        pred_wavs = []
-
-        for latent in latents:
-            mels = self.dcae.decoder(latent.unsqueeze(0))
-            mels = mels * 0.5 + 0.5
-            mels = mels * (self.max_mel_value - self.min_mel_value) + self.min_mel_value
-            wav = self.vocoder.decode(mels[0]).squeeze(1)
-
-            if sr is not None:
-                # resampler = torchaudio.transforms.Resample(44100, sr).to(latents.device).to(latents.dtype)
-                wav = torchaudio.functional.resample(wav, 44100, sr)
-                # wav = resampler(wav)
-            else:
-                sr = 44100
-            pred_wavs.append(wav)
-
-        if audio_lengths is not None:
-            pred_wavs = [wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)]
-        return torch.stack(pred_wavs)
-        # return sr, pred_wavs
-
-    def forward(self, audios, audio_lengths=None, sr=None):
-        latents, latent_lengths = self.encode(audios=audios, audio_lengths=audio_lengths, sr=sr)
-        sr, pred_wavs = self.decode(latents=latents, audio_lengths=audio_lengths, sr=sr)
-        return sr, pred_wavs, latents, latent_lengths
--- a/comfy/ldm/ace/vae/music_log_mel.py
+++ b/comfy/ldm/ace/vae/music_log_mel.py
@@ -1,113 +0,0 @@
-# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_log_mel.py
-import torch
-import torch.nn as nn
-from torch import Tensor
-import logging
-try:
-    from torchaudio.transforms import MelScale
-except:
-    logging.warning("torchaudio missing, ACE model will be broken")
-
-import comfy.model_management
-
-class LinearSpectrogram(nn.Module):
-    def __init__(
-        self,
-        n_fft=2048,
-        win_length=2048,
-        hop_length=512,
-        center=False,
-        mode="pow2_sqrt",
-    ):
-        super().__init__()
-
-        self.n_fft = n_fft
-        self.win_length = win_length
-        self.hop_length = hop_length
-        self.center = center
-        self.mode = mode
-
-        self.register_buffer("window", torch.hann_window(win_length))
-
-    def forward(self, y: Tensor) -> Tensor:
-        if y.ndim == 3:
-            y = y.squeeze(1)
-
-        y = torch.nn.functional.pad(
-            y.unsqueeze(1),
-            (
-                (self.win_length - self.hop_length) // 2,
-                (self.win_length - self.hop_length + 1) // 2,
-            ),
-            mode="reflect",
-        ).squeeze(1)
-        dtype = y.dtype
-        spec = torch.stft(
-            y.float(),
-            self.n_fft,
-            hop_length=self.hop_length,
-            win_length=self.win_length,
-            window=comfy.model_management.cast_to(self.window, dtype=torch.float32, device=y.device),
-            center=self.center,
-            pad_mode="reflect",
-            normalized=False,
-            onesided=True,
-            return_complex=True,
-        )
-        spec = torch.view_as_real(spec)
-
-        if self.mode == "pow2_sqrt":
-            spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
-        spec = spec.to(dtype)
-        return spec
-
-
-class LogMelSpectrogram(nn.Module):
-    def __init__(
-        self,
-        sample_rate=44100,
-        n_fft=2048,
-        win_length=2048,
-        hop_length=512,
-        n_mels=128,
-        center=False,
-        f_min=0.0,
-        f_max=None,
-    ):
-        super().__init__()
-
-        self.sample_rate = sample_rate
-        self.n_fft = n_fft
-        self.win_length = win_length
-        self.hop_length = hop_length
-        self.center = center
-        self.n_mels = n_mels
-        self.f_min = f_min
-        self.f_max = f_max or sample_rate // 2
-
-        self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center)
-        self.mel_scale = MelScale(
-            self.n_mels,
-            self.sample_rate,
-            self.f_min,
-            self.f_max,
-            self.n_fft // 2 + 1,
-            "slaney",
-            "slaney",
-        )
-
-    def compress(self, x: Tensor) -> Tensor:
-        return torch.log(torch.clamp(x, min=1e-5))
-
-    def decompress(self, x: Tensor) -> Tensor:
-        return torch.exp(x)
-
-    def forward(self, x: Tensor, return_linear: bool = False) -> Tensor:
-        linear = self.spectrogram(x)
-        x = self.mel_scale(linear)
-        x = self.compress(x)
-        # print(x.shape)
-        if return_linear:
-            return x, self.compress(linear)
-
-        return x
--- a/comfy/ldm/ace/vae/music_vocoder.py
+++ b/comfy/ldm/ace/vae/music_vocoder.py
@@ -1,538 +0,0 @@
-# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_vocoder.py
-import torch
-from torch import nn
-
-from functools import partial
-from math import prod
-from typing import Callable, Tuple, List
-
-import numpy as np
-import torch.nn.functional as F
-from torch.nn.utils.parametrize import remove_parametrizations as remove_weight_norm
-
-from .music_log_mel import LogMelSpectrogram
-
-import comfy.model_management
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-
-def drop_path(
-    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
-):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-
-    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
-    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
-    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
-    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
-    'survival rate' as the argument.
-
-    """  # noqa: E501
-
-    if drop_prob == 0.0 or not training:
-        return x
-    keep_prob = 1 - drop_prob
-    shape = (x.shape[0],) + (1,) * (
-        x.ndim - 1
-    )  # work with diff dim tensors, not just 2D ConvNets
-    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
-    if keep_prob > 0.0 and scale_by_keep:
-        random_tensor.div_(keep_prob)
-    return x * random_tensor
-
-
-class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""  # noqa: E501
-
-    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-        self.scale_by_keep = scale_by_keep
-
-    def forward(self, x):
-        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
-
-    def extra_repr(self):
-        return f"drop_prob={round(self.drop_prob,3):0.3f}"
-
-
-class LayerNorm(nn.Module):
-    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
-    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
-    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
-    with shape (batch_size, channels, height, width).
-    """  # noqa: E501
-
-    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(normalized_shape))
-        self.bias = nn.Parameter(torch.zeros(normalized_shape))
-        self.eps = eps
-        self.data_format = data_format
-        if self.data_format not in ["channels_last", "channels_first"]:
-            raise NotImplementedError
-        self.normalized_shape = (normalized_shape,)
-
-    def forward(self, x):
-        if self.data_format == "channels_last":
-            return F.layer_norm(
-                x, self.normalized_shape, comfy.model_management.cast_to(self.weight, dtype=x.dtype, device=x.device), comfy.model_management.cast_to(self.bias, dtype=x.dtype, device=x.device), self.eps
-            )
-        elif self.data_format == "channels_first":
-            u = x.mean(1, keepdim=True)
-            s = (x - u).pow(2).mean(1, keepdim=True)
-            x = (x - u) / torch.sqrt(s + self.eps)
-            x = comfy.model_management.cast_to(self.weight[:, None], dtype=x.dtype, device=x.device) * x + comfy.model_management.cast_to(self.bias[:, None], dtype=x.dtype, device=x.device)
-            return x
-
-
-class ConvNeXtBlock(nn.Module):
-    r"""ConvNeXt Block. There are two equivalent implementations:
-    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
-    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
-    We use (2) as we find it slightly faster in PyTorch
-
-    Args:
-        dim (int): Number of input channels.
-        drop_path (float): Stochastic depth rate. Default: 0.0
-        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
-        kernel_size (int): Kernel size for depthwise conv. Default: 7.
-        dilation (int): Dilation for depthwise conv. Default: 1.
-    """  # noqa: E501
-
-    def __init__(
-        self,
-        dim: int,
-        drop_path: float = 0.0,
-        layer_scale_init_value: float = 1e-6,
-        mlp_ratio: float = 4.0,
-        kernel_size: int = 7,
-        dilation: int = 1,
-    ):
-        super().__init__()
-
-        self.dwconv = ops.Conv1d(
-            dim,
-            dim,
-            kernel_size=kernel_size,
-            padding=int(dilation * (kernel_size - 1) / 2),
-            groups=dim,
-        )  # depthwise conv
-        self.norm = LayerNorm(dim, eps=1e-6)
-        self.pwconv1 = ops.Linear(
-            dim, int(mlp_ratio * dim)
-        )  # pointwise/1x1 convs, implemented with linear layers
-        self.act = nn.GELU()
-        self.pwconv2 = ops.Linear(int(mlp_ratio * dim), dim)
-        self.gamma = (
-            nn.Parameter(torch.empty((dim)), requires_grad=False)
-            if layer_scale_init_value > 0
-            else None
-        )
-        self.drop_path = DropPath(
-            drop_path) if drop_path > 0.0 else nn.Identity()
-
-    def forward(self, x, apply_residual: bool = True):
-        input = x
-
-        x = self.dwconv(x)
-        x = x.permute(0, 2, 1)  # (N, C, L) -> (N, L, C)
-        x = self.norm(x)
-        x = self.pwconv1(x)
-        x = self.act(x)
-        x = self.pwconv2(x)
-
-        if self.gamma is not None:
-            x = comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device) * x
-
-        x = x.permute(0, 2, 1)  # (N, L, C) -> (N, C, L)
-        x = self.drop_path(x)
-
-        if apply_residual:
-            x = input + x
-
-        return x
-
-
-class ParallelConvNeXtBlock(nn.Module):
-    def __init__(self, kernel_sizes: List[int], *args, **kwargs):
-        super().__init__()
-        self.blocks = nn.ModuleList(
-            [
-                ConvNeXtBlock(kernel_size=kernel_size, *args, **kwargs)
-                for kernel_size in kernel_sizes
-            ]
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return torch.stack(
-            [block(x, apply_residual=False) for block in self.blocks] + [x],
-            dim=1,
-        ).sum(dim=1)
-
-
-class ConvNeXtEncoder(nn.Module):
-    def __init__(
-        self,
-        input_channels=3,
-        depths=[3, 3, 9, 3],
-        dims=[96, 192, 384, 768],
-        drop_path_rate=0.0,
-        layer_scale_init_value=1e-6,
-        kernel_sizes: Tuple[int] = (7,),
-    ):
-        super().__init__()
-        assert len(depths) == len(dims)
-
-        self.channel_layers = nn.ModuleList()
-        stem = nn.Sequential(
-            ops.Conv1d(
-                input_channels,
-                dims[0],
-                kernel_size=7,
-                padding=3,
-                padding_mode="replicate",
-            ),
-            LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
-        )
-        self.channel_layers.append(stem)
-
-        for i in range(len(depths) - 1):
-            mid_layer = nn.Sequential(
-                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
-                ops.Conv1d(dims[i], dims[i + 1], kernel_size=1),
-            )
-            self.channel_layers.append(mid_layer)
-
-        block_fn = (
-            partial(ConvNeXtBlock, kernel_size=kernel_sizes[0])
-            if len(kernel_sizes) == 1
-            else partial(ParallelConvNeXtBlock, kernel_sizes=kernel_sizes)
-        )
-
-        self.stages = nn.ModuleList()
-        drop_path_rates = [
-            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
-        ]
-
-        cur = 0
-        for i in range(len(depths)):
-            stage = nn.Sequential(
-                *[
-                    block_fn(
-                        dim=dims[i],
-                        drop_path=drop_path_rates[cur + j],
-                        layer_scale_init_value=layer_scale_init_value,
-                    )
-                    for j in range(depths[i])
-                ]
-            )
-            self.stages.append(stage)
-            cur += depths[i]
-
-        self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first")
-
-    def forward(
-        self,
-        x: torch.Tensor,
-    ) -> torch.Tensor:
-        for channel_layer, stage in zip(self.channel_layers, self.stages):
-            x = channel_layer(x)
-            x = stage(x)
-
-        return self.norm(x)
-
-
-def get_padding(kernel_size, dilation=1):
-    return (kernel_size * dilation - dilation) // 2
-
-
-class ResBlock1(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super().__init__()
-
-        self.convs1 = nn.ModuleList(
-            [
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[2],
-                        padding=get_padding(kernel_size, dilation[2]),
-                    )
-                ),
-            ]
-        )
-
-        self.convs2 = nn.ModuleList(
-            [
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-            ]
-        )
-
-    def forward(self, x):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.silu(x)
-            xt = c1(xt)
-            xt = F.silu(xt)
-            xt = c2(xt)
-            x = xt + x
-        return x
-
-    def remove_weight_norm(self):
-        for conv in self.convs1:
-            remove_weight_norm(conv)
-        for conv in self.convs2:
-            remove_weight_norm(conv)
-
-
-class HiFiGANGenerator(nn.Module):
-    def __init__(
-        self,
-        *,
-        hop_length: int = 512,
-        upsample_rates: Tuple[int] = (8, 8, 2, 2, 2),
-        upsample_kernel_sizes: Tuple[int] = (16, 16, 8, 2, 2),
-        resblock_kernel_sizes: Tuple[int] = (3, 7, 11),
-        resblock_dilation_sizes: Tuple[Tuple[int]] = (
-            (1, 3, 5), (1, 3, 5), (1, 3, 5)),
-        num_mels: int = 128,
-        upsample_initial_channel: int = 512,
-        use_template: bool = True,
-        pre_conv_kernel_size: int = 7,
-        post_conv_kernel_size: int = 7,
-        post_activation: Callable = partial(nn.SiLU, inplace=True),
-    ):
-        super().__init__()
-
-        assert (
-            prod(upsample_rates) == hop_length
-        ), f"hop_length must be {prod(upsample_rates)}"
-
-        self.conv_pre = torch.nn.utils.parametrizations.weight_norm(
-            ops.Conv1d(
-                num_mels,
-                upsample_initial_channel,
-                pre_conv_kernel_size,
-                1,
-                padding=get_padding(pre_conv_kernel_size),
-            )
-        )
-
-        self.num_upsamples = len(upsample_rates)
-        self.num_kernels = len(resblock_kernel_sizes)
-
-        self.noise_convs = nn.ModuleList()
-        self.use_template = use_template
-        self.ups = nn.ModuleList()
-
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            c_cur = upsample_initial_channel // (2 ** (i + 1))
-            self.ups.append(
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.ConvTranspose1d(
-                        upsample_initial_channel // (2**i),
-                        upsample_initial_channel // (2 ** (i + 1)),
-                        k,
-                        u,
-                        padding=(k - u) // 2,
-                    )
-                )
-            )
-
-            if not use_template:
-                continue
-
-            if i + 1 < len(upsample_rates):
-                stride_f0 = np.prod(upsample_rates[i + 1:])
-                self.noise_convs.append(
-                    ops.Conv1d(
-                        1,
-                        c_cur,
-                        kernel_size=stride_f0 * 2,
-                        stride=stride_f0,
-                        padding=stride_f0 // 2,
-                    )
-                )
-            else:
-                self.noise_convs.append(ops.Conv1d(1, c_cur, kernel_size=1))
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = upsample_initial_channel // (2 ** (i + 1))
-            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
-                self.resblocks.append(ResBlock1(ch, k, d))
-
-        self.activation_post = post_activation()
-        self.conv_post = torch.nn.utils.parametrizations.weight_norm(
-            ops.Conv1d(
-                ch,
-                1,
-                post_conv_kernel_size,
-                1,
-                padding=get_padding(post_conv_kernel_size),
-            )
-        )
-
-    def forward(self, x, template=None):
-        x = self.conv_pre(x)
-
-        for i in range(self.num_upsamples):
-            x = F.silu(x, inplace=True)
-            x = self.ups[i](x)
-
-            if self.use_template:
-                x = x + self.noise_convs[i](template)
-
-            xs = None
-
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-
-            x = xs / self.num_kernels
-
-        x = self.activation_post(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-
-        return x
-
-    def remove_weight_norm(self):
-        for up in self.ups:
-            remove_weight_norm(up)
-        for block in self.resblocks:
-            block.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)
-
-
-class ADaMoSHiFiGANV1(nn.Module):
-    def __init__(
-        self,
-        input_channels: int = 128,
-        depths: List[int] = [3, 3, 9, 3],
-        dims: List[int] = [128, 256, 384, 512],
-        drop_path_rate: float = 0.0,
-        kernel_sizes: Tuple[int] = (7,),
-        upsample_rates: Tuple[int] = (4, 4, 2, 2, 2, 2, 2),
-        upsample_kernel_sizes: Tuple[int] = (8, 8, 4, 4, 4, 4, 4),
-        resblock_kernel_sizes: Tuple[int] = (3, 7, 11, 13),
-        resblock_dilation_sizes: Tuple[Tuple[int]] = (
-            (1, 3, 5), (1, 3, 5), (1, 3, 5), (1, 3, 5)),
-        num_mels: int = 512,
-        upsample_initial_channel: int = 1024,
-        use_template: bool = False,
-        pre_conv_kernel_size: int = 13,
-        post_conv_kernel_size: int = 13,
-        sampling_rate: int = 44100,
-        n_fft: int = 2048,
-        win_length: int = 2048,
-        hop_length: int = 512,
-        f_min: int = 40,
-        f_max: int = 16000,
-        n_mels: int = 128,
-    ):
-        super().__init__()
-
-        self.backbone = ConvNeXtEncoder(
-            input_channels=input_channels,
-            depths=depths,
-            dims=dims,
-            drop_path_rate=drop_path_rate,
-            kernel_sizes=kernel_sizes,
-        )
-
-        self.head = HiFiGANGenerator(
-            hop_length=hop_length,
-            upsample_rates=upsample_rates,
-            upsample_kernel_sizes=upsample_kernel_sizes,
-            resblock_kernel_sizes=resblock_kernel_sizes,
-            resblock_dilation_sizes=resblock_dilation_sizes,
-            num_mels=num_mels,
-            upsample_initial_channel=upsample_initial_channel,
-            use_template=use_template,
-            pre_conv_kernel_size=pre_conv_kernel_size,
-            post_conv_kernel_size=post_conv_kernel_size,
-        )
-        self.sampling_rate = sampling_rate
-        self.mel_transform = LogMelSpectrogram(
-            sample_rate=sampling_rate,
-            n_fft=n_fft,
-            win_length=win_length,
-            hop_length=hop_length,
-            f_min=f_min,
-            f_max=f_max,
-            n_mels=n_mels,
-        )
-        self.eval()
-
-    @torch.no_grad()
-    def decode(self, mel):
-        y = self.backbone(mel)
-        y = self.head(y)
-        return y
-
-    @torch.no_grad()
-    def encode(self, x):
-        return self.mel_transform(x)
-
-    def forward(self, mel):
-        y = self.backbone(mel)
-        y = self.head(y)
-        return y
--- a/comfy/ldm/audio/autoencoder.py
+++ b/comfy/ldm/audio/autoencoder.py
@@ -75,10 +75,16 @@ class SnakeBeta(nn.Module):
        return x

 def WNConv1d(*args, **kwargs):
-    return torch.nn.utils.parametrizations.weight_norm(ops.Conv1d(*args, **kwargs))
+    try:
+        return torch.nn.utils.parametrizations.weight_norm(ops.Conv1d(*args, **kwargs))
+    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
+        return torch.nn.utils.weight_norm(ops.Conv1d(*args, **kwargs))  # support pytorch 2.1 and older

 def WNConvTranspose1d(*args, **kwargs):
-    return torch.nn.utils.parametrizations.weight_norm(ops.ConvTranspose1d(*args, **kwargs))
+    try:
+        return torch.nn.utils.parametrizations.weight_norm(ops.ConvTranspose1d(*args, **kwargs))
+    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
+        return torch.nn.utils.weight_norm(ops.ConvTranspose1d(*args, **kwargs))  # support pytorch 2.1 and older

 def get_activation(activation: Literal["elu", "snake", "none"], antialias=False, channels=None) -> nn.Module:
    if activation == "elu":
--- a/comfy/ldm/aura/mmdit.py
+++ b/comfy/ldm/aura/mmdit.py
@@ -9,7 +9,6 @@ import torch.nn.functional as F

 from comfy.ldm.modules.attention import optimized_attention
 import comfy.ops
-import comfy.patcher_extension
 import comfy.ldm.common_dit

 def modulate(x, shift, scale):
@@ -437,13 +436,6 @@ class MMDiT(nn.Module):
        return x + pos_encoding.reshape(1, -1, self.positional_encoding.shape[-1])

    def forward(self, x, timestep, context, transformer_options={}, **kwargs):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, transformer_options, **kwargs)
-
-    def _forward(self, x, timestep, context, transformer_options={}, **kwargs):
        patches_replace = transformer_options.get("patches_replace", {})
        # patchify x, add PE
        b, c, h, w = x.shape
--- a/comfy/ldm/cascade/stage_a.py
+++ b/comfy/ldm/cascade/stage_a.py
@@ -19,10 +19,6 @@
 import torch
 from torch import nn
 from torch.autograd import Function
-import comfy.ops
-
-ops = comfy.ops.disable_weight_init
-

 class vector_quantize(Function):
    @staticmethod
@@ -125,15 +121,15 @@ class ResBlock(nn.Module):
        self.norm1 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.depthwise = nn.Sequential(
            nn.ReplicationPad2d(1),
-            ops.Conv2d(c, c, kernel_size=3, groups=c)
+            nn.Conv2d(c, c, kernel_size=3, groups=c)
        )

        # channelwise
        self.norm2 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.channelwise = nn.Sequential(
-            ops.Linear(c, c_hidden),
+            nn.Linear(c, c_hidden),
            nn.GELU(),
-            ops.Linear(c_hidden, c),
+            nn.Linear(c_hidden, c),
        )

        self.gammas = nn.Parameter(torch.zeros(6), requires_grad=True)
@@ -175,16 +171,16 @@ class StageA(nn.Module):
        # Encoder blocks
        self.in_block = nn.Sequential(
            nn.PixelUnshuffle(2),
-            ops.Conv2d(3 * 4, c_levels[0], kernel_size=1)
+            nn.Conv2d(3 * 4, c_levels[0], kernel_size=1)
        )
        down_blocks = []
        for i in range(levels):
            if i > 0:
-                down_blocks.append(ops.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
+                down_blocks.append(nn.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
            block = ResBlock(c_levels[i], c_levels[i] * 4)
            down_blocks.append(block)
        down_blocks.append(nn.Sequential(
-            ops.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
+            nn.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent),  # then normalize them to have mean 0 and std 1
        ))
        self.down_blocks = nn.Sequential(*down_blocks)
@@ -195,7 +191,7 @@ class StageA(nn.Module):

        # Decoder blocks
        up_blocks = [nn.Sequential(
-            ops.Conv2d(c_latent, c_levels[-1], kernel_size=1)
+            nn.Conv2d(c_latent, c_levels[-1], kernel_size=1)
        )]
        for i in range(levels):
            for j in range(bottleneck_blocks if i == 0 else 1):
@@ -203,11 +199,11 @@ class StageA(nn.Module):
                up_blocks.append(block)
            if i < levels - 1:
                up_blocks.append(
-                    ops.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
+                    nn.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
                                       padding=1))
        self.up_blocks = nn.Sequential(*up_blocks)
        self.out_block = nn.Sequential(
-            ops.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
+            nn.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
            nn.PixelShuffle(2),
        )

@@ -236,17 +232,17 @@ class Discriminator(nn.Module):
        super().__init__()
        d = max(depth - 3, 3)
        layers = [
-            nn.utils.spectral_norm(ops.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
+            nn.utils.spectral_norm(nn.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
            nn.LeakyReLU(0.2),
        ]
        for i in range(depth - 1):
            c_in = c_hidden // (2 ** max((d - i), 0))
            c_out = c_hidden // (2 ** max((d - 1 - i), 0))
-            layers.append(nn.utils.spectral_norm(ops.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
+            layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
            layers.append(nn.InstanceNorm2d(c_out))
            layers.append(nn.LeakyReLU(0.2))
        self.encoder = nn.Sequential(*layers)
-        self.shuffle = ops.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
+        self.shuffle = nn.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
        self.logits = nn.Sigmoid()

    def forward(self, x, cond=None):
--- a/comfy/ldm/cascade/stage_c_coder.py
+++ b/comfy/ldm/cascade/stage_c_coder.py
@@ -19,9 +19,6 @@ import torch
 import torchvision
 from torch import nn

-import comfy.ops
-
-ops = comfy.ops.disable_weight_init

 # EfficientNet
 class EfficientNetEncoder(nn.Module):
@@ -29,7 +26,7 @@ class EfficientNetEncoder(nn.Module):
        super().__init__()
        self.backbone = torchvision.models.efficientnet_v2_s().features.eval()
        self.mapper = nn.Sequential(
-            ops.Conv2d(1280, c_latent, kernel_size=1, bias=False),
+            nn.Conv2d(1280, c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent, affine=False),  # then normalize them to have mean 0 and std 1
        )
        self.mean = nn.Parameter(torch.tensor([0.485, 0.456, 0.406]))
@@ -37,7 +34,7 @@ class EfficientNetEncoder(nn.Module):

    def forward(self, x):
        x = x * 0.5 + 0.5
-        x = (x - self.mean.view([3,1,1]).to(device=x.device, dtype=x.dtype)) / self.std.view([3,1,1]).to(device=x.device, dtype=x.dtype)
+        x = (x - self.mean.view([3,1,1])) / self.std.view([3,1,1])
        o = self.mapper(self.backbone(x))
        return o

@@ -47,39 +44,39 @@ class Previewer(nn.Module):
    def __init__(self, c_in=16, c_hidden=512, c_out=3):
        super().__init__()
        self.blocks = nn.Sequential(
-            ops.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
+            nn.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
            nn.GELU(),
            nn.BatchNorm2d(c_hidden),

-            ops.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden),

-            ops.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
+            nn.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 2),

-            ops.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 2),

-            ops.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
+            nn.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
+            nn.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.Conv2d(c_hidden // 4, c_out, kernel_size=1),
+            nn.Conv2d(c_hidden // 4, c_out, kernel_size=1),
        )

    def forward(self, x):
--- a/comfy/ldm/chroma/layers.py
+++ b/comfy/ldm/chroma/layers.py
@@ -1,181 +0,0 @@
-import torch
-from torch import Tensor, nn
-
-from comfy.ldm.flux.math import attention
-from comfy.ldm.flux.layers import (
-    MLPEmbedder,
-    RMSNorm,
-    QKNorm,
-    SelfAttention,
-    ModulationOut,
-)
-
-
-
-class ChromaModulationOut(ModulationOut):
-    @classmethod
-    def from_offset(cls, tensor: torch.Tensor, offset: int = 0) -> ModulationOut:
-        return cls(
-            shift=tensor[:, offset : offset + 1, :],
-            scale=tensor[:, offset + 1 : offset + 2, :],
-            gate=tensor[:, offset + 2 : offset + 3, :],
-        )
-
-
-
-
-class Approximator(nn.Module):
-    def __init__(self, in_dim: int, out_dim: int, hidden_dim: int, n_layers = 5, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.in_proj = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
-        self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)])
-        self.norms = nn.ModuleList([RMSNorm(hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)])
-        self.out_proj = operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device)
-
-    @property
-    def device(self):
-        # Get the device of the module (assumes all parameters are on the same device)
-        return next(self.parameters()).device
-
-    def forward(self, x: Tensor) -> Tensor:
-        x = self.in_proj(x)
-
-        for layer, norms in zip(self.layers, self.norms):
-            x = x + layer(norms(x))
-
-        x = self.out_proj(x)
-
-        return x
-
-
-class DoubleStreamBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
-        super().__init__()
-
-        mlp_hidden_dim = int(hidden_size * mlp_ratio)
-        self.num_heads = num_heads
-        self.hidden_size = hidden_size
-        self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
-
-        self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_mlp = nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.GELU(approximate="tanh"),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )
-
-        self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
-
-        self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_mlp = nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.GELU(approximate="tanh"),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )
-        self.flipped_img_txt = flipped_img_txt
-
-    def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None):
-        (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
-
-        # prepare image for attention
-        img_modulated = torch.addcmul(img_mod1.shift, 1 + img_mod1.scale, self.img_norm1(img))
-        img_qkv = self.img_attn.qkv(img_modulated)
-        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
-
-        # prepare txt for attention
-        txt_modulated = torch.addcmul(txt_mod1.shift, 1 + txt_mod1.scale, self.txt_norm1(txt))
-        txt_qkv = self.txt_attn.qkv(txt_modulated)
-        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
-
-        # run actual attention
-        attn = attention(torch.cat((txt_q, img_q), dim=2),
-                         torch.cat((txt_k, img_k), dim=2),
-                         torch.cat((txt_v, img_v), dim=2),
-                         pe=pe, mask=attn_mask)
-
-        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
-
-        # calculate the img bloks
-        img.addcmul_(img_mod1.gate, self.img_attn.proj(img_attn))
-        img.addcmul_(img_mod2.gate, self.img_mlp(torch.addcmul(img_mod2.shift, 1 + img_mod2.scale, self.img_norm2(img))))
-
-        # calculate the txt bloks
-        txt.addcmul_(txt_mod1.gate, self.txt_attn.proj(txt_attn))
-        txt.addcmul_(txt_mod2.gate, self.txt_mlp(torch.addcmul(txt_mod2.shift, 1 + txt_mod2.scale, self.txt_norm2(txt))))
-
-        if txt.dtype == torch.float16:
-            txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
-
-        return img, txt
-
-
-class SingleStreamBlock(nn.Module):
-    """
-    A DiT block with parallel linear layers as described in
-    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
-    """
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        mlp_ratio: float = 4.0,
-        qk_scale: float = None,
-        dtype=None,
-        device=None,
-        operations=None
-    ):
-        super().__init__()
-        self.hidden_dim = hidden_size
-        self.num_heads = num_heads
-        head_dim = hidden_size // num_heads
-        self.scale = qk_scale or head_dim**-0.5
-
-        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
-        # qkv and mlp_in
-        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
-        # proj and mlp_out
-        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
-
-        self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
-
-        self.hidden_size = hidden_size
-        self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-
-        self.mlp_act = nn.GELU(approximate="tanh")
-
-    def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None) -> Tensor:
-        mod = vec
-        x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x))
-        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
-
-        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        q, k = self.norm(q, k, v)
-
-        # compute attention
-        attn = attention(q, k, v, pe=pe, mask=attn_mask)
-        # compute activation in mlp stream, cat again and run second linear layer
-        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
-        x.addcmul_(mod.gate, output)
-        if x.dtype == torch.float16:
-            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
-        return x
-
-
-class LastLayer(nn.Module):
-    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device)
-
-    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
-        shift, scale = vec
-        shift = shift.squeeze(1)
-        scale = scale.squeeze(1)
-        x = torch.addcmul(shift[:, None, :], 1 + scale[:, None, :], self.norm_final(x))
-        x = self.linear(x)
-        return x
--- a/comfy/ldm/chroma/model.py
+++ b/comfy/ldm/chroma/model.py
@@ -1,278 +0,0 @@
-#Original code can be found on: https://github.com/black-forest-labs/flux
-
-from dataclasses import dataclass
-
-import torch
-from torch import Tensor, nn
-from einops import rearrange, repeat
-import comfy.patcher_extension
-import comfy.ldm.common_dit
-
-from comfy.ldm.flux.layers import (
-    EmbedND,
-    timestep_embedding,
-)
-
-from .layers import (
-    DoubleStreamBlock,
-    LastLayer,
-    SingleStreamBlock,
-    Approximator,
-    ChromaModulationOut,
-)
-
-
-@dataclass
-class ChromaParams:
-    in_channels: int
-    out_channels: int
-    context_in_dim: int
-    hidden_size: int
-    mlp_ratio: float
-    num_heads: int
-    depth: int
-    depth_single_blocks: int
-    axes_dim: list
-    theta: int
-    patch_size: int
-    qkv_bias: bool
-    in_dim: int
-    out_dim: int
-    hidden_dim: int
-    n_layers: int
-
-
-
-
-class Chroma(nn.Module):
-    """
-    Transformer model for flow matching on sequences.
-    """
-
-    def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
-        super().__init__()
-        self.dtype = dtype
-        params = ChromaParams(**kwargs)
-        self.params = params
-        self.patch_size = params.patch_size
-        self.in_channels = params.in_channels
-        self.out_channels = params.out_channels
-        if params.hidden_size % params.num_heads != 0:
-            raise ValueError(
-                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
-            )
-        pe_dim = params.hidden_size // params.num_heads
-        if sum(params.axes_dim) != pe_dim:
-            raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
-        self.hidden_size = params.hidden_size
-        self.num_heads = params.num_heads
-        self.in_dim = params.in_dim
-        self.out_dim = params.out_dim
-        self.hidden_dim = params.hidden_dim
-        self.n_layers = params.n_layers
-        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
-        self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
-        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
-        # set as nn identity for now, will overwrite it later.
-        self.distilled_guidance_layer = Approximator(
-                    in_dim=self.in_dim,
-                    hidden_dim=self.hidden_dim,
-                    out_dim=self.out_dim,
-                    n_layers=self.n_layers,
-                    dtype=dtype, device=device, operations=operations
-                )
-
-
-        self.double_blocks = nn.ModuleList(
-            [
-                DoubleStreamBlock(
-                    self.hidden_size,
-                    self.num_heads,
-                    mlp_ratio=params.mlp_ratio,
-                    qkv_bias=params.qkv_bias,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for _ in range(params.depth)
-            ]
-        )
-
-        self.single_blocks = nn.ModuleList(
-            [
-                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
-                for _ in range(params.depth_single_blocks)
-            ]
-        )
-
-        if final_layer:
-            self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)
-
-        self.skip_mmdit = []
-        self.skip_dit = []
-        self.lite = False
-
-    def get_modulations(self, tensor: torch.Tensor, block_type: str, *, idx: int = 0):
-        # This function slices up the modulations tensor which has the following layout:
-        #   single     : num_single_blocks * 3 elements
-        #   double_img : num_double_blocks * 6 elements
-        #   double_txt : num_double_blocks * 6 elements
-        #   final      : 2 elements
-        if block_type == "final":
-            return (tensor[:, -2:-1, :], tensor[:, -1:, :])
-        single_block_count = self.params.depth_single_blocks
-        double_block_count = self.params.depth
-        offset = 3 * idx
-        if block_type == "single":
-            return ChromaModulationOut.from_offset(tensor, offset)
-        # Double block modulations are 6 elements so we double 3 * idx.
-        offset *= 2
-        if block_type in {"double_img", "double_txt"}:
-            # Advance past the single block modulations.
-            offset += 3 * single_block_count
-            if block_type == "double_txt":
-                # Advance past the double block img modulations.
-                offset += 6 * double_block_count
-            return (
-                ChromaModulationOut.from_offset(tensor, offset),
-                ChromaModulationOut.from_offset(tensor, offset + 3),
-            )
-        raise ValueError("Bad block_type")
-
-
-    def forward_orig(
-        self,
-        img: Tensor,
-        img_ids: Tensor,
-        txt: Tensor,
-        txt_ids: Tensor,
-        timesteps: Tensor,
-        guidance: Tensor = None,
-        control = None,
-        transformer_options={},
-        attn_mask: Tensor = None,
-    ) -> Tensor:
-        patches_replace = transformer_options.get("patches_replace", {})
-        if img.ndim != 3 or txt.ndim != 3:
-            raise ValueError("Input img and txt tensors must have 3 dimensions.")
-
-        # running on sequences img
-        img = self.img_in(img)
-
-        # distilled vector guidance
-        mod_index_length = 344
-        distill_timestep = timestep_embedding(timesteps.detach().clone(), 16).to(img.device, img.dtype)
-        # guidance = guidance *
-        distil_guidance = timestep_embedding(guidance.detach().clone(), 16).to(img.device, img.dtype)
-
-        # get all modulation index
-        modulation_index = timestep_embedding(torch.arange(mod_index_length, device=img.device), 32).to(img.device, img.dtype)
-        # we need to broadcast the modulation index here so each batch has all of the index
-        modulation_index = modulation_index.unsqueeze(0).repeat(img.shape[0], 1, 1).to(img.device, img.dtype)
-        # and we need to broadcast timestep and guidance along too
-        timestep_guidance = torch.cat([distill_timestep, distil_guidance], dim=1).unsqueeze(1).repeat(1, mod_index_length, 1).to(img.dtype).to(img.device, img.dtype)
-        # then and only then we could concatenate it together
-        input_vec = torch.cat([timestep_guidance, modulation_index], dim=-1).to(img.device, img.dtype)
-
-        mod_vectors = self.distilled_guidance_layer(input_vec)
-
-        txt = self.txt_in(txt)
-
-        ids = torch.cat((txt_ids, img_ids), dim=1)
-        pe = self.pe_embedder(ids)
-
-        blocks_replace = patches_replace.get("dit", {})
-        for i, block in enumerate(self.double_blocks):
-            if i not in self.skip_mmdit:
-                double_mod = (
-                    self.get_modulations(mod_vectors, "double_img", idx=i),
-                    self.get_modulations(mod_vectors, "double_txt", idx=i),
-                )
-                if ("double_block", i) in blocks_replace:
-                    def block_wrap(args):
-                        out = {}
-                        out["img"], out["txt"] = block(img=args["img"],
-                                                       txt=args["txt"],
-                                                       vec=args["vec"],
-                                                       pe=args["pe"],
-                                                       attn_mask=args.get("attn_mask"))
-                        return out
-
-                    out = blocks_replace[("double_block", i)]({"img": img,
-                                                               "txt": txt,
-                                                               "vec": double_mod,
-                                                               "pe": pe,
-                                                               "attn_mask": attn_mask},
-                                                              {"original_block": block_wrap})
-                    txt = out["txt"]
-                    img = out["img"]
-                else:
-                    img, txt = block(img=img,
-                                     txt=txt,
-                                     vec=double_mod,
-                                     pe=pe,
-                                     attn_mask=attn_mask)
-
-                if control is not None: # Controlnet
-                    control_i = control.get("input")
-                    if i < len(control_i):
-                        add = control_i[i]
-                        if add is not None:
-                            img += add
-
-        img = torch.cat((txt, img), 1)
-
-        for i, block in enumerate(self.single_blocks):
-            if i not in self.skip_dit:
-                single_mod = self.get_modulations(mod_vectors, "single", idx=i)
-                if ("single_block", i) in blocks_replace:
-                    def block_wrap(args):
-                        out = {}
-                        out["img"] = block(args["img"],
-                                           vec=args["vec"],
-                                           pe=args["pe"],
-                                           attn_mask=args.get("attn_mask"))
-                        return out
-
-                    out = blocks_replace[("single_block", i)]({"img": img,
-                                                               "vec": single_mod,
-                                                               "pe": pe,
-                                                               "attn_mask": attn_mask},
-                                                              {"original_block": block_wrap})
-                    img = out["img"]
-                else:
-                    img = block(img, vec=single_mod, pe=pe, attn_mask=attn_mask)
-
-                if control is not None: # Controlnet
-                    control_o = control.get("output")
-                    if i < len(control_o):
-                        add = control_o[i]
-                        if add is not None:
-                            img[:, txt.shape[1] :, ...] += add
-
-        img = img[:, txt.shape[1] :, ...]
-        final_mod = self.get_modulations(mod_vectors, "final")
-        img = self.final_layer(img, vec=final_mod)  # (N, T, patch_size ** 2 * out_channels)
-        return img
-
-    def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, guidance, control, transformer_options, **kwargs)
-
-    def _forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs):
-        bs, c, h, w = x.shape
-        x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
-
-        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=self.patch_size, pw=self.patch_size)
-
-        h_len = ((h + (self.patch_size // 2)) // self.patch_size)
-        w_len = ((w + (self.patch_size // 2)) // self.patch_size)
-        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
-        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
-        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
-        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
-
-        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
-        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:,:,:h,:w]
--- a/comfy/ldm/common_dit.py
+++ b/comfy/ldm/common_dit.py
@@ -1,6 +1,5 @@
 import torch
-import comfy.rmsnorm
-
+import comfy.ops

 def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
    if padding_mode == "circular" and (torch.jit.is_tracing() or torch.jit.is_scripting()):
@@ -12,5 +11,20 @@ def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):

    return torch.nn.functional.pad(img, pad, mode=padding_mode)

+try:
+    rms_norm_torch = torch.nn.functional.rms_norm
+except AttributeError:  # this torch build has no F.rms_norm; rms_norm() then uses its manual fallback
+    rms_norm_torch = None

-rms_norm = comfy.rmsnorm.rms_norm
+def rms_norm(x, weight=None, eps=1e-6):
+    """RMS-normalize ``x`` over its last dim; ``weight`` (if given) is cast to ``x``'s dtype/device and applied."""
+    scale = None if weight is None else comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device)
+    tracing = torch.jit.is_tracing() or torch.jit.is_scripting()
+    if rms_norm_torch is not None and not tracing:
+        normalized_shape = (x.shape[-1],) if weight is None else weight.shape
+        return rms_norm_torch(x, normalized_shape, weight=scale, eps=eps)
+    # Manual fallback (torch lacks rms_norm, or we are tracing/scripting): x * rsqrt(mean(x^2) + eps).
+    normed = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
+    if scale is None:
+        return normed
+    return normed * scale
--- a/comfy/ldm/cosmos/blocks.py
+++ b/comfy/ldm/cosmos/blocks.py
@@ -23,14 +23,25 @@ from einops import rearrange, repeat
 from einops.layers.torch import Rearrange
 from torch import nn

+from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm
 from comfy.ldm.modules.attention import optimized_attention


-def get_normalization(name: str, channels: int, weight_args={}, operations=None):
+def apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+    """Apply the rotary embedding ``freqs`` to ``t``; ``t`` is upcast via ``.float()`` and the result cast back to ``t``'s dtype."""
+    # Split the last dim into two halves and pair them up: (..., d) -> (..., d/2, 1, 2).
+    pairs = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).float()
+    first, second = pairs[..., 0], pairs[..., 1]
+    rotated = freqs[..., 0] * first + freqs[..., 1] * second
+    # Undo the pairing so the output has the same shape as the input.
+    return rotated.movedim(-1, -2).reshape(*t.shape).type_as(t)
+
+
+def get_normalization(name: str, channels: int, weight_args={}):
    if name == "I":
        return nn.Identity()
    elif name == "R":
-        return operations.RMSNorm(channels, elementwise_affine=True, eps=1e-6, **weight_args)
+        return RMSNorm(channels, elementwise_affine=True, eps=1e-6, **weight_args)
    else:
        raise ValueError(f"Normalization {name} not found")

@@ -109,15 +120,15 @@ class Attention(nn.Module):

        self.to_q = nn.Sequential(
            operations.Linear(query_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[0], norm_dim, weight_args=weight_args, operations=operations),
+            get_normalization(qkv_norm[0], norm_dim),
        )
        self.to_k = nn.Sequential(
            operations.Linear(context_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[1], norm_dim, weight_args=weight_args, operations=operations),
+            get_normalization(qkv_norm[1], norm_dim),
        )
        self.to_v = nn.Sequential(
            operations.Linear(context_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[2], norm_dim, weight_args=weight_args, operations=operations),
+            get_normalization(qkv_norm[2], norm_dim),
        )

        self.to_out = nn.Sequential(
@@ -157,19 +168,15 @@ class Attention(nn.Module):
        k = self.to_k[1](k)
        v = self.to_v[1](v)
        if self.is_selfattn and rope_emb is not None:  # only apply to self-attention!
-            # apply_rotary_pos_emb inlined
-            q_shape = q.shape
-            q = q.reshape(*q.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2)
-            q = rope_emb[..., 0] * q[..., 0] + rope_emb[..., 1] * q[..., 1]
-            q = q.movedim(-1, -2).reshape(*q_shape).to(x.dtype)
-
-            # apply_rotary_pos_emb inlined
-            k_shape = k.shape
-            k = k.reshape(*k.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2)
-            k = rope_emb[..., 0] * k[..., 0] + rope_emb[..., 1] * k[..., 1]
-            k = k.movedim(-1, -2).reshape(*k_shape).to(x.dtype)
+            q = apply_rotary_pos_emb(q, rope_emb)
+            k = apply_rotary_pos_emb(k, rope_emb)
        return q, k, v

+    def cal_attn(self, q, k, v, mask=None):
+        """Run attention over the already-prepared q/k/v and pass the regrouped result through ``to_out``."""
+        attended = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True)
+        return self.to_out(rearrange(attended, " b n s c -> s b (n c)"))
+
    def forward(
        self,
        x,
@@ -184,10 +191,7 @@ class Attention(nn.Module):
            context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None
        """
        q, k, v = self.cal_qkv(x, context, mask, rope_emb=rope_emb, **kwargs)
-        out = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True)
-        del q, k, v
-        out = rearrange(out, " b n s c -> s b (n c)")
-        return self.to_out(out)
+        return self.cal_attn(q, k, v, mask)


 class FeedForward(nn.Module):
@@ -784,7 +788,10 @@ class GeneralDITTransformerBlock(nn.Module):
        crossattn_mask: Optional[torch.Tensor] = None,
        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
        adaln_lora_B_3D: Optional[torch.Tensor] = None,
+        extra_per_block_pos_emb: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
+        if extra_per_block_pos_emb is not None:
+            x = x + extra_per_block_pos_emb
        for block in self.blocks:
            x = block(
                x,
--- a/comfy/ldm/cosmos/cosmos_tokenizer/layers3d.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/layers3d.py
@@ -30,8 +30,6 @@ import torch.nn as nn
 import torch.nn.functional as F
 import logging

-from comfy.ldm.modules.diffusionmodules.model import vae_attention
-
 from .patching import (
    Patcher,
    Patcher3D,
@@ -402,8 +400,6 @@ class CausalAttnBlock(nn.Module):
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )

-        self.optimized_attention = vae_attention()
-
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h_ = x
        h_ = self.norm(h_)
@@ -417,7 +413,18 @@ class CausalAttnBlock(nn.Module):
        v, batch_size = time2batch(v)

        b, c, h, w = q.shape
-        h_ = self.optimized_attention(q, k, v)
+        q = q.reshape(b, c, h * w)
+        q = q.permute(0, 2, 1)
+        k = k.reshape(b, c, h * w)
+        w_ = torch.bmm(q, k)
+        w_ = w_ * (int(c) ** (-0.5))
+        w_ = F.softmax(w_, dim=2)
+
+        # attend to values
+        v = v.reshape(b, c, h * w)
+        w_ = w_.permute(0, 2, 1)
+        h_ = torch.bmm(v, w_)
+        h_ = h_.reshape(b, c, h, w)

        h_ = batch2time(h_, batch_size)
        h_ = self.proj_out(h_)
@@ -864,16 +871,18 @@ class EncoderFactorized(nn.Module):
        x = self.patcher3d(x)

        # downsampling
-        h = self.conv_in(x)
+        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
-                h = self.down[i_level].block[i_block](h)
+                h = self.down[i_level].block[i_block](hs[-1])
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
            if i_level != self.num_resolutions - 1:
-                h = self.down[i_level].downsample(h)
+                hs.append(self.down[i_level].downsample(hs[-1]))

        # middle
+        h = hs[-1]
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)
--- a/comfy/ldm/cosmos/cosmos_tokenizer/patching.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/patching.py
@@ -281,76 +281,54 @@ class UnPatcher3D(UnPatcher):
        hh = hh.to(dtype=dtype)

        xlll, xllh, xlhl, xlhh, xhll, xhlh, xhhl, xhhh = torch.chunk(x, 8, dim=1)
-        del x

        # Height height transposed convolutions.
        xll = F.conv_transpose3d(
            xlll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xlll
-
        xll += F.conv_transpose3d(
            xllh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xllh

        xlh = F.conv_transpose3d(
            xlhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xlhl
-
        xlh += F.conv_transpose3d(
            xlhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xlhh

        xhl = F.conv_transpose3d(
            xhll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xhll
-
        xhl += F.conv_transpose3d(
            xhlh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xhlh

        xhh = F.conv_transpose3d(
            xhhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xhhl
-
        xhh += F.conv_transpose3d(
            xhhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xhhh

        # Handles width transposed convolutions.
        xl = F.conv_transpose3d(
            xll, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
        )
-        del xll
-
        xl += F.conv_transpose3d(
            xlh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
        )
-        del xlh
-
        xh = F.conv_transpose3d(
            xhl, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
        )
-        del xhl
-
        xh += F.conv_transpose3d(
            xhh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
        )
-        del xhh

        # Handles time axis transposed convolutions.
        x = F.conv_transpose3d(
            xl, hl.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1)
        )
-        del xl
-
        x += F.conv_transpose3d(
            xh, hh.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1)
        )
--- a/comfy/ldm/cosmos/cosmos_tokenizer/utils.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/utils.py
@@ -58,8 +58,7 @@ def is_odd(n: int) -> bool:


 def nonlinearity(x):
-    # x * sigmoid(x)
-    return torch.nn.functional.silu(x)
+    return x * torch.sigmoid(x)


 def Normalize(in_channels, num_groups=32):
--- a/comfy/ldm/cosmos/model.py
+++ b/comfy/ldm/cosmos/model.py
@@ -27,7 +27,7 @@ from torchvision import transforms
 from enum import Enum
 import logging

-import comfy.patcher_extension
+from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm

 from .blocks import (
    FinalLayer,
@@ -168,7 +168,7 @@ class GeneralDIT(nn.Module):
            operations=operations,
        )

-        self.build_pos_embed(device=device, dtype=dtype)
+        self.build_pos_embed(device=device)
        self.block_x_format = block_x_format
        self.use_adaln_lora = use_adaln_lora
        self.adaln_lora_dim = adaln_lora_dim
@@ -195,7 +195,7 @@ class GeneralDIT(nn.Module):

        if self.affline_emb_norm:
            logging.debug("Building affine embedding normalization layer")
-            self.affline_norm = operations.RMSNorm(model_channels, elementwise_affine=True, eps=1e-6, device=device, dtype=dtype)
+            self.affline_norm = RMSNorm(model_channels, elementwise_affine=True, eps=1e-6)
        else:
            self.affline_norm = nn.Identity()

@@ -210,7 +210,7 @@ class GeneralDIT(nn.Module):
            operations=operations,
        )

-    def build_pos_embed(self, device=None, dtype=None):
+    def build_pos_embed(self, device=None):
        if self.pos_emb_cls == "rope3d":
            cls_type = VideoRopePosition3DEmb
        else:
@@ -242,7 +242,6 @@ class GeneralDIT(nn.Module):
            kwargs["w_extrapolation_ratio"] = self.extra_w_extrapolation_ratio
            kwargs["t_extrapolation_ratio"] = self.extra_t_extrapolation_ratio
            kwargs["device"] = device
-            kwargs["dtype"] = dtype
            self.extra_pos_embedder = LearnablePosEmbAxis(
                **kwargs,
            )
@@ -293,7 +292,7 @@ class GeneralDIT(nn.Module):
        x_B_T_H_W_D = self.x_embedder(x_B_C_T_H_W)

        if self.extra_per_block_abs_pos_emb:
-            extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device, dtype=x_B_C_T_H_W.dtype)
+            extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device)
        else:
            extra_pos_emb = None

@@ -437,42 +436,6 @@ class GeneralDIT(nn.Module):
        latent_condition_sigma: Optional[torch.Tensor] = None,
        condition_video_augment_sigma: Optional[torch.Tensor] = None,
        **kwargs,
-    ):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, kwargs.get("transformer_options", {}))
-        ).execute(x,
-                timesteps,
-                context,
-                attention_mask,
-                fps,
-                image_size,
-                padding_mask,
-                scalar_feature,
-                data_type,
-                latent_condition,
-                latent_condition_sigma,
-                condition_video_augment_sigma,
-                **kwargs)
-
-    def _forward(
-        self,
-        x: torch.Tensor,
-        timesteps: torch.Tensor,
-        context: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        # crossattn_emb: torch.Tensor,
-        # crossattn_mask: Optional[torch.Tensor] = None,
-        fps: Optional[torch.Tensor] = None,
-        image_size: Optional[torch.Tensor] = None,
-        padding_mask: Optional[torch.Tensor] = None,
-        scalar_feature: Optional[torch.Tensor] = None,
-        data_type: Optional[DataType] = DataType.VIDEO,
-        latent_condition: Optional[torch.Tensor] = None,
-        latent_condition_sigma: Optional[torch.Tensor] = None,
-        condition_video_augment_sigma: Optional[torch.Tensor] = None,
-        **kwargs,
    ):
        """
        Args:
@@ -513,8 +476,6 @@ class GeneralDIT(nn.Module):
            inputs["original_shape"],
        )
        extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D = inputs["extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D"].to(x.dtype)
-        del inputs
-
        if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
            assert (
                x.shape == extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape
@@ -525,8 +486,6 @@ class GeneralDIT(nn.Module):
                self.blocks["block0"].x_format == block.x_format
            ), f"First block has x_format {self.blocks[0].x_format}, got {block.x_format}"

-            if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
-                x += extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D
            x = block(
                x,
                affline_emb_B_D,
@@ -534,6 +493,7 @@ class GeneralDIT(nn.Module):
                crossattn_mask,
                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
                adaln_lora_B_3D=adaln_lora_B_3D,
+                extra_per_block_pos_emb=extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D,
            )

        x_B_T_H_W_D = rearrange(x, "T H W B D -> B T H W D")
--- a/comfy/ldm/cosmos/position_embedding.py
+++ b/comfy/ldm/cosmos/position_embedding.py
@@ -41,12 +41,12 @@ def normalize(x: torch.Tensor, dim: Optional[List[int]] = None, eps: float = 0)


 class VideoPositionEmb(nn.Module):
-    def forward(self, x_B_T_H_W_C: torch.Tensor, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor:
+    def forward(self, x_B_T_H_W_C: torch.Tensor, fps=Optional[torch.Tensor], device=None) -> torch.Tensor:
        """
        It delegates the embedding generation to generate_embeddings function.
        """
        B_T_H_W_C = x_B_T_H_W_C.shape
-        embeddings = self.generate_embeddings(B_T_H_W_C, fps=fps, device=device, dtype=dtype)
+        embeddings = self.generate_embeddings(B_T_H_W_C, fps=fps, device=device)

        return embeddings

@@ -66,16 +66,15 @@ class VideoRopePosition3DEmb(VideoPositionEmb):
        h_extrapolation_ratio: float = 1.0,
        w_extrapolation_ratio: float = 1.0,
        t_extrapolation_ratio: float = 1.0,
-        enable_fps_modulation: bool = True,
        device=None,
        **kwargs,  # used for compatibility with other positional embeddings; unused in this class
    ):
        del kwargs
        super().__init__()
+        self.register_buffer("seq", torch.arange(max(len_h, len_w, len_t), dtype=torch.float, device=device))
        self.base_fps = base_fps
        self.max_h = len_h
        self.max_w = len_w
-        self.enable_fps_modulation = enable_fps_modulation

        dim = head_dim
        dim_h = dim // 6 * 2
@@ -105,7 +104,6 @@ class VideoRopePosition3DEmb(VideoPositionEmb):
        w_ntk_factor: Optional[float] = None,
        t_ntk_factor: Optional[float] = None,
        device=None,
-        dtype=None,
    ):
        """
        Generate embeddings for the given input size.
@@ -133,19 +131,21 @@ class VideoRopePosition3DEmb(VideoPositionEmb):
        temporal_freqs = 1.0 / (t_theta**self.dim_temporal_range.to(device=device))

        B, T, H, W, _ = B_T_H_W_C
-        seq = torch.arange(max(H, W, T), dtype=torch.float, device=device)
        uniform_fps = (fps is None) or isinstance(fps, (int, float)) or (fps.min() == fps.max())
        assert (
            uniform_fps or B == 1 or T == 1
        ), "For video batch, batch size should be 1 for non-uniform fps. For image batch, T should be 1"
-        half_emb_h = torch.outer(seq[:H].to(device=device), h_spatial_freqs)
-        half_emb_w = torch.outer(seq[:W].to(device=device), w_spatial_freqs)
+        assert (
+            H <= self.max_h and W <= self.max_w
+        ), f"Input dimensions (H={H}, W={W}) exceed the maximum dimensions (max_h={self.max_h}, max_w={self.max_w})"
+        half_emb_h = torch.outer(self.seq[:H].to(device=device), h_spatial_freqs)
+        half_emb_w = torch.outer(self.seq[:W].to(device=device), w_spatial_freqs)

        # apply sequence scaling in temporal dimension
-        if fps is None or self.enable_fps_modulation is False:  # image case
-            half_emb_t = torch.outer(seq[:T].to(device=device), temporal_freqs)
+        if fps is None:  # image case
+            half_emb_t = torch.outer(self.seq[:T].to(device=device), temporal_freqs)
        else:
-            half_emb_t = torch.outer(seq[:T].to(device=device) / fps * self.base_fps, temporal_freqs)
+            half_emb_t = torch.outer(self.seq[:T].to(device=device) / fps * self.base_fps, temporal_freqs)

        half_emb_h = torch.stack([torch.cos(half_emb_h), -torch.sin(half_emb_h), torch.sin(half_emb_h), torch.cos(half_emb_h)], dim=-1)
        half_emb_w = torch.stack([torch.cos(half_emb_w), -torch.sin(half_emb_w), torch.sin(half_emb_w), torch.cos(half_emb_w)], dim=-1)
@@ -173,7 +173,6 @@ class LearnablePosEmbAxis(VideoPositionEmb):
        len_w: int,
        len_t: int,
        device=None,
-        dtype=None,
        **kwargs,
    ):
        """
@@ -185,16 +184,17 @@ class LearnablePosEmbAxis(VideoPositionEmb):
        self.interpolation = interpolation
        assert self.interpolation in ["crop"], f"Unknown interpolation method {self.interpolation}"

-        self.pos_emb_h = nn.Parameter(torch.empty(len_h, model_channels, device=device, dtype=dtype))
-        self.pos_emb_w = nn.Parameter(torch.empty(len_w, model_channels, device=device, dtype=dtype))
-        self.pos_emb_t = nn.Parameter(torch.empty(len_t, model_channels, device=device, dtype=dtype))
+        self.pos_emb_h = nn.Parameter(torch.empty(len_h, model_channels, device=device))
+        self.pos_emb_w = nn.Parameter(torch.empty(len_w, model_channels, device=device))
+        self.pos_emb_t = nn.Parameter(torch.empty(len_t, model_channels, device=device))

-    def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor:
+
+    def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None) -> torch.Tensor:
        B, T, H, W, _ = B_T_H_W_C
        if self.interpolation == "crop":
-            emb_h_H = self.pos_emb_h[:H].to(device=device, dtype=dtype)
-            emb_w_W = self.pos_emb_w[:W].to(device=device, dtype=dtype)
-            emb_t_T = self.pos_emb_t[:T].to(device=device, dtype=dtype)
+            emb_h_H = self.pos_emb_h[:H].to(device=device)
+            emb_w_W = self.pos_emb_w[:W].to(device=device)
+            emb_t_T = self.pos_emb_t[:T].to(device=device)
            emb = (
                repeat(emb_t_T, "t d-> b t h w d", b=B, h=H, w=W)
                + repeat(emb_h_H, "h d-> b t h w d", b=B, t=T, w=W)
--- a/comfy/ldm/cosmos/predict2.py
+++ b/comfy/ldm/cosmos/predict2.py
@@ -1,879 +0,0 @@
-# original code from: https://github.com/nvidia-cosmos/cosmos-predict2
-
-import torch
-from torch import nn
-from einops import rearrange
-from einops.layers.torch import Rearrange
-import logging
-from typing import Callable, Optional, Tuple
-import math
-
-from .position_embedding import VideoRopePosition3DEmb, LearnablePosEmbAxis
-from torchvision import transforms
-
-import comfy.patcher_extension
-from comfy.ldm.modules.attention import optimized_attention
-
-def apply_rotary_pos_emb(
-    t: torch.Tensor,
-    freqs: torch.Tensor,
-) -> torch.Tensor:
-    t_ = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).float()
-    t_out = freqs[..., 0] * t_[..., 0] + freqs[..., 1] * t_[..., 1]
-    t_out = t_out.movedim(-1, -2).reshape(*t.shape).type_as(t)
-    return t_out
-
-
-# ---------------------- Feed Forward Network -----------------------
-class GPT2FeedForward(nn.Module):
-    def __init__(self, d_model: int, d_ff: int, device=None, dtype=None, operations=None) -> None:
-        super().__init__()
-        self.activation = nn.GELU()
-        self.layer1 = operations.Linear(d_model, d_ff, bias=False, device=device, dtype=dtype)
-        self.layer2 = operations.Linear(d_ff, d_model, bias=False, device=device, dtype=dtype)
-
-        self._layer_id = None
-        self._dim = d_model
-        self._hidden_dim = d_ff
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.layer1(x)
-
-        x = self.activation(x)
-        x = self.layer2(x)
-        return x
-
-
-def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H_D: torch.Tensor) -> torch.Tensor:
-    """Computes multi-head attention using PyTorch's native implementation.
-
-    This function provides a PyTorch backend alternative to Transformer Engine's attention operation.
-    It rearranges the input tensors to match PyTorch's expected format, computes scaled dot-product
-    attention, and rearranges the output back to the original format.
-
-    The input tensor names use the following dimension conventions:
-
-    - B: batch size
-    - S: sequence length
-    - H: number of attention heads
-    - D: head dimension
-
-    Args:
-        q_B_S_H_D: Query tensor with shape (batch, seq_len, n_heads, head_dim)
-        k_B_S_H_D: Key tensor with shape (batch, seq_len, n_heads, head_dim)
-        v_B_S_H_D: Value tensor with shape (batch, seq_len, n_heads, head_dim)
-
-    Returns:
-        Attention output tensor with shape (batch, seq_len, n_heads * head_dim)
-    """
-    in_q_shape = q_B_S_H_D.shape
-    in_k_shape = k_B_S_H_D.shape
-    q_B_H_S_D = rearrange(q_B_S_H_D, "b ... h k -> b h ... k").view(in_q_shape[0], in_q_shape[-2], -1, in_q_shape[-1])
-    k_B_H_S_D = rearrange(k_B_S_H_D, "b ... h v -> b h ... v").view(in_k_shape[0], in_k_shape[-2], -1, in_k_shape[-1])
-    v_B_H_S_D = rearrange(v_B_S_H_D, "b ... h v -> b h ... v").view(in_k_shape[0], in_k_shape[-2], -1, in_k_shape[-1])
-    return optimized_attention(q_B_H_S_D, k_B_H_S_D, v_B_H_S_D, in_q_shape[-2], skip_reshape=True)
-
-
-class Attention(nn.Module):
-    """
-    A flexible attention module supporting both self-attention and cross-attention mechanisms.
-
-    This module implements a multi-head attention layer that can operate in either self-attention
-    or cross-attention mode. The mode is determined by whether a context dimension is provided.
-    The implementation uses scaled dot-product attention and supports optional bias terms and
-    dropout regularization.
-
-    Args:
-        query_dim (int): The dimensionality of the query vectors.
-        context_dim (int, optional): The dimensionality of the context (key/value) vectors.
-            If None, the module operates in self-attention mode using query_dim. Default: None
-        n_heads (int, optional): Number of attention heads for multi-head attention. Default: 8
-        head_dim (int, optional): The dimension of each attention head. Default: 64
-        dropout (float, optional): Dropout probability applied to the output. Default: 0.0
-        qkv_format (str, optional): Format specification for QKV tensors. Default: "bshd"
-        backend (str, optional): Backend to use for the attention operation. Default: "transformer_engine"
-
-    Examples:
-        >>> # Self-attention with 512 dimensions and 8 heads
-        >>> self_attn = Attention(query_dim=512)
-        >>> x = torch.randn(32, 16, 512)  # (batch_size, seq_len, dim)
-        >>> out = self_attn(x)  # (32, 16, 512)
-
-        >>> # Cross-attention
-        >>> cross_attn = Attention(query_dim=512, context_dim=256)
-        >>> query = torch.randn(32, 16, 512)
-        >>> context = torch.randn(32, 8, 256)
-        >>> out = cross_attn(query, context)  # (32, 16, 512)
-    """
-
-    def __init__(
-        self,
-        query_dim: int,
-        context_dim: Optional[int] = None,
-        n_heads: int = 8,
-        head_dim: int = 64,
-        dropout: float = 0.0,
-        device=None,
-        dtype=None,
-        operations=None,
-    ) -> None:
-        super().__init__()
-        logging.debug(
-            f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using "
-            f"{n_heads} heads with a dimension of {head_dim}."
-        )
-        self.is_selfattn = context_dim is None  # self attention
-
-        context_dim = query_dim if context_dim is None else context_dim
-        inner_dim = head_dim * n_heads
-
-        self.n_heads = n_heads
-        self.head_dim = head_dim
-        self.query_dim = query_dim
-        self.context_dim = context_dim
-
-        self.q_proj = operations.Linear(query_dim, inner_dim, bias=False, device=device, dtype=dtype)
-        self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype)
-
-        self.k_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype)
-        self.k_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype)
-
-        self.v_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype)
-        self.v_norm = nn.Identity()
-
-        self.output_proj = operations.Linear(inner_dim, query_dim, bias=False, device=device, dtype=dtype)
-        self.output_dropout = nn.Dropout(dropout) if dropout > 1e-4 else nn.Identity()
-
-        self.attn_op = torch_attention_op
-
-        self._query_dim = query_dim
-        self._context_dim = context_dim
-        self._inner_dim = inner_dim
-
-    def compute_qkv(
-        self,
-        x: torch.Tensor,
-        context: Optional[torch.Tensor] = None,
-        rope_emb: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        q = self.q_proj(x)
-        context = x if context is None else context
-        k = self.k_proj(context)
-        v = self.v_proj(context)
-        q, k, v = map(
-            lambda t: rearrange(t, "b ... (h d) -> b ... h d", h=self.n_heads, d=self.head_dim),
-            (q, k, v),
-        )
-
-        def apply_norm_and_rotary_pos_emb(
-            q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, rope_emb: Optional[torch.Tensor]
-        ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-            q = self.q_norm(q)
-            k = self.k_norm(k)
-            v = self.v_norm(v)
-            if self.is_selfattn and rope_emb is not None:  # only apply to self-attention!
-                q = apply_rotary_pos_emb(q, rope_emb)
-                k = apply_rotary_pos_emb(k, rope_emb)
-            return q, k, v
-
-        q, k, v = apply_norm_and_rotary_pos_emb(q, k, v, rope_emb)
-
-        return q, k, v
-
-    def compute_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
-        result = self.attn_op(q, k, v)  # [B, S, H, D]
-        return self.output_dropout(self.output_proj(result))
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        context: Optional[torch.Tensor] = None,
-        rope_emb: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """
-        Args:
-            x (Tensor): The query tensor of shape [B, Mq, K]
-            context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None
-        """
-        q, k, v = self.compute_qkv(x, context, rope_emb=rope_emb)
-        return self.compute_attention(q, k, v)
-
-
-class Timesteps(nn.Module):
-    def __init__(self, num_channels: int):
-        super().__init__()
-        self.num_channels = num_channels
-
-    def forward(self, timesteps_B_T: torch.Tensor) -> torch.Tensor:
-        assert timesteps_B_T.ndim == 2, f"Expected 2D input, got {timesteps_B_T.ndim}"
-        timesteps = timesteps_B_T.flatten().float()
-        half_dim = self.num_channels // 2
-        exponent = -math.log(10000) * torch.arange(half_dim, dtype=torch.float32, device=timesteps.device)
-        exponent = exponent / (half_dim - 0.0)
-
-        emb = torch.exp(exponent)
-        emb = timesteps[:, None].float() * emb[None, :]
-
-        sin_emb = torch.sin(emb)
-        cos_emb = torch.cos(emb)
-        emb = torch.cat([cos_emb, sin_emb], dim=-1)
-
-        return rearrange(emb, "(b t) d -> b t d", b=timesteps_B_T.shape[0], t=timesteps_B_T.shape[1])
-
-
-class TimestepEmbedding(nn.Module):
-    def __init__(self, in_features: int, out_features: int, use_adaln_lora: bool = False, device=None, dtype=None, operations=None):
-        super().__init__()
-        logging.debug(
-            f"Using AdaLN LoRA Flag:  {use_adaln_lora}. We enable bias if no AdaLN LoRA for backward compatibility."
-        )
-        self.in_dim = in_features
-        self.out_dim = out_features
-        self.linear_1 = operations.Linear(in_features, out_features, bias=not use_adaln_lora, device=device, dtype=dtype)
-        self.activation = nn.SiLU()
-        self.use_adaln_lora = use_adaln_lora
-        if use_adaln_lora:
-            self.linear_2 = operations.Linear(out_features, 3 * out_features, bias=False, device=device, dtype=dtype)
-        else:
-            self.linear_2 = operations.Linear(out_features, out_features, bias=False, device=device, dtype=dtype)
-
-    def forward(self, sample: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        emb = self.linear_1(sample)
-        emb = self.activation(emb)
-        emb = self.linear_2(emb)
-
-        if self.use_adaln_lora:
-            adaln_lora_B_T_3D = emb
-            emb_B_T_D = sample
-        else:
-            adaln_lora_B_T_3D = None
-            emb_B_T_D = emb
-
-        return emb_B_T_D, adaln_lora_B_T_3D
-
-
-class PatchEmbed(nn.Module):
-    """
-    PatchEmbed is a module for embedding patches from an input tensor by applying either 3D or 2D convolutional layers,
-    depending on the . This module can process inputs with temporal (video) and spatial (image) dimensions,
-    making it suitable for video and image processing tasks. It supports dividing the input into patches
-    and embedding each patch into a vector of size `out_channels`.
-
-    Parameters:
-    - spatial_patch_size (int): The size of each spatial patch.
-    - temporal_patch_size (int): The size of each temporal patch.
-    - in_channels (int): Number of input channels. Default: 3.
-    - out_channels (int): The dimension of the embedding vector for each patch. Default: 768.
-    - bias (bool): If True, adds a learnable bias to the output of the convolutional layers. Default: True.
-    """
-
-    def __init__(
-        self,
-        spatial_patch_size: int,
-        temporal_patch_size: int,
-        in_channels: int = 3,
-        out_channels: int = 768,
-        device=None, dtype=None, operations=None
-    ):
-        super().__init__()
-        self.spatial_patch_size = spatial_patch_size
-        self.temporal_patch_size = temporal_patch_size
-
-        self.proj = nn.Sequential(
-            Rearrange(
-                "b c (t r) (h m) (w n) -> b t h w (c r m n)",
-                r=temporal_patch_size,
-                m=spatial_patch_size,
-                n=spatial_patch_size,
-            ),
-            operations.Linear(
-                in_channels * spatial_patch_size * spatial_patch_size * temporal_patch_size, out_channels, bias=False, device=device, dtype=dtype
-            ),
-        )
-        self.dim = in_channels * spatial_patch_size * spatial_patch_size * temporal_patch_size
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass of the PatchEmbed module.
-
-        Parameters:
-        - x (torch.Tensor): The input tensor of shape (B, C, T, H, W) where
-            B is the batch size,
-            C is the number of channels,
-            T is the temporal dimension,
-            H is the height, and
-            W is the width of the input.
-
-        Returns:
-        - torch.Tensor: The embedded patches as a tensor, with shape b t h w c.
-        """
-        assert x.dim() == 5
-        _, _, T, H, W = x.shape
-        assert (
-            H % self.spatial_patch_size == 0 and W % self.spatial_patch_size == 0
-        ), f"H,W {(H, W)} should be divisible by spatial_patch_size {self.spatial_patch_size}"
-        assert T % self.temporal_patch_size == 0
-        x = self.proj(x)
-        return x
-
-
-class FinalLayer(nn.Module):
-    """
-    The final layer of video DiT.
-    """
-
-    def __init__(
-        self,
-        hidden_size: int,
-        spatial_patch_size: int,
-        temporal_patch_size: int,
-        out_channels: int,
-        use_adaln_lora: bool = False,
-        adaln_lora_dim: int = 256,
-        device=None, dtype=None, operations=None
-    ):
-        super().__init__()
-        self.layer_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.linear = operations.Linear(
-            hidden_size, spatial_patch_size * spatial_patch_size * temporal_patch_size * out_channels, bias=False, device=device, dtype=dtype
-        )
-        self.hidden_size = hidden_size
-        self.n_adaln_chunks = 2
-        self.use_adaln_lora = use_adaln_lora
-        self.adaln_lora_dim = adaln_lora_dim
-        if use_adaln_lora:
-            self.adaln_modulation = nn.Sequential(
-                nn.SiLU(),
-                operations.Linear(hidden_size, adaln_lora_dim, bias=False, device=device, dtype=dtype),
-                operations.Linear(adaln_lora_dim, self.n_adaln_chunks * hidden_size, bias=False, device=device, dtype=dtype),
-            )
-        else:
-            self.adaln_modulation = nn.Sequential(
-                nn.SiLU(), operations.Linear(hidden_size, self.n_adaln_chunks * hidden_size, bias=False, device=device, dtype=dtype)
-            )
-
-    def forward(
-        self,
-        x_B_T_H_W_D: torch.Tensor,
-        emb_B_T_D: torch.Tensor,
-        adaln_lora_B_T_3D: Optional[torch.Tensor] = None,
-    ):
-        if self.use_adaln_lora:
-            assert adaln_lora_B_T_3D is not None
-            shift_B_T_D, scale_B_T_D = (
-                self.adaln_modulation(emb_B_T_D) + adaln_lora_B_T_3D[:, :, : 2 * self.hidden_size]
-            ).chunk(2, dim=-1)
-        else:
-            shift_B_T_D, scale_B_T_D = self.adaln_modulation(emb_B_T_D).chunk(2, dim=-1)
-
-        shift_B_T_1_1_D, scale_B_T_1_1_D = rearrange(shift_B_T_D, "b t d -> b t 1 1 d"), rearrange(
-            scale_B_T_D, "b t d -> b t 1 1 d"
-        )
-
-        def _fn(
-            _x_B_T_H_W_D: torch.Tensor,
-            _norm_layer: nn.Module,
-            _scale_B_T_1_1_D: torch.Tensor,
-            _shift_B_T_1_1_D: torch.Tensor,
-        ) -> torch.Tensor:
-            return _norm_layer(_x_B_T_H_W_D) * (1 + _scale_B_T_1_1_D) + _shift_B_T_1_1_D
-
-        x_B_T_H_W_D = _fn(x_B_T_H_W_D, self.layer_norm, scale_B_T_1_1_D, shift_B_T_1_1_D)
-        x_B_T_H_W_O = self.linear(x_B_T_H_W_D)
-        return x_B_T_H_W_O
-
-
-class Block(nn.Module):
-    """
-    A transformer block that combines self-attention, cross-attention and MLP layers with AdaLN modulation.
-    Each component (self-attention, cross-attention, MLP) has its own layer normalization and AdaLN modulation.
-
-    Parameters:
-        x_dim (int): Dimension of input features
-        context_dim (int): Dimension of context features for cross-attention
-        num_heads (int): Number of attention heads
-        mlp_ratio (float): Multiplier for MLP hidden dimension. Default: 4.0
-        use_adaln_lora (bool): Whether to use AdaLN-LoRA modulation. Default: False
-        adaln_lora_dim (int): Hidden dimension for AdaLN-LoRA layers. Default: 256
-
-    The block applies the following sequence:
-    1. Self-attention with AdaLN modulation
-    2. Cross-attention with AdaLN modulation
-    3. MLP with AdaLN modulation
-
-    Each component uses skip connections and layer normalization.
-    """
-
-    def __init__(
-        self,
-        x_dim: int,
-        context_dim: int,
-        num_heads: int,
-        mlp_ratio: float = 4.0,
-        use_adaln_lora: bool = False,
-        adaln_lora_dim: int = 256,
-        device=None,
-        dtype=None,
-        operations=None,
-    ):
-        super().__init__()
-        self.x_dim = x_dim
-        self.layer_norm_self_attn = operations.LayerNorm(x_dim, elementwise_affine=False, eps=1e-6, device=device, dtype=dtype)
-        self.self_attn = Attention(x_dim, None, num_heads, x_dim // num_heads, device=device, dtype=dtype, operations=operations)
-
-        self.layer_norm_cross_attn = operations.LayerNorm(x_dim, elementwise_affine=False, eps=1e-6, device=device, dtype=dtype)
-        self.cross_attn = Attention(
-            x_dim, context_dim, num_heads, x_dim // num_heads, device=device, dtype=dtype, operations=operations
-        )
-
-        self.layer_norm_mlp = operations.LayerNorm(x_dim, elementwise_affine=False, eps=1e-6, device=device, dtype=dtype)
-        self.mlp = GPT2FeedForward(x_dim, int(x_dim * mlp_ratio), device=device, dtype=dtype, operations=operations)
-
-        self.use_adaln_lora = use_adaln_lora
-        if self.use_adaln_lora:
-            self.adaln_modulation_self_attn = nn.Sequential(
-                nn.SiLU(),
-                operations.Linear(x_dim, adaln_lora_dim, bias=False, device=device, dtype=dtype),
-                operations.Linear(adaln_lora_dim, 3 * x_dim, bias=False, device=device, dtype=dtype),
-            )
-            self.adaln_modulation_cross_attn = nn.Sequential(
-                nn.SiLU(),
-                operations.Linear(x_dim, adaln_lora_dim, bias=False, device=device, dtype=dtype),
-                operations.Linear(adaln_lora_dim, 3 * x_dim, bias=False, device=device, dtype=dtype),
-            )
-            self.adaln_modulation_mlp = nn.Sequential(
-                nn.SiLU(),
-                operations.Linear(x_dim, adaln_lora_dim, bias=False, device=device, dtype=dtype),
-                operations.Linear(adaln_lora_dim, 3 * x_dim, bias=False, device=device, dtype=dtype),
-            )
-        else:
-            self.adaln_modulation_self_attn = nn.Sequential(nn.SiLU(), operations.Linear(x_dim, 3 * x_dim, bias=False, device=device, dtype=dtype))
-            self.adaln_modulation_cross_attn = nn.Sequential(nn.SiLU(), operations.Linear(x_dim, 3 * x_dim, bias=False, device=device, dtype=dtype))
-            self.adaln_modulation_mlp = nn.Sequential(nn.SiLU(), operations.Linear(x_dim, 3 * x_dim, bias=False, device=device, dtype=dtype))
-
-    def forward(
-        self,
-        x_B_T_H_W_D: torch.Tensor,
-        emb_B_T_D: torch.Tensor,
-        crossattn_emb: torch.Tensor,
-        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
-        adaln_lora_B_T_3D: Optional[torch.Tensor] = None,
-        extra_per_block_pos_emb: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if extra_per_block_pos_emb is not None:
-            x_B_T_H_W_D = x_B_T_H_W_D + extra_per_block_pos_emb
-
-        if self.use_adaln_lora:
-            shift_self_attn_B_T_D, scale_self_attn_B_T_D, gate_self_attn_B_T_D = (
-                self.adaln_modulation_self_attn(emb_B_T_D) + adaln_lora_B_T_3D
-            ).chunk(3, dim=-1)
-            shift_cross_attn_B_T_D, scale_cross_attn_B_T_D, gate_cross_attn_B_T_D = (
-                self.adaln_modulation_cross_attn(emb_B_T_D) + adaln_lora_B_T_3D
-            ).chunk(3, dim=-1)
-            shift_mlp_B_T_D, scale_mlp_B_T_D, gate_mlp_B_T_D = (
-                self.adaln_modulation_mlp(emb_B_T_D) + adaln_lora_B_T_3D
-            ).chunk(3, dim=-1)
-        else:
-            shift_self_attn_B_T_D, scale_self_attn_B_T_D, gate_self_attn_B_T_D = self.adaln_modulation_self_attn(
-                emb_B_T_D
-            ).chunk(3, dim=-1)
-            shift_cross_attn_B_T_D, scale_cross_attn_B_T_D, gate_cross_attn_B_T_D = self.adaln_modulation_cross_attn(
-                emb_B_T_D
-            ).chunk(3, dim=-1)
-            shift_mlp_B_T_D, scale_mlp_B_T_D, gate_mlp_B_T_D = self.adaln_modulation_mlp(emb_B_T_D).chunk(3, dim=-1)
-
-        # Reshape tensors from (B, T, D) to (B, T, 1, 1, D) for broadcasting
-        shift_self_attn_B_T_1_1_D = rearrange(shift_self_attn_B_T_D, "b t d -> b t 1 1 d")
-        scale_self_attn_B_T_1_1_D = rearrange(scale_self_attn_B_T_D, "b t d -> b t 1 1 d")
-        gate_self_attn_B_T_1_1_D = rearrange(gate_self_attn_B_T_D, "b t d -> b t 1 1 d")
-
-        shift_cross_attn_B_T_1_1_D = rearrange(shift_cross_attn_B_T_D, "b t d -> b t 1 1 d")
-        scale_cross_attn_B_T_1_1_D = rearrange(scale_cross_attn_B_T_D, "b t d -> b t 1 1 d")
-        gate_cross_attn_B_T_1_1_D = rearrange(gate_cross_attn_B_T_D, "b t d -> b t 1 1 d")
-
-        shift_mlp_B_T_1_1_D = rearrange(shift_mlp_B_T_D, "b t d -> b t 1 1 d")
-        scale_mlp_B_T_1_1_D = rearrange(scale_mlp_B_T_D, "b t d -> b t 1 1 d")
-        gate_mlp_B_T_1_1_D = rearrange(gate_mlp_B_T_D, "b t d -> b t 1 1 d")
-
-        B, T, H, W, D = x_B_T_H_W_D.shape
-
-        def _fn(_x_B_T_H_W_D, _norm_layer, _scale_B_T_1_1_D, _shift_B_T_1_1_D):
-            return _norm_layer(_x_B_T_H_W_D) * (1 + _scale_B_T_1_1_D) + _shift_B_T_1_1_D
-
-        normalized_x_B_T_H_W_D = _fn(
-            x_B_T_H_W_D,
-            self.layer_norm_self_attn,
-            scale_self_attn_B_T_1_1_D,
-            shift_self_attn_B_T_1_1_D,
-        )
-        result_B_T_H_W_D = rearrange(
-            self.self_attn(
-                # normalized_x_B_T_HW_D,
-                rearrange(normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
-                None,
-                rope_emb=rope_emb_L_1_1_D,
-            ),
-            "b (t h w) d -> b t h w d",
-            t=T,
-            h=H,
-            w=W,
-        )
-        x_B_T_H_W_D = x_B_T_H_W_D + gate_self_attn_B_T_1_1_D * result_B_T_H_W_D
-
-        def _x_fn(
-            _x_B_T_H_W_D: torch.Tensor,
-            layer_norm_cross_attn: Callable,
-            _scale_cross_attn_B_T_1_1_D: torch.Tensor,
-            _shift_cross_attn_B_T_1_1_D: torch.Tensor,
-        ) -> torch.Tensor:
-            _normalized_x_B_T_H_W_D = _fn(
-                _x_B_T_H_W_D, layer_norm_cross_attn, _scale_cross_attn_B_T_1_1_D, _shift_cross_attn_B_T_1_1_D
-            )
-            _result_B_T_H_W_D = rearrange(
-                self.cross_attn(
-                    rearrange(_normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
-                    crossattn_emb,
-                    rope_emb=rope_emb_L_1_1_D,
-                ),
-                "b (t h w) d -> b t h w d",
-                t=T,
-                h=H,
-                w=W,
-            )
-            return _result_B_T_H_W_D
-
-        result_B_T_H_W_D = _x_fn(
-            x_B_T_H_W_D,
-            self.layer_norm_cross_attn,
-            scale_cross_attn_B_T_1_1_D,
-            shift_cross_attn_B_T_1_1_D,
-        )
-        x_B_T_H_W_D = result_B_T_H_W_D * gate_cross_attn_B_T_1_1_D + x_B_T_H_W_D
-
-        normalized_x_B_T_H_W_D = _fn(
-            x_B_T_H_W_D,
-            self.layer_norm_mlp,
-            scale_mlp_B_T_1_1_D,
-            shift_mlp_B_T_1_1_D,
-        )
-        result_B_T_H_W_D = self.mlp(normalized_x_B_T_H_W_D)
-        x_B_T_H_W_D = x_B_T_H_W_D + gate_mlp_B_T_1_1_D * result_B_T_H_W_D
-        return x_B_T_H_W_D
-
-
-class MiniTrainDIT(nn.Module):
-    """
-    A clean impl of DIT that can load and  reproduce the training results of the original DIT model in~(cosmos 1)
-    A general implementation of adaln-modulated VIT-like~(DiT) transformer for video processing.
-
-    Args:
-        max_img_h (int): Maximum height of the input images.
-        max_img_w (int): Maximum width of the input images.
-        max_frames (int): Maximum number of frames in the video sequence.
-        in_channels (int): Number of input channels (e.g., RGB channels for color images).
-        out_channels (int): Number of output channels.
-        patch_spatial (tuple): Spatial resolution of patches for input processing.
-        patch_temporal (int): Temporal resolution of patches for input processing.
-        concat_padding_mask (bool): If True, includes a mask channel in the input to handle padding.
-        model_channels (int): Base number of channels used throughout the model.
-        num_blocks (int): Number of transformer blocks.
-        num_heads (int): Number of heads in the multi-head attention layers.
-        mlp_ratio (float): Expansion ratio for MLP blocks.
-        crossattn_emb_channels (int): Number of embedding channels for cross-attention.
-        pos_emb_cls (str): Type of positional embeddings.
-        pos_emb_learnable (bool): Whether positional embeddings are learnable.
-        pos_emb_interpolation (str): Method for interpolating positional embeddings.
-        min_fps (int): Minimum frames per second.
-        max_fps (int): Maximum frames per second.
-        use_adaln_lora (bool): Whether to use AdaLN-LoRA.
-        adaln_lora_dim (int): Dimension for AdaLN-LoRA.
-        rope_h_extrapolation_ratio (float): Height extrapolation ratio for RoPE.
-        rope_w_extrapolation_ratio (float): Width extrapolation ratio for RoPE.
-        rope_t_extrapolation_ratio (float): Temporal extrapolation ratio for RoPE.
-        extra_per_block_abs_pos_emb (bool): Whether to use extra per-block absolute positional embeddings.
-        extra_h_extrapolation_ratio (float): Height extrapolation ratio for extra embeddings.
-        extra_w_extrapolation_ratio (float): Width extrapolation ratio for extra embeddings.
-        extra_t_extrapolation_ratio (float): Temporal extrapolation ratio for extra embeddings.
-    """
-
-    def __init__(
-        self,
-        max_img_h: int,
-        max_img_w: int,
-        max_frames: int,
-        in_channels: int,
-        out_channels: int,
-        patch_spatial: int,  # tuple,
-        patch_temporal: int,
-        concat_padding_mask: bool = True,
-        # attention settings
-        model_channels: int = 768,
-        num_blocks: int = 10,
-        num_heads: int = 16,
-        mlp_ratio: float = 4.0,
-        # cross attention settings
-        crossattn_emb_channels: int = 1024,
-        # positional embedding settings
-        pos_emb_cls: str = "sincos",
-        pos_emb_learnable: bool = False,
-        pos_emb_interpolation: str = "crop",
-        min_fps: int = 1,
-        max_fps: int = 30,
-        use_adaln_lora: bool = False,
-        adaln_lora_dim: int = 256,
-        rope_h_extrapolation_ratio: float = 1.0,
-        rope_w_extrapolation_ratio: float = 1.0,
-        rope_t_extrapolation_ratio: float = 1.0,
-        extra_per_block_abs_pos_emb: bool = False,
-        extra_h_extrapolation_ratio: float = 1.0,
-        extra_w_extrapolation_ratio: float = 1.0,
-        extra_t_extrapolation_ratio: float = 1.0,
-        rope_enable_fps_modulation: bool = True,
-        image_model=None,
-        device=None,
-        dtype=None,
-        operations=None,
-    ) -> None:
-        super().__init__()
-        self.dtype = dtype
-        self.max_img_h = max_img_h
-        self.max_img_w = max_img_w
-        self.max_frames = max_frames
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.patch_spatial = patch_spatial
-        self.patch_temporal = patch_temporal
-        self.num_heads = num_heads
-        self.num_blocks = num_blocks
-        self.model_channels = model_channels
-        self.concat_padding_mask = concat_padding_mask
-        # positional embedding settings
-        self.pos_emb_cls = pos_emb_cls
-        self.pos_emb_learnable = pos_emb_learnable
-        self.pos_emb_interpolation = pos_emb_interpolation
-        self.min_fps = min_fps
-        self.max_fps = max_fps
-        self.rope_h_extrapolation_ratio = rope_h_extrapolation_ratio
-        self.rope_w_extrapolation_ratio = rope_w_extrapolation_ratio
-        self.rope_t_extrapolation_ratio = rope_t_extrapolation_ratio
-        self.extra_per_block_abs_pos_emb = extra_per_block_abs_pos_emb
-        self.extra_h_extrapolation_ratio = extra_h_extrapolation_ratio
-        self.extra_w_extrapolation_ratio = extra_w_extrapolation_ratio
-        self.extra_t_extrapolation_ratio = extra_t_extrapolation_ratio
-        self.rope_enable_fps_modulation = rope_enable_fps_modulation
-
-        self.build_pos_embed(device=device, dtype=dtype)
-        self.use_adaln_lora = use_adaln_lora
-        self.adaln_lora_dim = adaln_lora_dim
-        self.t_embedder = nn.Sequential(
-            Timesteps(model_channels),
-            TimestepEmbedding(model_channels, model_channels, use_adaln_lora=use_adaln_lora, device=device, dtype=dtype, operations=operations,),
-        )
-
-        in_channels = in_channels + 1 if concat_padding_mask else in_channels
-        self.x_embedder = PatchEmbed(
-            spatial_patch_size=patch_spatial,
-            temporal_patch_size=patch_temporal,
-            in_channels=in_channels,
-            out_channels=model_channels,
-            device=device, dtype=dtype, operations=operations,
-        )
-
-        self.blocks = nn.ModuleList(
-            [
-                Block(
-                    x_dim=model_channels,
-                    context_dim=crossattn_emb_channels,
-                    num_heads=num_heads,
-                    mlp_ratio=mlp_ratio,
-                    use_adaln_lora=use_adaln_lora,
-                    adaln_lora_dim=adaln_lora_dim,
-                    device=device, dtype=dtype, operations=operations,
-                )
-                for _ in range(num_blocks)
-            ]
-        )
-
-        self.final_layer = FinalLayer(
-            hidden_size=self.model_channels,
-            spatial_patch_size=self.patch_spatial,
-            temporal_patch_size=self.patch_temporal,
-            out_channels=self.out_channels,
-            use_adaln_lora=self.use_adaln_lora,
-            adaln_lora_dim=self.adaln_lora_dim,
-            device=device, dtype=dtype, operations=operations,
-        )
-
-        self.t_embedding_norm = operations.RMSNorm(model_channels, eps=1e-6, device=device, dtype=dtype)
-
-    def build_pos_embed(self, device=None, dtype=None) -> None:
-        if self.pos_emb_cls == "rope3d":
-            cls_type = VideoRopePosition3DEmb
-        else:
-            raise ValueError(f"Unknown pos_emb_cls {self.pos_emb_cls}")
-
-        logging.debug(f"Building positional embedding with {self.pos_emb_cls} class, impl {cls_type}")
-        kwargs = dict(
-            model_channels=self.model_channels,
-            len_h=self.max_img_h // self.patch_spatial,
-            len_w=self.max_img_w // self.patch_spatial,
-            len_t=self.max_frames // self.patch_temporal,
-            max_fps=self.max_fps,
-            min_fps=self.min_fps,
-            is_learnable=self.pos_emb_learnable,
-            interpolation=self.pos_emb_interpolation,
-            head_dim=self.model_channels // self.num_heads,
-            h_extrapolation_ratio=self.rope_h_extrapolation_ratio,
-            w_extrapolation_ratio=self.rope_w_extrapolation_ratio,
-            t_extrapolation_ratio=self.rope_t_extrapolation_ratio,
-            enable_fps_modulation=self.rope_enable_fps_modulation,
-            device=device,
-        )
-        self.pos_embedder = cls_type(
-            **kwargs,  # type: ignore
-        )
-
-        if self.extra_per_block_abs_pos_emb:
-            kwargs["h_extrapolation_ratio"] = self.extra_h_extrapolation_ratio
-            kwargs["w_extrapolation_ratio"] = self.extra_w_extrapolation_ratio
-            kwargs["t_extrapolation_ratio"] = self.extra_t_extrapolation_ratio
-            kwargs["device"] = device
-            kwargs["dtype"] = dtype
-            self.extra_pos_embedder = LearnablePosEmbAxis(
-                **kwargs,  # type: ignore
-            )
-
-    def prepare_embedded_sequence(
-        self,
-        x_B_C_T_H_W: torch.Tensor,
-        fps: Optional[torch.Tensor] = None,
-        padding_mask: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
-        """
-        Prepares an embedded sequence tensor by applying positional embeddings and handling padding masks.
-
-        Args:
-            x_B_C_T_H_W (torch.Tensor): video
-            fps (Optional[torch.Tensor]): Frames per second tensor to be used for positional embedding when required.
-                                    If None, a default value (`self.base_fps`) will be used.
-            padding_mask (Optional[torch.Tensor]): current it is not used
-
-        Returns:
-            Tuple[torch.Tensor, Optional[torch.Tensor]]:
-                - A tensor of shape (B, T, H, W, D) with the embedded sequence.
-                - An optional positional embedding tensor, returned only if the positional embedding class
-                (`self.pos_emb_cls`) includes 'rope'. Otherwise, None.
-
-        Notes:
-            - If `self.concat_padding_mask` is True, a padding mask channel is concatenated to the input tensor.
-            - The method of applying positional embeddings depends on the value of `self.pos_emb_cls`.
-            - If 'rope' is in `self.pos_emb_cls` (case insensitive), the positional embeddings are generated using
-                the `self.pos_embedder` with the shape [T, H, W].
-            - If "fps_aware" is in `self.pos_emb_cls`, the positional embeddings are generated using the
-            `self.pos_embedder` with the fps tensor.
-            - Otherwise, the positional embeddings are generated without considering fps.
-        """
-        if self.concat_padding_mask:
-            if padding_mask is None:
-                padding_mask = torch.zeros(x_B_C_T_H_W.shape[0], 1, x_B_C_T_H_W.shape[3], x_B_C_T_H_W.shape[4], dtype=x_B_C_T_H_W.dtype, device=x_B_C_T_H_W.device)
-            else:
-                padding_mask = transforms.functional.resize(
-                    padding_mask, list(x_B_C_T_H_W.shape[-2:]), interpolation=transforms.InterpolationMode.NEAREST
-                )
-            x_B_C_T_H_W = torch.cat(
-                [x_B_C_T_H_W, padding_mask.unsqueeze(1).repeat(1, 1, x_B_C_T_H_W.shape[2], 1, 1)], dim=1
-            )
-        x_B_T_H_W_D = self.x_embedder(x_B_C_T_H_W)
-
-        if self.extra_per_block_abs_pos_emb:
-            extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device, dtype=x_B_C_T_H_W.dtype)
-        else:
-            extra_pos_emb = None
-
-        if "rope" in self.pos_emb_cls.lower():
-            return x_B_T_H_W_D, self.pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device), extra_pos_emb
-        x_B_T_H_W_D = x_B_T_H_W_D + self.pos_embedder(x_B_T_H_W_D, device=x_B_C_T_H_W.device)  # [B, T, H, W, D]
-
-        return x_B_T_H_W_D, None, extra_pos_emb
-
-    def unpatchify(self, x_B_T_H_W_M: torch.Tensor) -> torch.Tensor:
-        x_B_C_Tt_Hp_Wp = rearrange(
-            x_B_T_H_W_M,
-            "B T H W (p1 p2 t C) -> B C (T t) (H p1) (W p2)",
-            p1=self.patch_spatial,
-            p2=self.patch_spatial,
-            t=self.patch_temporal,
-        )
-        return x_B_C_Tt_Hp_Wp
-
-    def forward(self,
-        x: torch.Tensor,
-        timesteps: torch.Tensor,
-        context: torch.Tensor,
-        fps: Optional[torch.Tensor] = None,
-        padding_mask: Optional[torch.Tensor] = None,
-        **kwargs,
-    ):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, kwargs.get("transformer_options", {}))
-        ).execute(x, timesteps, context, fps, padding_mask, **kwargs)
-
-    def _forward(
-        self,
-        x: torch.Tensor,
-        timesteps: torch.Tensor,
-        context: torch.Tensor,
-        fps: Optional[torch.Tensor] = None,
-        padding_mask: Optional[torch.Tensor] = None,
-        **kwargs,
-    ):
-        x_B_C_T_H_W = x
-        timesteps_B_T = timesteps
-        crossattn_emb = context
-        """
-        Args:
-            x: (B, C, T, H, W) tensor of spatial-temp inputs
-            timesteps: (B, ) tensor of timesteps
-            crossattn_emb: (B, N, D) tensor of cross-attention embeddings
-        """
-        x_B_T_H_W_D, rope_emb_L_1_1_D, extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D = self.prepare_embedded_sequence(
-            x_B_C_T_H_W,
-            fps=fps,
-            padding_mask=padding_mask,
-        )
-
-        if timesteps_B_T.ndim == 1:
-            timesteps_B_T = timesteps_B_T.unsqueeze(1)
-        t_embedding_B_T_D, adaln_lora_B_T_3D = self.t_embedder[1](self.t_embedder[0](timesteps_B_T).to(x_B_T_H_W_D.dtype))
-        t_embedding_B_T_D = self.t_embedding_norm(t_embedding_B_T_D)
-
-        # for logging purpose
-        affline_scale_log_info = {}
-        affline_scale_log_info["t_embedding_B_T_D"] = t_embedding_B_T_D.detach()
-        self.affline_scale_log_info = affline_scale_log_info
-        self.affline_emb = t_embedding_B_T_D
-        self.crossattn_emb = crossattn_emb
-
-        if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
-            assert (
-                x_B_T_H_W_D.shape == extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape
-            ), f"{x_B_T_H_W_D.shape} != {extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape}"
-
-        block_kwargs = {
-            "rope_emb_L_1_1_D": rope_emb_L_1_1_D.unsqueeze(1).unsqueeze(0),
-            "adaln_lora_B_T_3D": adaln_lora_B_T_3D,
-            "extra_per_block_pos_emb": extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D,
-        }
-        for block in self.blocks:
-            x_B_T_H_W_D = block(
-                x_B_T_H_W_D,
-                t_embedding_B_T_D,
-                crossattn_emb,
-                **block_kwargs,
-            )
-
-        x_B_T_H_W_O = self.final_layer(x_B_T_H_W_D, t_embedding_B_T_D, adaln_lora_B_T_3D=adaln_lora_B_T_3D)
-        x_B_C_Tt_Hp_Wp = self.unpatchify(x_B_T_H_W_O)
-        return x_B_C_Tt_Hp_Wp
--- a/comfy/ldm/cosmos/vae.py
+++ b/comfy/ldm/cosmos/vae.py
@@ -18,7 +18,6 @@ import logging
 import torch
 from torch import nn
 from enum import Enum
-import math

 from .cosmos_tokenizer.layers3d import (
    EncoderFactorized,
@@ -90,8 +89,8 @@ class CausalContinuousVideoTokenizer(nn.Module):
        self.distribution = IdentityDistribution()  # ContinuousFormulation[formulation_name].value()

        num_parameters = sum(param.numel() for param in self.parameters())
-        logging.debug(f"model={self.name}, num_parameters={num_parameters:,}")
-        logging.debug(
+        logging.info(f"model={self.name}, num_parameters={num_parameters:,}")
+        logging.info(
            f"z_channels={z_channels}, latent_channels={self.latent_channels}."
        )

@@ -106,23 +105,17 @@ class CausalContinuousVideoTokenizer(nn.Module):
        z, posteriors = self.distribution(moments)
        latent_ch = z.shape[1]
        latent_t = z.shape[2]
-        in_dtype = z.dtype
-        mean = self.latent_mean.view(latent_ch, -1)
-        std = self.latent_std.view(latent_ch, -1)
-
-        mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
-        std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
+        dtype = z.dtype
+        mean = self.latent_mean.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=dtype, device=z.device)
+        std = self.latent_std.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=dtype, device=z.device)
        return ((z - mean) / std) * self.sigma_data

    def decode(self, z):
        in_dtype = z.dtype
        latent_ch = z.shape[1]
        latent_t = z.shape[2]
-        mean = self.latent_mean.view(latent_ch, -1)
-        std = self.latent_std.view(latent_ch, -1)
-
-        mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
-        std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
+        mean = self.latent_mean.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
+        std = self.latent_std.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)

        z = z / self.sigma_data
        z = z * std + mean
--- a/comfy/ldm/flux/controlnet.py
+++ b/comfy/ldm/flux/controlnet.py
@@ -121,11 +121,6 @@ class ControlNetFlux(Flux):
        if img.ndim != 3 or txt.ndim != 3:
            raise ValueError("Input img and txt tensors must have 3 dimensions.")

-        if y is None:
-            y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
-        else:
-            y = y[:, :self.params.vec_in_dim]
-
        # running on sequences img
        img = self.img_in(img)

@@ -179,7 +174,7 @@ class ControlNetFlux(Flux):
            out["output"] = out_output[:self.main_model_single]
        return out

-    def forward(self, x, timesteps, context, y=None, guidance=None, hint=None, **kwargs):
+    def forward(self, x, timesteps, context, y, guidance=None, hint=None, **kwargs):
        patch_size = 2
        if self.latent_input:
            hint = comfy.ldm.common_dit.pad_to_patch_size(hint, (patch_size, patch_size))
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@@ -105,9 +105,7 @@ class Modulation(nn.Module):
        self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)

    def forward(self, vec: Tensor) -> tuple:
-        if vec.ndim == 2:
-            vec = vec[:, None, :]
-        out = self.lin(nn.functional.silu(vec)).chunk(self.multiplier, dim=-1)
+        out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)

        return (
            ModulationOut(*out[:3]),
@@ -115,20 +113,6 @@ class Modulation(nn.Module):
        )


-def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
-    if modulation_dims is None:
-        if m_add is not None:
-            return torch.addcmul(m_add, tensor, m_mult)
-        else:
-            return tensor * m_mult
-    else:
-        for d in modulation_dims:
-            tensor[:, d[0]:d[1]] *= m_mult[:, d[2]]
-            if m_add is not None:
-                tensor[:, d[0]:d[1]] += m_add[:, d[2]]
-        return tensor
-
-
 class DoubleStreamBlock(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
        super().__init__()
@@ -159,20 +143,20 @@ class DoubleStreamBlock(nn.Module):
        )
        self.flipped_img_txt = flipped_img_txt

-    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None):
+    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None):
        img_mod1, img_mod2 = self.img_mod(vec)
        txt_mod1, txt_mod2 = self.txt_mod(vec)

        # prepare image for attention
        img_modulated = self.img_norm1(img)
-        img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
+        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
-        txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt)
+        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
@@ -195,12 +179,12 @@ class DoubleStreamBlock(nn.Module):
            txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]

        # calculate the img bloks
-        img = img + apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
-        img = img + apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)
+        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
+        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)

        # calculate the txt bloks
-        txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt)
-        txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt)
+        txt += txt_mod1.gate * self.txt_attn.proj(txt_attn)
+        txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)

        if txt.dtype == torch.float16:
            txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
@@ -244,9 +228,10 @@ class SingleStreamBlock(nn.Module):
        self.mlp_act = nn.GELU(approximate="tanh")
        self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)

-    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None) -> Tensor:
+    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None) -> Tensor:
        mod, _ = self.modulation(vec)
-        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
+        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)

        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k = self.norm(q, k, v)
@@ -255,7 +240,7 @@ class SingleStreamBlock(nn.Module):
        attn = attention(q, k, v, pe=pe, mask=attn_mask)
        # compute activation in mlp stream, cat again and run second linear layer
        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
-        x += apply_mod(output, mod.gate, None, modulation_dims)
+        x += mod.gate * output
        if x.dtype == torch.float16:
            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
        return x
@@ -268,11 +253,8 @@ class LastLayer(nn.Module):
        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))

-    def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
-        if vec.ndim == 2:
-            vec = vec[:, None, :]
-
-        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=-1)
-        x = apply_mod(self.norm_final(x), (1 + scale), shift, modulation_dims)
+    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
+        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
+        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
        x = self.linear(x)
        return x
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@@ -5,16 +5,8 @@ from torch import Tensor
 from comfy.ldm.modules.attention import optimized_attention
 import comfy.model_management

-
 def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor:
-    q_shape = q.shape
-    k_shape = k.shape
-
-    if pe is not None:
-        q = q.to(dtype=pe.dtype).reshape(*q.shape[:-1], -1, 1, 2)
-        k = k.to(dtype=pe.dtype).reshape(*k.shape[:-1], -1, 1, 2)
-        q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v)
-        k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v)
+    q, k = apply_rope(q, k, pe)

    heads = q.shape[1]
    x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask)
@@ -23,7 +15,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor:

 def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
-    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled():
+    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu():
        device = torch.device("cpu")
    else:
        device = pos.device
@@ -37,8 +29,8 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:


 def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
-    xq_ = xq.to(dtype=freqs_cis.dtype).reshape(*xq.shape[:-1], -1, 1, 2)
-    xk_ = xk.to(dtype=freqs_cis.dtype).reshape(*xk.shape[:-1], -1, 1, 2)
+    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
+    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@@ -6,7 +6,6 @@ import torch
 from torch import Tensor, nn
 from einops import rearrange, repeat
 import comfy.ldm.common_dit
-import comfy.patcher_extension

 from .layers import (
    DoubleStreamBlock,
@@ -102,10 +101,6 @@ class Flux(nn.Module):
        transformer_options={},
        attn_mask: Tensor = None,
    ) -> Tensor:
-
-        if y is None:
-            y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
-
        patches_replace = transformer_options.get("patches_replace", {})
        if img.ndim != 3 or txt.ndim != 3:
            raise ValueError("Input img and txt tensors must have 3 dimensions.")
@@ -114,17 +109,15 @@ class Flux(nn.Module):
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256).to(img.dtype))
        if self.params.guidance_embed:
-            if guidance is not None:
-                vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
+            if guidance is None:
+                raise ValueError("Didn't get guidance strength for guidance distilled model.")
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))

        vec = vec + self.vector_in(y[:,:self.params.vec_in_dim])
        txt = self.txt_in(txt)

-        if img_ids is not None:
-            ids = torch.cat((txt_ids, img_ids), dim=1)
-            pe = self.pe_embedder(ids)
-        else:
-            pe = None
+        ids = torch.cat((txt_ids, img_ids), dim=1)
+        pe = self.pe_embedder(ids)

        blocks_replace = patches_replace.get("dit", {})
        for i, block in enumerate(self.double_blocks):
@@ -158,10 +151,7 @@ class Flux(nn.Module):
                if i < len(control_i):
                    add = control_i[i]
                    if add is not None:
-                        img[:, :add.shape[1]] += add
-
-        if img.dtype == torch.float16:
-            img = torch.nan_to_num(img, nan=0.0, posinf=65504, neginf=-65504)
+                        img += add

        img = torch.cat((txt, img), 1)

@@ -189,78 +179,27 @@ class Flux(nn.Module):
                if i < len(control_o):
                    add = control_o[i]
                    if add is not None:
-                        img[:, txt.shape[1] : txt.shape[1] + add.shape[1], ...] += add
+                        img[:, txt.shape[1] :, ...] += add

        img = img[:, txt.shape[1] :, ...]

        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        return img

-    def process_img(self, x, index=0, h_offset=0, w_offset=0):
+    def forward(self, x, timestep, context, y, guidance, control=None, transformer_options={}, **kwargs):
        bs, c, h, w = x.shape
        patch_size = self.patch_size
        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))

        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
+
        h_len = ((h + (patch_size // 2)) // patch_size)
        w_len = ((w + (patch_size // 2)) // patch_size)
-
-        h_offset = ((h_offset + (patch_size // 2)) // patch_size)
-        w_offset = ((w_offset + (patch_size // 2)) // patch_size)
-
        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
-        img_ids[:, :, 0] = img_ids[:, :, 1] + index
-        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
-        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
-        return img, repeat(img_ids, "h w c -> b (h w) c", b=bs)
-
-    def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, y, guidance, ref_latents, control, transformer_options, **kwargs)
-
-    def _forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs):
-        bs, c, h_orig, w_orig = x.shape
-        patch_size = self.patch_size
-
-        h_len = ((h_orig + (patch_size // 2)) // patch_size)
-        w_len = ((w_orig + (patch_size // 2)) // patch_size)
-        img, img_ids = self.process_img(x)
-        img_tokens = img.shape[1]
-        if ref_latents is not None:
-            h = 0
-            w = 0
-            index = 0
-            ref_latents_method = kwargs.get("ref_latents_method", "offset")
-            for ref in ref_latents:
-                if ref_latents_method == "index":
-                    index += 1
-                    h_offset = 0
-                    w_offset = 0
-                elif ref_latents_method == "uso":
-                    index = 0
-                    h_offset = h_len * patch_size + h
-                    w_offset = w_len * patch_size + w
-                    h += ref.shape[-2]
-                    w += ref.shape[-1]
-                else:
-                    index = 1
-                    h_offset = 0
-                    w_offset = 0
-                    if ref.shape[-2] + h > ref.shape[-1] + w:
-                        w_offset = w
-                    else:
-                        h_offset = h
-                    h = max(h, ref.shape[-2] + h_offset)
-                    w = max(w, ref.shape[-1] + w_offset)
-
-                kontext, kontext_ids = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset)
-                img = torch.cat([img, kontext], dim=1)
-                img_ids = torch.cat([img_ids, kontext_ids], dim=1)
+        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
+        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
+        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)

        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
-        out = out[:, :img_tokens]
-        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h_orig,:w_orig]
+        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w]
--- a/comfy/ldm/genmo/joint_model/asymm_models_joint.py
+++ b/comfy/ldm/genmo/joint_model/asymm_models_joint.py
@@ -13,6 +13,7 @@ from comfy.ldm.modules.attention import optimized_attention
 from .layers import (
    FeedForward,
    PatchEmbed,
+    RMSNorm,
    TimestepEmbedder,
 )

@@ -89,10 +90,10 @@ class AsymmetricAttention(nn.Module):

        # Query and key normalization for stability.
        assert qk_norm
-        self.q_norm_x = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
-        self.k_norm_x = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
-        self.q_norm_y = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
-        self.k_norm_y = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
+        self.q_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.k_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.q_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.k_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)

        # Output layers. y features go back down from dim_x -> dim_y.
        self.proj_x = operations.Linear(dim_x, dim_x, bias=out_bias, device=device, dtype=dtype)
--- a/comfy/ldm/genmo/joint_model/layers.py
+++ b/comfy/ldm/genmo/joint_model/layers.py
@@ -151,3 +151,14 @@ class PatchEmbed(nn.Module):

        x = self.norm(x)
        return x
+
+
+class RMSNorm(torch.nn.Module):
+    def __init__(self, hidden_size, eps=1e-5, device=None, dtype=None):
+        super().__init__()
+        self.eps = eps
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype))
+        self.register_parameter("bias", None)
+
+    def forward(self, x):
+        return comfy.ldm.common_dit.rms_norm(x, self.weight, self.eps)
--- a/comfy/ldm/hidream/model.py
+++ b/comfy/ldm/hidream/model.py
@@ -1,819 +0,0 @@
-from typing import Optional, Tuple, List
-
-import torch
-import torch.nn as nn
-import einops
-from einops import repeat
-
-from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
-import torch.nn.functional as F
-
-from comfy.ldm.flux.math import apply_rope, rope
-from comfy.ldm.flux.layers import LastLayer
-
-from comfy.ldm.modules.attention import optimized_attention
-import comfy.model_management
-import comfy.patcher_extension
-import comfy.ldm.common_dit
-
-
-# Copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
-class EmbedND(nn.Module):
-    def __init__(self, theta: int, axes_dim: List[int]):
-        super().__init__()
-        self.theta = theta
-        self.axes_dim = axes_dim
-
-    def forward(self, ids: torch.Tensor) -> torch.Tensor:
-        n_axes = ids.shape[-1]
-        emb = torch.cat(
-            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
-            dim=-3,
-        )
-        return emb.unsqueeze(2)
-
-
-class PatchEmbed(nn.Module):
-    def __init__(
-        self,
-        patch_size=2,
-        in_channels=4,
-        out_channels=1024,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.patch_size = patch_size
-        self.out_channels = out_channels
-        self.proj = operations.Linear(in_channels * patch_size * patch_size, out_channels, bias=True, dtype=dtype, device=device)
-
-    def forward(self, latent):
-        latent = self.proj(latent)
-        return latent
-
-
-class PooledEmbed(nn.Module):
-    def __init__(self, text_emb_dim, hidden_size, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.pooled_embedder = TimestepEmbedding(in_channels=text_emb_dim, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
-
-    def forward(self, pooled_embed):
-        return self.pooled_embedder(pooled_embed)
-
-
-class TimestepEmbed(nn.Module):
-    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.time_proj = Timesteps(num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(in_channels=frequency_embedding_size, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
-
-    def forward(self, timesteps, wdtype):
-        t_emb = self.time_proj(timesteps).to(dtype=wdtype)
-        t_emb = self.timestep_embedder(t_emb)
-        return t_emb
-
-
-def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
-    return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2])
-
-
-class HiDreamAttnProcessor_flashattn:
-    """Attention processor used typically in processing the SD3-like self-attention projections."""
-
-    def __call__(
-        self,
-        attn,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        rope: torch.FloatTensor = None,
-        *args,
-        **kwargs,
-    ) -> torch.FloatTensor:
-        dtype = image_tokens.dtype
-        batch_size = image_tokens.shape[0]
-
-        query_i = attn.q_rms_norm(attn.to_q(image_tokens)).to(dtype=dtype)
-        key_i = attn.k_rms_norm(attn.to_k(image_tokens)).to(dtype=dtype)
-        value_i = attn.to_v(image_tokens)
-
-        inner_dim = key_i.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        query_i = query_i.view(batch_size, -1, attn.heads, head_dim)
-        key_i = key_i.view(batch_size, -1, attn.heads, head_dim)
-        value_i = value_i.view(batch_size, -1, attn.heads, head_dim)
-        if image_tokens_masks is not None:
-            key_i = key_i * image_tokens_masks.view(batch_size, -1, 1, 1)
-
-        if not attn.single:
-            query_t = attn.q_rms_norm_t(attn.to_q_t(text_tokens)).to(dtype=dtype)
-            key_t = attn.k_rms_norm_t(attn.to_k_t(text_tokens)).to(dtype=dtype)
-            value_t = attn.to_v_t(text_tokens)
-
-            query_t = query_t.view(batch_size, -1, attn.heads, head_dim)
-            key_t = key_t.view(batch_size, -1, attn.heads, head_dim)
-            value_t = value_t.view(batch_size, -1, attn.heads, head_dim)
-
-            num_image_tokens = query_i.shape[1]
-            num_text_tokens = query_t.shape[1]
-            query = torch.cat([query_i, query_t], dim=1)
-            key = torch.cat([key_i, key_t], dim=1)
-            value = torch.cat([value_i, value_t], dim=1)
-        else:
-            query = query_i
-            key = key_i
-            value = value_i
-
-        if query.shape[-1] == rope.shape[-3] * 2:
-            query, key = apply_rope(query, key, rope)
-        else:
-            query_1, query_2 = query.chunk(2, dim=-1)
-            key_1, key_2 = key.chunk(2, dim=-1)
-            query_1, key_1 = apply_rope(query_1, key_1, rope)
-            query = torch.cat([query_1, query_2], dim=-1)
-            key = torch.cat([key_1, key_2], dim=-1)
-
-        hidden_states = attention(query, key, value)
-
-        if not attn.single:
-            hidden_states_i, hidden_states_t = torch.split(hidden_states, [num_image_tokens, num_text_tokens], dim=1)
-            hidden_states_i = attn.to_out(hidden_states_i)
-            hidden_states_t = attn.to_out_t(hidden_states_t)
-            return hidden_states_i, hidden_states_t
-        else:
-            hidden_states = attn.to_out(hidden_states)
-            return hidden_states
-
-class HiDreamAttention(nn.Module):
-    def __init__(
-        self,
-        query_dim: int,
-        heads: int = 8,
-        dim_head: int = 64,
-        upcast_attention: bool = False,
-        upcast_softmax: bool = False,
-        scale_qk: bool = True,
-        eps: float = 1e-5,
-        processor = None,
-        out_dim: int = None,
-        single: bool = False,
-        dtype=None, device=None, operations=None
-    ):
-        # super(Attention, self).__init__()
-        super().__init__()
-        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
-        self.query_dim = query_dim
-        self.upcast_attention = upcast_attention
-        self.upcast_softmax = upcast_softmax
-        self.out_dim = out_dim if out_dim is not None else query_dim
-
-        self.scale_qk = scale_qk
-        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
-
-        self.heads = out_dim // dim_head if out_dim is not None else heads
-        self.sliceable_head_dim = heads
-        self.single = single
-
-        linear_cls = operations.Linear
-        self.linear_cls = linear_cls
-        self.to_q = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
-        self.to_k = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-        self.to_v = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-        self.to_out = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
-        self.q_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-        self.k_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-
-        if not single:
-            self.to_q_t = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
-            self.to_k_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-            self.to_v_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-            self.to_out_t = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
-            self.q_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-            self.k_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-
-        self.processor = processor
-
-    def forward(
-        self,
-        norm_image_tokens: torch.FloatTensor,
-        image_tokens_masks: torch.FloatTensor = None,
-        norm_text_tokens: torch.FloatTensor = None,
-        rope: torch.FloatTensor = None,
-    ) -> torch.Tensor:
-        return self.processor(
-            self,
-            image_tokens = norm_image_tokens,
-            image_tokens_masks = image_tokens_masks,
-            text_tokens = norm_text_tokens,
-            rope = rope,
-        )
-
-
-class FeedForwardSwiGLU(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-        multiple_of: int = 256,
-        ffn_dim_multiplier: Optional[float] = None,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        hidden_dim = int(2 * hidden_dim / 3)
-        # custom dim factor multiplier
-        if ffn_dim_multiplier is not None:
-            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
-        hidden_dim = multiple_of * (
-            (hidden_dim + multiple_of - 1) // multiple_of
-        )
-
-        self.w1 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
-        self.w2 = operations.Linear(hidden_dim, dim, bias=False, dtype=dtype, device=device)
-        self.w3 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
-
-    def forward(self, x):
-        return self.w2(torch.nn.functional.silu(self.w1(x)) * self.w3(x))
-
-
-# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
-class MoEGate(nn.Module):
-    def __init__(self, embed_dim, num_routed_experts=4, num_activated_experts=2, aux_loss_alpha=0.01, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.top_k = num_activated_experts
-        self.n_routed_experts = num_routed_experts
-
-        self.scoring_func = 'softmax'
-        self.alpha = aux_loss_alpha
-        self.seq_aux = False
-
-        # topk selection algorithm
-        self.norm_topk_prob = False
-        self.gating_dim = embed_dim
-        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim), dtype=dtype, device=device))
-        self.reset_parameters()
-
-    def reset_parameters(self) -> None:
-        pass
-        # import torch.nn.init  as init
-        # init.kaiming_uniform_(self.weight, a=math.sqrt(5))
-
-    def forward(self, hidden_states):
-        bsz, seq_len, h = hidden_states.shape
-
-        ### compute gating score
-        hidden_states = hidden_states.view(-1, h)
-        logits = F.linear(hidden_states, comfy.model_management.cast_to(self.weight, dtype=hidden_states.dtype, device=hidden_states.device), None)
-        if self.scoring_func == 'softmax':
-            scores = logits.softmax(dim=-1)
-        else:
-            raise NotImplementedError(f'insupportable scoring function for MoE gating: {self.scoring_func}')
-
-        ### select top-k experts
-        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
-
-        ### norm gate to sum 1
-        if self.top_k > 1 and self.norm_topk_prob:
-            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
-            topk_weight = topk_weight / denominator
-
-        aux_loss = None
-        return topk_idx, topk_weight, aux_loss
-
-
-# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
-class MOEFeedForwardSwiGLU(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-        num_routed_experts: int,
-        num_activated_experts: int,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.shared_experts = FeedForwardSwiGLU(dim, hidden_dim // 2, dtype=dtype, device=device, operations=operations)
-        self.experts = nn.ModuleList([FeedForwardSwiGLU(dim, hidden_dim, dtype=dtype, device=device, operations=operations) for i in range(num_routed_experts)])
-        self.gate = MoEGate(
-            embed_dim = dim,
-            num_routed_experts = num_routed_experts,
-            num_activated_experts = num_activated_experts,
-            dtype=dtype, device=device, operations=operations
-        )
-        self.num_activated_experts = num_activated_experts
-
-    def forward(self, x):
-        wtype = x.dtype
-        identity = x
-        orig_shape = x.shape
-        topk_idx, topk_weight, aux_loss = self.gate(x)
-        x = x.view(-1, x.shape[-1])
-        flat_topk_idx = topk_idx.view(-1)
-        if True:  # self.training: # TODO: check which branch performs faster
-            x = x.repeat_interleave(self.num_activated_experts, dim=0)
-            y = torch.empty_like(x, dtype=wtype)
-            for i, expert in enumerate(self.experts):
-                y[flat_topk_idx == i] = expert(x[flat_topk_idx == i]).to(dtype=wtype)
-            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
-            y =  y.view(*orig_shape).to(dtype=wtype)
-            #y = AddAuxiliaryLoss.apply(y, aux_loss)
-        else:
-            y = self.moe_infer(x, flat_topk_idx, topk_weight.view(-1, 1)).view(*orig_shape)
-        y = y + self.shared_experts(identity)
-        return y
-
-    @torch.no_grad()
-    def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
-        expert_cache = torch.zeros_like(x)
-        idxs = flat_expert_indices.argsort()
-        tokens_per_expert = flat_expert_indices.bincount().cpu().numpy().cumsum(0)
-        token_idxs = idxs // self.num_activated_experts
-        for i, end_idx in enumerate(tokens_per_expert):
-            start_idx = 0 if i == 0 else tokens_per_expert[i-1]
-            if start_idx == end_idx:
-                continue
-            expert = self.experts[i]
-            exp_token_idx = token_idxs[start_idx:end_idx]
-            expert_tokens = x[exp_token_idx]
-            expert_out = expert(expert_tokens)
-            expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
-
-            # for fp16 and other dtype
-            expert_cache = expert_cache.to(expert_out.dtype)
-            expert_cache.scatter_reduce_(0, exp_token_idx.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce='sum')
-        return expert_cache
-
-
-class TextProjection(nn.Module):
-    def __init__(self, in_features, hidden_size, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.linear = operations.Linear(in_features=in_features, out_features=hidden_size, bias=False, dtype=dtype, device=device)
-
-    def forward(self, caption):
-        hidden_states = self.linear(caption)
-        return hidden_states
-
-
-class BlockType:
-    TransformerBlock = 1
-    SingleTransformerBlock = 2
-
-
-class HiDreamImageSingleTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.num_attention_heads = num_attention_heads
-        self.adaLN_modulation = nn.Sequential(
-            nn.SiLU(),
-            operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device)
-        )
-
-        # 1. Attention
-        self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        self.attn1 = HiDreamAttention(
-            query_dim=dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            processor = HiDreamAttnProcessor_flashattn(),
-            single = True,
-            dtype=dtype, device=device, operations=operations
-        )
-
-        # 3. Feed-forward
-        self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        if num_routed_experts > 0:
-            self.ff_i = MOEFeedForwardSwiGLU(
-                dim = dim,
-                hidden_dim = 4 * dim,
-                num_routed_experts = num_routed_experts,
-                num_activated_experts = num_activated_experts,
-                dtype=dtype, device=device, operations=operations
-            )
-        else:
-            self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
-
-    def forward(
-        self,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        adaln_input: Optional[torch.FloatTensor] = None,
-        rope: torch.FloatTensor = None,
-
-    ) -> torch.FloatTensor:
-        wtype = image_tokens.dtype
-        shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i = \
-            self.adaLN_modulation(adaln_input)[:,None].chunk(6, dim=-1)
-
-        # 1. MM-Attention
-        norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
-        attn_output_i = self.attn1(
-            norm_image_tokens,
-            image_tokens_masks,
-            rope = rope,
-        )
-        image_tokens = gate_msa_i * attn_output_i + image_tokens
-
-        # 2. Feed-forward
-        norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
-        ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens.to(dtype=wtype))
-        image_tokens = ff_output_i + image_tokens
-        return image_tokens
-
-
-class HiDreamImageTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.num_attention_heads = num_attention_heads
-        self.adaLN_modulation = nn.Sequential(
-            nn.SiLU(),
-            operations.Linear(dim, 12 * dim, bias=True, dtype=dtype, device=device)
-        )
-        # nn.init.zeros_(self.adaLN_modulation[1].weight)
-        # nn.init.zeros_(self.adaLN_modulation[1].bias)
-
-        # 1. Attention
-        self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        self.norm1_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        self.attn1 = HiDreamAttention(
-            query_dim=dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            processor = HiDreamAttnProcessor_flashattn(),
-            single = False,
-            dtype=dtype, device=device, operations=operations
-        )
-
-        # 3. Feed-forward
-        self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        if num_routed_experts > 0:
-            self.ff_i = MOEFeedForwardSwiGLU(
-                dim = dim,
-                hidden_dim = 4 * dim,
-                num_routed_experts = num_routed_experts,
-                num_activated_experts = num_activated_experts,
-                dtype=dtype, device=device, operations=operations
-            )
-        else:
-            self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
-        self.norm3_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False)
-        self.ff_t = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
-
-    def forward(
-        self,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        adaln_input: Optional[torch.FloatTensor] = None,
-        rope: torch.FloatTensor = None,
-    ) -> torch.FloatTensor:
-        wtype = image_tokens.dtype
-        shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i, \
-        shift_msa_t, scale_msa_t, gate_msa_t, shift_mlp_t, scale_mlp_t, gate_mlp_t = \
-            self.adaLN_modulation(adaln_input)[:,None].chunk(12, dim=-1)
-
-        # 1. MM-Attention
-        norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
-        norm_text_tokens = self.norm1_t(text_tokens).to(dtype=wtype)
-        norm_text_tokens = norm_text_tokens * (1 + scale_msa_t) + shift_msa_t
-
-        attn_output_i, attn_output_t = self.attn1(
-            norm_image_tokens,
-            image_tokens_masks,
-            norm_text_tokens,
-            rope = rope,
-        )
-
-        image_tokens = gate_msa_i * attn_output_i + image_tokens
-        text_tokens = gate_msa_t * attn_output_t + text_tokens
-
-        # 2. Feed-forward
-        norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
-        norm_text_tokens = self.norm3_t(text_tokens).to(dtype=wtype)
-        norm_text_tokens = norm_text_tokens * (1 + scale_mlp_t) + shift_mlp_t
-
-        ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens)
-        ff_output_t = gate_mlp_t * self.ff_t(norm_text_tokens)
-        image_tokens = ff_output_i + image_tokens
-        text_tokens = ff_output_t + text_tokens
-        return image_tokens, text_tokens
-
-
-class HiDreamImageBlock(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        block_type: BlockType = BlockType.TransformerBlock,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        block_classes = {
-            BlockType.TransformerBlock: HiDreamImageTransformerBlock,
-            BlockType.SingleTransformerBlock: HiDreamImageSingleTransformerBlock,
-        }
-        self.block = block_classes[block_type](
-            dim,
-            num_attention_heads,
-            attention_head_dim,
-            num_routed_experts,
-            num_activated_experts,
-            dtype=dtype, device=device, operations=operations
-        )
-
-    def forward(
-        self,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        adaln_input: torch.FloatTensor = None,
-        rope: torch.FloatTensor = None,
-    ) -> torch.FloatTensor:
-        return self.block(
-            image_tokens,
-            image_tokens_masks,
-            text_tokens,
-            adaln_input,
-            rope,
-        )
-
-
-class HiDreamImageTransformer2DModel(nn.Module):
-    def __init__(
-        self,
-        patch_size: Optional[int] = None,
-        in_channels: int = 64,
-        out_channels: Optional[int] = None,
-        num_layers: int = 16,
-        num_single_layers: int = 32,
-        attention_head_dim: int = 128,
-        num_attention_heads: int = 20,
-        caption_channels: List[int] = None,
-        text_emb_dim: int = 2048,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        axes_dims_rope: Tuple[int, int] = (32, 32),
-        max_resolution: Tuple[int, int] = (128, 128),
-        llama_layers: List[int] = None,
-        image_model=None,
-        dtype=None, device=None, operations=None
-    ):
-        self.patch_size = patch_size
-        self.num_attention_heads = num_attention_heads
-        self.attention_head_dim = attention_head_dim
-        self.num_layers = num_layers
-        self.num_single_layers = num_single_layers
-
-        self.gradient_checkpointing = False
-
-        super().__init__()
-        self.dtype = dtype
-        self.out_channels = out_channels or in_channels
-        self.inner_dim = self.num_attention_heads * self.attention_head_dim
-        self.llama_layers = llama_layers
-
-        self.t_embedder = TimestepEmbed(self.inner_dim, dtype=dtype, device=device, operations=operations)
-        self.p_embedder = PooledEmbed(text_emb_dim, self.inner_dim, dtype=dtype, device=device, operations=operations)
-        self.x_embedder = PatchEmbed(
-            patch_size = patch_size,
-            in_channels = in_channels,
-            out_channels = self.inner_dim,
-            dtype=dtype, device=device, operations=operations
-        )
-        self.pe_embedder = EmbedND(theta=10000, axes_dim=axes_dims_rope)
-
-        self.double_stream_blocks = nn.ModuleList(
-            [
-                HiDreamImageBlock(
-                    dim = self.inner_dim,
-                    num_attention_heads = self.num_attention_heads,
-                    attention_head_dim = self.attention_head_dim,
-                    num_routed_experts = num_routed_experts,
-                    num_activated_experts = num_activated_experts,
-                    block_type = BlockType.TransformerBlock,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for i in range(self.num_layers)
-            ]
-        )
-
-        self.single_stream_blocks = nn.ModuleList(
-            [
-                HiDreamImageBlock(
-                    dim = self.inner_dim,
-                    num_attention_heads = self.num_attention_heads,
-                    attention_head_dim = self.attention_head_dim,
-                    num_routed_experts = num_routed_experts,
-                    num_activated_experts = num_activated_experts,
-                    block_type = BlockType.SingleTransformerBlock,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for i in range(self.num_single_layers)
-            ]
-        )
-
-        self.final_layer = LastLayer(self.inner_dim, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations)
-
-        caption_channels = [caption_channels[1], ] * (num_layers + num_single_layers) + [caption_channels[0], ]
-        caption_projection = []
-        for caption_channel in caption_channels:
-            caption_projection.append(TextProjection(in_features=caption_channel, hidden_size=self.inner_dim, dtype=dtype, device=device, operations=operations))
-        self.caption_projection = nn.ModuleList(caption_projection)
-        self.max_seq = max_resolution[0] * max_resolution[1] // (patch_size * patch_size)
-
-    def expand_timesteps(self, timesteps, batch_size, device):
-        if not torch.is_tensor(timesteps):
-            is_mps = device.type == "mps"
-            if isinstance(timesteps, float):
-                dtype = torch.float32 if is_mps else torch.float64
-            else:
-                dtype = torch.int32 if is_mps else torch.int64
-            timesteps = torch.tensor([timesteps], dtype=dtype, device=device)
-        elif len(timesteps.shape) == 0:
-            timesteps = timesteps[None].to(device)
-        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-        timesteps = timesteps.expand(batch_size)
-        return timesteps
-
-    def unpatchify(self, x: torch.Tensor, img_sizes: List[Tuple[int, int]]) -> List[torch.Tensor]:
-        x_arr = []
-        for i, img_size in enumerate(img_sizes):
-            pH, pW = img_size
-            x_arr.append(
-                einops.rearrange(x[i, :pH*pW].reshape(1, pH, pW, -1), 'B H W (p1 p2 C) -> B C (H p1) (W p2)',
-                    p1=self.patch_size, p2=self.patch_size)
-            )
-        x = torch.cat(x_arr, dim=0)
-        return x
-
-    def patchify(self, x, max_seq, img_sizes=None):
-        pz2 = self.patch_size * self.patch_size
-        if isinstance(x, torch.Tensor):
-            B = x.shape[0]
-            device = x.device
-            dtype = x.dtype
-        else:
-            B = len(x)
-            device = x[0].device
-            dtype = x[0].dtype
-        x_masks = torch.zeros((B, max_seq), dtype=dtype, device=device)
-
-        if img_sizes is not None:
-            for i, img_size in enumerate(img_sizes):
-                x_masks[i, 0:img_size[0] * img_size[1]] = 1
-            x = einops.rearrange(x, 'B C S p -> B S (p C)', p=pz2)
-        elif isinstance(x, torch.Tensor):
-            pH, pW = x.shape[-2] // self.patch_size, x.shape[-1] // self.patch_size
-            x = einops.rearrange(x, 'B C (H p1) (W p2) -> B (H W) (p1 p2 C)', p1=self.patch_size, p2=self.patch_size)
-            img_sizes = [[pH, pW]] * B
-            x_masks = None
-        else:
-            raise NotImplementedError
-        return x, x_masks, img_sizes
-
-    def forward(self,
-        x: torch.Tensor,
-        t: torch.Tensor,
-        y: Optional[torch.Tensor] = None,
-        context: Optional[torch.Tensor] = None,
-        encoder_hidden_states_llama3=None,
-        image_cond=None,
-        control = None,
-        transformer_options = {},
-    ):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, t, y, context, encoder_hidden_states_llama3, image_cond, control, transformer_options)
-
-    def _forward(
-        self,
-        x: torch.Tensor,
-        t: torch.Tensor,
-        y: Optional[torch.Tensor] = None,
-        context: Optional[torch.Tensor] = None,
-        encoder_hidden_states_llama3=None,
-        image_cond=None,
-        control = None,
-        transformer_options = {},
-    ) -> torch.Tensor:
-        bs, c, h, w = x.shape
-        if image_cond is not None:
-            x = torch.cat([x, image_cond], dim=-1)
-        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
-        timesteps = t
-        pooled_embeds = y
-        T5_encoder_hidden_states = context
-
-        img_sizes = None
-
-        # spatial forward
-        batch_size = hidden_states.shape[0]
-        hidden_states_type = hidden_states.dtype
-
-        # 0. time
-        timesteps = self.expand_timesteps(timesteps, batch_size, hidden_states.device)
-        timesteps = self.t_embedder(timesteps, hidden_states_type)
-        p_embedder = self.p_embedder(pooled_embeds)
-        adaln_input = timesteps + p_embedder
-
-        hidden_states, image_tokens_masks, img_sizes = self.patchify(hidden_states, self.max_seq, img_sizes)
-        if image_tokens_masks is None:
-            pH, pW = img_sizes[0]
-            img_ids = torch.zeros(pH, pW, 3, device=hidden_states.device)
-            img_ids[..., 1] = img_ids[..., 1] + torch.arange(pH, device=hidden_states.device)[:, None]
-            img_ids[..., 2] = img_ids[..., 2] + torch.arange(pW, device=hidden_states.device)[None, :]
-            img_ids = repeat(img_ids, "h w c -> b (h w) c", b=batch_size)
-        hidden_states = self.x_embedder(hidden_states)
-
-        # T5_encoder_hidden_states = encoder_hidden_states[0]
-        encoder_hidden_states = encoder_hidden_states_llama3.movedim(1, 0)
-        encoder_hidden_states = [encoder_hidden_states[k] for k in self.llama_layers]
-
-        if self.caption_projection is not None:
-            new_encoder_hidden_states = []
-            for i, enc_hidden_state in enumerate(encoder_hidden_states):
-                enc_hidden_state = self.caption_projection[i](enc_hidden_state)
-                enc_hidden_state = enc_hidden_state.view(batch_size, -1, hidden_states.shape[-1])
-                new_encoder_hidden_states.append(enc_hidden_state)
-            encoder_hidden_states = new_encoder_hidden_states
-            T5_encoder_hidden_states = self.caption_projection[-1](T5_encoder_hidden_states)
-            T5_encoder_hidden_states = T5_encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
-            encoder_hidden_states.append(T5_encoder_hidden_states)
-
-        txt_ids = torch.zeros(
-            batch_size,
-            encoder_hidden_states[-1].shape[1] + encoder_hidden_states[-2].shape[1] + encoder_hidden_states[0].shape[1],
-            3,
-            device=img_ids.device, dtype=img_ids.dtype
-        )
-        ids = torch.cat((img_ids, txt_ids), dim=1)
-        rope = self.pe_embedder(ids)
-
-        # 2. Blocks
-        block_id = 0
-        initial_encoder_hidden_states = torch.cat([encoder_hidden_states[-1], encoder_hidden_states[-2]], dim=1)
-        initial_encoder_hidden_states_seq_len = initial_encoder_hidden_states.shape[1]
-        for bid, block in enumerate(self.double_stream_blocks):
-            cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
-            cur_encoder_hidden_states = torch.cat([initial_encoder_hidden_states, cur_llama31_encoder_hidden_states], dim=1)
-            hidden_states, initial_encoder_hidden_states = block(
-                image_tokens = hidden_states,
-                image_tokens_masks = image_tokens_masks,
-                text_tokens = cur_encoder_hidden_states,
-                adaln_input = adaln_input,
-                rope = rope,
-            )
-            initial_encoder_hidden_states = initial_encoder_hidden_states[:, :initial_encoder_hidden_states_seq_len]
-            block_id += 1
-
-        image_tokens_seq_len = hidden_states.shape[1]
-        hidden_states = torch.cat([hidden_states, initial_encoder_hidden_states], dim=1)
-        hidden_states_seq_len = hidden_states.shape[1]
-        if image_tokens_masks is not None:
-            encoder_attention_mask_ones = torch.ones(
-                (batch_size, initial_encoder_hidden_states.shape[1] + cur_llama31_encoder_hidden_states.shape[1]),
-                device=image_tokens_masks.device, dtype=image_tokens_masks.dtype
-            )
-            image_tokens_masks = torch.cat([image_tokens_masks, encoder_attention_mask_ones], dim=1)
-
-        for bid, block in enumerate(self.single_stream_blocks):
-            cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
-            hidden_states = torch.cat([hidden_states, cur_llama31_encoder_hidden_states], dim=1)
-            hidden_states = block(
-                image_tokens=hidden_states,
-                image_tokens_masks=image_tokens_masks,
-                text_tokens=None,
-                adaln_input=adaln_input,
-                rope=rope,
-            )
-            hidden_states = hidden_states[:, :hidden_states_seq_len]
-            block_id += 1
-
-        hidden_states = hidden_states[:, :image_tokens_seq_len, ...]
-        output = self.final_layer(hidden_states, adaln_input)
-        output = self.unpatchify(output, img_sizes)
-        return -output[:, :, :h, :w]
--- a/comfy/ldm/hunyuan3d/model.py
+++ b/comfy/ldm/hunyuan3d/model.py
@@ -1,143 +0,0 @@
-import torch
-from torch import nn
-from comfy.ldm.flux.layers import (
-    DoubleStreamBlock,
-    LastLayer,
-    MLPEmbedder,
-    SingleStreamBlock,
-    timestep_embedding,
-)
-import comfy.patcher_extension
-
-
-class Hunyuan3Dv2(nn.Module):
-    def __init__(
-        self,
-        in_channels=64,
-        context_in_dim=1536,
-        hidden_size=1024,
-        mlp_ratio=4.0,
-        num_heads=16,
-        depth=16,
-        depth_single_blocks=32,
-        qkv_bias=True,
-        guidance_embed=False,
-        image_model=None,
-        dtype=None,
-        device=None,
-        operations=None
-    ):
-        super().__init__()
-        self.dtype = dtype
-
-        if hidden_size % num_heads != 0:
-            raise ValueError(
-                f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
-            )
-
-        self.max_period = 1000  # While reimplementing the model I noticed that they messed up. This 1000 value was meant to be the time_factor but they set the max_period instead
-        self.latent_in = operations.Linear(in_channels, hidden_size, bias=True, dtype=dtype, device=device)
-        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations)
-        self.guidance_in = (
-            MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations) if guidance_embed else None
-        )
-        self.cond_in = operations.Linear(context_in_dim, hidden_size, dtype=dtype, device=device)
-        self.double_blocks = nn.ModuleList(
-            [
-                DoubleStreamBlock(
-                    hidden_size,
-                    num_heads,
-                    mlp_ratio=mlp_ratio,
-                    qkv_bias=qkv_bias,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for _ in range(depth)
-            ]
-        )
-        self.single_blocks = nn.ModuleList(
-            [
-                SingleStreamBlock(
-                    hidden_size,
-                    num_heads,
-                    mlp_ratio=mlp_ratio,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for _ in range(depth_single_blocks)
-            ]
-        )
-        self.final_layer = LastLayer(hidden_size, 1, in_channels, dtype=dtype, device=device, operations=operations)
-
-    def forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, guidance, transformer_options, **kwargs)
-
-    def _forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs):
-        x = x.movedim(-1, -2)
-        timestep = 1.0 - timestep
-        txt = context
-        img = self.latent_in(x)
-
-        vec = self.time_in(timestep_embedding(timestep, 256, self.max_period).to(dtype=img.dtype))
-        if self.guidance_in is not None:
-            if guidance is not None:
-                vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.max_period).to(img.dtype))
-
-        txt = self.cond_in(txt)
-        pe = None
-        attn_mask = None
-
-        patches_replace = transformer_options.get("patches_replace", {})
-        blocks_replace = patches_replace.get("dit", {})
-        for i, block in enumerate(self.double_blocks):
-            if ("double_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"], out["txt"] = block(img=args["img"],
-                                                   txt=args["txt"],
-                                                   vec=args["vec"],
-                                                   pe=args["pe"],
-                                                   attn_mask=args.get("attn_mask"))
-                    return out
-
-                out = blocks_replace[("double_block", i)]({"img": img,
-                                                           "txt": txt,
-                                                           "vec": vec,
-                                                           "pe": pe,
-                                                           "attn_mask": attn_mask},
-                                                          {"original_block": block_wrap})
-                txt = out["txt"]
-                img = out["img"]
-            else:
-                img, txt = block(img=img,
-                                 txt=txt,
-                                 vec=vec,
-                                 pe=pe,
-                                 attn_mask=attn_mask)
-
-        img = torch.cat((txt, img), 1)
-
-        for i, block in enumerate(self.single_blocks):
-            if ("single_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"] = block(args["img"],
-                                       vec=args["vec"],
-                                       pe=args["pe"],
-                                       attn_mask=args.get("attn_mask"))
-                    return out
-
-                out = blocks_replace[("single_block", i)]({"img": img,
-                                                           "vec": vec,
-                                                           "pe": pe,
-                                                           "attn_mask": attn_mask},
-                                                          {"original_block": block_wrap})
-                img = out["img"]
-            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
-
-        img = img[:, txt.shape[1]:, ...]
-        img = self.final_layer(img, vec)
-        return img.movedim(-2, -1) * (-1.0)
--- a/comfy/ldm/hunyuan3d/vae.py
+++ b/comfy/ldm/hunyuan3d/vae.py
@@ -1,587 +0,0 @@
-# Original: https://github.com/Tencent/Hunyuan3D-2/blob/main/hy3dgen/shapegen/models/autoencoders/model.py
-# Since the header on their VAE source file was a bit confusing we asked for permission to use this code from tencent under the GPL license used in ComfyUI.
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-from typing import Union, Tuple, List, Callable, Optional
-
-import numpy as np
-from einops import repeat, rearrange
-from tqdm import tqdm
-import logging
-
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-def generate_dense_grid_points(
-    bbox_min: np.ndarray,
-    bbox_max: np.ndarray,
-    octree_resolution: int,
-    indexing: str = "ij",
-):
-    length = bbox_max - bbox_min
-    num_cells = octree_resolution
-
-    x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
-    y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
-    z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
-    [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
-    xyz = np.stack((xs, ys, zs), axis=-1)
-    grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
-
-    return xyz, grid_size, length
-
-
-class VanillaVolumeDecoder:
-    @torch.no_grad()
-    def __call__(
-        self,
-        latents: torch.FloatTensor,
-        geo_decoder: Callable,
-        bounds: Union[Tuple[float], List[float], float] = 1.01,
-        num_chunks: int = 10000,
-        octree_resolution: int = None,
-        enable_pbar: bool = True,
-        **kwargs,
-    ):
-        device = latents.device
-        dtype = latents.dtype
-        batch_size = latents.shape[0]
-
-        # 1. generate query points
-        if isinstance(bounds, float):
-            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
-
-        bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6])
-        xyz_samples, grid_size, length = generate_dense_grid_points(
-            bbox_min=bbox_min,
-            bbox_max=bbox_max,
-            octree_resolution=octree_resolution,
-            indexing="ij"
-        )
-        xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3)
-
-        # 2. latents to 3d volume
-        batch_logits = []
-        for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), desc="Volume Decoding",
-                          disable=not enable_pbar):
-            chunk_queries = xyz_samples[start: start + num_chunks, :]
-            chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size)
-            logits = geo_decoder(queries=chunk_queries, latents=latents)
-            batch_logits.append(logits)
-
-        grid_logits = torch.cat(batch_logits, dim=1)
-        grid_logits = grid_logits.view((batch_size, *grid_size)).float()
-
-        return grid_logits
-
-
-class FourierEmbedder(nn.Module):
-    """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
-    each feature dimension of `x[..., i]` into:
-        [
-            sin(x[..., i]),
-            sin(f_1*x[..., i]),
-            sin(f_2*x[..., i]),
-            ...
-            sin(f_N * x[..., i]),
-            cos(x[..., i]),
-            cos(f_1*x[..., i]),
-            cos(f_2*x[..., i]),
-            ...
-            cos(f_N * x[..., i]),
-            x[..., i]     # only present if include_input is True.
-        ], here f_i is the frequency.
-
-    Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs].
-    If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...];
-    Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)].
-
-    Args:
-        num_freqs (int): the number of frequencies, default is 6;
-        logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
-            otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)];
-        input_dim (int): the input dimension, default is 3;
-        include_input (bool): include the input tensor or not, default is True.
-
-    Attributes:
-        frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
-                otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1);
-
-        out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1),
-            otherwise, it is input_dim * num_freqs * 2.
-
-    """
-
-    def __init__(self,
-                 num_freqs: int = 6,
-                 logspace: bool = True,
-                 input_dim: int = 3,
-                 include_input: bool = True,
-                 include_pi: bool = True) -> None:
-
-        """The initialization"""
-
-        super().__init__()
-
-        if logspace:
-            frequencies = 2.0 ** torch.arange(
-                num_freqs,
-                dtype=torch.float32
-            )
-        else:
-            frequencies = torch.linspace(
-                1.0,
-                2.0 ** (num_freqs - 1),
-                num_freqs,
-                dtype=torch.float32
-            )
-
-        if include_pi:
-            frequencies *= torch.pi
-
-        self.register_buffer("frequencies", frequencies, persistent=False)
-        self.include_input = include_input
-        self.num_freqs = num_freqs
-
-        self.out_dim = self.get_dims(input_dim)
-
-    def get_dims(self, input_dim):
-        temp = 1 if self.include_input or self.num_freqs == 0 else 0
-        out_dim = input_dim * (self.num_freqs * 2 + temp)
-
-        return out_dim
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """ Forward process.
-
-        Args:
-            x: tensor of shape [..., dim]
-
-        Returns:
-            embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
-                where temp is 1 if include_input is True and 0 otherwise.
-        """
-
-        if self.num_freqs > 0:
-            embed = (x[..., None].contiguous() * self.frequencies.to(device=x.device, dtype=x.dtype)).view(*x.shape[:-1], -1)
-            if self.include_input:
-                return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
-            else:
-                return torch.cat((embed.sin(), embed.cos()), dim=-1)
-        else:
-            return x
-
-
-class CrossAttentionProcessor:
-    def __call__(self, attn, q, k, v):
-        out = comfy.ops.scaled_dot_product_attention(q, k, v)
-        return out
-
-
-class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
-    """
-
-    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-        self.scale_by_keep = scale_by_keep
-
-    def forward(self, x):
-        """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-
-        This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
-        the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
-        See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
-        changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
-        'survival rate' as the argument.
-
-        """
-        if self.drop_prob == 0. or not self.training:
-            return x
-        keep_prob = 1 - self.drop_prob
-        shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
-        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
-        if keep_prob > 0.0 and self.scale_by_keep:
-            random_tensor.div_(keep_prob)
-        return x * random_tensor
-
-    def extra_repr(self):
-        return f'drop_prob={round(self.drop_prob, 3):0.3f}'
-
-
-class MLP(nn.Module):
-    def __init__(
-        self, *,
-        width: int,
-        expand_ratio: int = 4,
-        output_width: int = None,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.width = width
-        self.c_fc = ops.Linear(width, width * expand_ratio)
-        self.c_proj = ops.Linear(width * expand_ratio, output_width if output_width is not None else width)
-        self.gelu = nn.GELU()
-        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
-
-    def forward(self, x):
-        return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))
-
-
-class QKVMultiheadCrossAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        heads: int,
-        width=None,
-        qk_norm=False,
-        norm_layer=ops.LayerNorm
-    ):
-        super().__init__()
-        self.heads = heads
-        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-
-        self.attn_processor = CrossAttentionProcessor()
-
-    def forward(self, q, kv):
-        _, n_ctx, _ = q.shape
-        bs, n_data, width = kv.shape
-        attn_ch = width // self.heads // 2
-        q = q.view(bs, n_ctx, self.heads, -1)
-        kv = kv.view(bs, n_data, self.heads, -1)
-        k, v = torch.split(kv, attn_ch, dim=-1)
-
-        q = self.q_norm(q)
-        k = self.k_norm(k)
-        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
-        out = self.attn_processor(self, q, k, v)
-        out = out.transpose(1, 2).reshape(bs, n_ctx, -1)
-        return out
-
-
-class MultiheadCrossAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        data_width: Optional[int] = None,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        kv_cache: bool = False,
-    ):
-        super().__init__()
-        self.width = width
-        self.heads = heads
-        self.data_width = width if data_width is None else data_width
-        self.c_q = ops.Linear(width, width, bias=qkv_bias)
-        self.c_kv = ops.Linear(self.data_width, width * 2, bias=qkv_bias)
-        self.c_proj = ops.Linear(width, width)
-        self.attention = QKVMultiheadCrossAttention(
-            heads=heads,
-            width=width,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.kv_cache = kv_cache
-        self.data = None
-
-    def forward(self, x, data):
-        x = self.c_q(x)
-        if self.kv_cache:
-            if self.data is None:
-                self.data = self.c_kv(data)
-                logging.info('Save kv cache,this should be called only once for one mesh')
-            data = self.data
-        else:
-            data = self.c_kv(data)
-        x = self.attention(x, data)
-        x = self.c_proj(x)
-        return x
-
-
-class ResidualCrossAttentionBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        mlp_expand_ratio: int = 4,
-        data_width: Optional[int] = None,
-        qkv_bias: bool = True,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False
-    ):
-        super().__init__()
-
-        if data_width is None:
-            data_width = width
-
-        self.attn = MultiheadCrossAttention(
-            width=width,
-            heads=heads,
-            data_width=data_width,
-            qkv_bias=qkv_bias,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6)
-        self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.mlp = MLP(width=width, expand_ratio=mlp_expand_ratio)
-
-    def forward(self, x: torch.Tensor, data: torch.Tensor):
-        x = x + self.attn(self.ln_1(x), self.ln_2(data))
-        x = x + self.mlp(self.ln_3(x))
-        return x
-
-
-class QKVMultiheadAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        heads: int,
-        width=None,
-        qk_norm=False,
-        norm_layer=ops.LayerNorm
-    ):
-        super().__init__()
-        self.heads = heads
-        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-
-    def forward(self, qkv):
-        bs, n_ctx, width = qkv.shape
-        attn_ch = width // self.heads // 3
-        qkv = qkv.view(bs, n_ctx, self.heads, -1)
-        q, k, v = torch.split(qkv, attn_ch, dim=-1)
-
-        q = self.q_norm(q)
-        k = self.k_norm(k)
-
-        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
-        out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
-        return out
-
-
-class MultiheadAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        qkv_bias: bool,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.width = width
-        self.heads = heads
-        self.c_qkv = ops.Linear(width, width * 3, bias=qkv_bias)
-        self.c_proj = ops.Linear(width, width)
-        self.attention = QKVMultiheadAttention(
-            heads=heads,
-            width=width,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
-
-    def forward(self, x):
-        x = self.c_qkv(x)
-        x = self.attention(x)
-        x = self.drop_path(self.c_proj(x))
-        return x
-
-
-class ResidualAttentionBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0,
-    ):
-        super().__init__()
-        self.attn = MultiheadAttention(
-            width=width,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm,
-            drop_path_rate=drop_path_rate
-        )
-        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.mlp = MLP(width=width, drop_path_rate=drop_path_rate)
-        self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-
-    def forward(self, x: torch.Tensor):
-        x = x + self.attn(self.ln_1(x))
-        x = x + self.mlp(self.ln_2(x))
-        return x
-
-
-class Transformer(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        layers: int,
-        heads: int,
-        qkv_bias: bool = True,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.width = width
-        self.layers = layers
-        self.resblocks = nn.ModuleList(
-            [
-                ResidualAttentionBlock(
-                    width=width,
-                    heads=heads,
-                    qkv_bias=qkv_bias,
-                    norm_layer=norm_layer,
-                    qk_norm=qk_norm,
-                    drop_path_rate=drop_path_rate
-                )
-                for _ in range(layers)
-            ]
-        )
-
-    def forward(self, x: torch.Tensor):
-        for block in self.resblocks:
-            x = block(x)
-        return x
-
-
-class CrossAttentionDecoder(nn.Module):
-
-    def __init__(
-        self,
-        *,
-        out_channels: int,
-        fourier_embedder: FourierEmbedder,
-        width: int,
-        heads: int,
-        mlp_expand_ratio: int = 4,
-        downsample_ratio: int = 1,
-        enable_ln_post: bool = True,
-        qkv_bias: bool = True,
-        qk_norm: bool = False,
-        label_type: str = "binary"
-    ):
-        super().__init__()
-
-        self.enable_ln_post = enable_ln_post
-        self.fourier_embedder = fourier_embedder
-        self.downsample_ratio = downsample_ratio
-        self.query_proj = ops.Linear(self.fourier_embedder.out_dim, width)
-        if self.downsample_ratio != 1:
-            self.latents_proj = ops.Linear(width * downsample_ratio, width)
-        if self.enable_ln_post == False:
-            qk_norm = False
-        self.cross_attn_decoder = ResidualCrossAttentionBlock(
-            width=width,
-            mlp_expand_ratio=mlp_expand_ratio,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm
-        )
-
-        if self.enable_ln_post:
-            self.ln_post = ops.LayerNorm(width)
-        self.output_proj = ops.Linear(width, out_channels)
-        self.label_type = label_type
-        self.count = 0
-
-    def forward(self, queries=None, query_embeddings=None, latents=None):
-        if query_embeddings is None:
-            query_embeddings = self.query_proj(self.fourier_embedder(queries).to(latents.dtype))
-        self.count += query_embeddings.shape[1]
-        if self.downsample_ratio != 1:
-            latents = self.latents_proj(latents)
-        x = self.cross_attn_decoder(query_embeddings, latents)
-        if self.enable_ln_post:
-            x = self.ln_post(x)
-        occ = self.output_proj(x)
-        return occ
-
-
-class ShapeVAE(nn.Module):
-    def __init__(
-        self,
-        *,
-        embed_dim: int,
-        width: int,
-        heads: int,
-        num_decoder_layers: int,
-        geo_decoder_downsample_ratio: int = 1,
-        geo_decoder_mlp_expand_ratio: int = 4,
-        geo_decoder_ln_post: bool = True,
-        num_freqs: int = 8,
-        include_pi: bool = True,
-        qkv_bias: bool = True,
-        qk_norm: bool = False,
-        label_type: str = "binary",
-        drop_path_rate: float = 0.0,
-        scale_factor: float = 1.0,
-    ):
-        super().__init__()
-        self.geo_decoder_ln_post = geo_decoder_ln_post
-
-        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
-
-        self.post_kl = ops.Linear(embed_dim, width)
-
-        self.transformer = Transformer(
-            width=width,
-            layers=num_decoder_layers,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm,
-            drop_path_rate=drop_path_rate
-        )
-
-        self.geo_decoder = CrossAttentionDecoder(
-            fourier_embedder=self.fourier_embedder,
-            out_channels=1,
-            mlp_expand_ratio=geo_decoder_mlp_expand_ratio,
-            downsample_ratio=geo_decoder_downsample_ratio,
-            enable_ln_post=self.geo_decoder_ln_post,
-            width=width // geo_decoder_downsample_ratio,
-            heads=heads // geo_decoder_downsample_ratio,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm,
-            label_type=label_type,
-        )
-
-        self.volume_decoder = VanillaVolumeDecoder()
-        self.scale_factor = scale_factor
-
-    def decode(self, latents, **kwargs):
-        latents = self.post_kl(latents.movedim(-2, -1))
-        latents = self.transformer(latents)
-
-        bounds = kwargs.get("bounds", 1.01)
-        num_chunks = kwargs.get("num_chunks", 8000)
-        octree_resolution = kwargs.get("octree_resolution", 256)
-        enable_pbar = kwargs.get("enable_pbar", True)
-
-        grid_logits = self.volume_decoder(latents, self.geo_decoder, bounds=bounds, num_chunks=num_chunks, octree_resolution=octree_resolution, enable_pbar=enable_pbar)
-        return grid_logits.movedim(-2, -1)
-
-    def encode(self, x):
-        return None
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@@ -1,7 +1,6 @@
 #Based on Flux code because of weird hunyuan video code license.

 import torch
-import comfy.patcher_extension
 import comfy.ldm.flux.layers
 import comfy.ldm.modules.diffusionmodules.mmdit
 from comfy.ldm.modules.attention import optimized_attention
@@ -228,8 +227,6 @@ class HunyuanVideo(nn.Module):
        timesteps: Tensor,
        y: Tensor,
        guidance: Tensor = None,
-        guiding_frame_index=None,
-        ref_latent=None,
        control=None,
        transformer_options={},
    ) -> Tensor:
@@ -240,29 +237,12 @@ class HunyuanVideo(nn.Module):
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype))

-        if ref_latent is not None:
-            ref_latent_ids = self.img_ids(ref_latent)
-            ref_latent = self.img_in(ref_latent)
-            img = torch.cat([ref_latent, img], dim=-2)
-            ref_latent_ids[..., 0] = -1
-            ref_latent_ids[..., 2] += (initial_shape[-1] // self.patch_size[-1])
-            img_ids = torch.cat([ref_latent_ids, img_ids], dim=-2)
-
-        if guiding_frame_index is not None:
-            token_replace_vec = self.time_in(timestep_embedding(guiding_frame_index, 256, time_factor=1.0))
-            vec_ = self.vector_in(y[:, :self.params.vec_in_dim])
-            vec = torch.cat([(vec_ + token_replace_vec).unsqueeze(1), (vec_ + vec).unsqueeze(1)], dim=1)
-            frame_tokens = (initial_shape[-1] // self.patch_size[-1]) * (initial_shape[-2] // self.patch_size[-2])
-            modulation_dims = [(0, frame_tokens, 0), (frame_tokens, None, 1)]
-            modulation_dims_txt = [(0, None, 1)]
-        else:
-            vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
-            modulation_dims = None
-            modulation_dims_txt = None
+        vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])

        if self.params.guidance_embed:
-            if guidance is not None:
-                vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
+            if guidance is None:
+                raise ValueError("Didn't get guidance strength for guidance distilled model.")
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))

        if txt_mask is not None and not torch.is_floating_point(txt_mask):
            txt_mask = (txt_mask - 1).to(img.dtype) * torch.finfo(img.dtype).max
@@ -285,14 +265,14 @@ class HunyuanVideo(nn.Module):
            if ("double_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
-                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"])
+                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"])
                    return out

-                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt}, {"original_block": block_wrap})
+                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask}, {"original_block": block_wrap})
                txt = out["txt"]
                img = out["img"]
            else:
-                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt)
+                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask)

            if control is not None: # Controlnet
                control_i = control.get("input")
@@ -307,13 +287,13 @@ class HunyuanVideo(nn.Module):
            if ("single_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
-                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"])
+                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"])
                    return out

-                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims}, {"original_block": block_wrap})
+                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask}, {"original_block": block_wrap})
                img = out["img"]
            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims)
+                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)

            if control is not None: # Controlnet
                control_o = control.get("output")
@@ -323,20 +303,18 @@ class HunyuanVideo(nn.Module):
                        img[:, : img_len] += add

        img = img[:, : img_len]
-        if ref_latent is not None:
-            img = img[:, ref_latent.shape[1]:]

-        img = self.final_layer(img, vec, modulation_dims=modulation_dims)  # (N, T, patch_size ** 2 * out_channels)
+        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)

        shape = initial_shape[-3:]
        for i in range(len(shape)):
            shape[i] = shape[i] // self.patch_size[i]
        img = img.reshape([img.shape[0]] + shape + [self.out_channels] + self.patch_size)
        img = img.permute(0, 4, 1, 5, 2, 6, 3, 7)
-        img = img.reshape(initial_shape[0], self.out_channels, initial_shape[2], initial_shape[3], initial_shape[4])
+        img = img.reshape(initial_shape)
        return img

-    def img_ids(self, x):
+    def forward(self, x, timestep, context, y, guidance, attention_mask=None, control=None, transformer_options={}, **kwargs):
        bs, c, t, h, w = x.shape
        patch_size = self.patch_size
        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
@@ -346,18 +324,7 @@ class HunyuanVideo(nn.Module):
        img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1)
        img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1)
        img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1)
-        return repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
-
-    def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, y, guidance, attention_mask, guiding_frame_index, ref_latent, control, transformer_options, **kwargs)
-
-    def _forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs):
-        bs, c, t, h, w = x.shape
-        img_ids = self.img_ids(x)
+        img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, guiding_frame_index, ref_latent, control=control, transformer_options=transformer_options)
+        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, control, transformer_options)
        return out
--- a/comfy/ldm/hydit/models.py
+++ b/comfy/ldm/hydit/models.py
@@ -3,7 +3,7 @@ import torch
 import torch.nn as nn

 import comfy.ops
-from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, TimestepEmbedder, PatchEmbed
+from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, TimestepEmbedder, PatchEmbed, RMSNorm
 from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
 from torch.utils import checkpoint

@@ -51,7 +51,7 @@ class HunYuanDiTBlock(nn.Module):
        if norm_type == "layer":
            norm_layer = operations.LayerNorm
        elif norm_type == "rms":
-            norm_layer = operations.RMSNorm
+            norm_layer = RMSNorm
        else:
            raise ValueError(f"Unknown norm_type: {norm_type}")

--- a/comfy/ldm/lightricks/model.py
+++ b/comfy/ldm/lightricks/model.py
@@ -1,13 +1,13 @@
 import torch
 from torch import nn
-import comfy.patcher_extension
 import comfy.ldm.modules.attention
+from comfy.ldm.genmo.joint_model.layers import RMSNorm
 import comfy.ldm.common_dit
 from einops import rearrange
 import math
 from typing import Dict, Optional, Tuple

-from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords
+from .symmetric_patchifier import SymmetricPatchifier


 def get_timestep_embedding(
@@ -262,8 +262,8 @@ class CrossAttention(nn.Module):
        self.heads = heads
        self.dim_head = dim_head

-        self.q_norm = operations.RMSNorm(inner_dim, eps=1e-5, dtype=dtype, device=device)
-        self.k_norm = operations.RMSNorm(inner_dim, eps=1e-5, dtype=dtype, device=device)
+        self.q_norm = RMSNorm(inner_dim, dtype=dtype, device=device)
+        self.k_norm = RMSNorm(inner_dim, dtype=dtype, device=device)

        self.to_q = operations.Linear(query_dim, inner_dim, bias=True, dtype=dtype, device=device)
        self.to_k = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device)
@@ -377,16 +377,12 @@ class LTXVModel(torch.nn.Module):

                 positional_embedding_theta=10000.0,
                 positional_embedding_max_pos=[20, 2048, 2048],
-                 causal_temporal_positioning=False,
-                 vae_scale_factors=(8, 32, 32),
                 dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        self.generator = None
-        self.vae_scale_factors = vae_scale_factors
        self.dtype = dtype
        self.out_channels = in_channels
        self.inner_dim = num_attention_heads * attention_head_dim
-        self.causal_temporal_positioning = causal_temporal_positioning

        self.patchify_proj = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)

@@ -420,30 +416,42 @@ class LTXVModel(torch.nn.Module):

        self.patchifier = SymmetricPatchifier(1)

-    def forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, attention_mask, frame_rate, transformer_options, keyframe_idxs, **kwargs)
-
-    def _forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs):
+    def forward(self, x, timestep, context, attention_mask, frame_rate=25, guiding_latent=None, guiding_latent_noise_scale=0, transformer_options={}, **kwargs):
        patches_replace = transformer_options.get("patches_replace", {})

+        indices_grid = self.patchifier.get_grid(
+            orig_num_frames=x.shape[2],
+            orig_height=x.shape[3],
+            orig_width=x.shape[4],
+            batch_size=x.shape[0],
+            scale_grid=((1 / frame_rate) * 8, 32, 32),
+            device=x.device,
+        )
+
+        if guiding_latent is not None:
+            ts = torch.ones([x.shape[0], 1, x.shape[2], x.shape[3], x.shape[4]], device=x.device, dtype=x.dtype)
+            input_ts = timestep.view([timestep.shape[0]] + [1] * (x.ndim - 1))
+            ts *= input_ts
+            ts[:, :, 0] = guiding_latent_noise_scale * (input_ts[:, :, 0] ** 2)
+            timestep = self.patchifier.patchify(ts)
+            input_x = x.clone()
+            x[:, :, 0] = guiding_latent[:, :, 0]
+            if guiding_latent_noise_scale > 0:
+                if self.generator is None:
+                    self.generator = torch.Generator(device=x.device).manual_seed(42)
+                elif self.generator.device != x.device:
+                    self.generator = torch.Generator(device=x.device).set_state(self.generator.get_state())
+
+                noise_shape = [guiding_latent.shape[0], guiding_latent.shape[1], 1, guiding_latent.shape[3], guiding_latent.shape[4]]
+                scale = guiding_latent_noise_scale * (input_ts ** 2)
+                guiding_noise = scale * torch.randn(size=noise_shape, device=x.device, generator=self.generator)
+
+                x[:, :, 0] = guiding_noise[:, :, 0] + x[:, :, 0] *  (1.0 - scale[:, :, 0])
+
+
        orig_shape = list(x.shape)

-        x, latent_coords = self.patchifier.patchify(x)
-        pixel_coords = latent_to_pixel_coords(
-            latent_coords=latent_coords,
-            scale_factors=self.vae_scale_factors,
-            causal_fix=self.causal_temporal_positioning,
-        )
-
-        if keyframe_idxs is not None:
-            pixel_coords[:, :, -keyframe_idxs.shape[2]:] = keyframe_idxs
-
-        fractional_coords = pixel_coords.to(torch.float32)
-        fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
+        x = self.patchifier.patchify(x)

        x = self.patchify_proj(x)
        timestep = timestep * 1000.0
@@ -451,7 +459,7 @@ class LTXVModel(torch.nn.Module):
        if attention_mask is not None and not torch.is_floating_point(attention_mask):
            attention_mask = (attention_mask - 1).to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])) * torch.finfo(x.dtype).max

-        pe = precompute_freqs_cis(fractional_coords, dim=self.inner_dim, out_dtype=x.dtype)
+        pe = precompute_freqs_cis(indices_grid, dim=self.inner_dim, out_dtype=x.dtype)

        batch_size = x.shape[0]
        timestep, embedded_timestep = self.adaln_single(
@@ -511,4 +519,8 @@ class LTXVModel(torch.nn.Module):
            out_channels=orig_shape[1] // math.prod(self.patchifier.patch_size),
        )

+        if guiding_latent is not None:
+            x[:, :, 0] = (input_x[:, :, 0] - guiding_latent[:, :, 0]) / input_ts[:, :, 0]
+
+        # print("res", x)
        return x
--- a/comfy/ldm/lightricks/symmetric_patchifier.py
+++ b/comfy/ldm/lightricks/symmetric_patchifier.py
@@ -6,29 +6,16 @@ from einops import rearrange
 from torch import Tensor


-def latent_to_pixel_coords(
-    latent_coords: Tensor, scale_factors: Tuple[int, int, int], causal_fix: bool = False
-) -> Tensor:
-    """
-    Converts latent coordinates to pixel coordinates by scaling them according to the VAE's
-    configuration.
-    Args:
-        latent_coords (Tensor): A tensor of shape [batch_size, 3, num_latents]
-        containing the latent corner coordinates of each token.
-        scale_factors (Tuple[int, int, int]): The scale factors of the VAE's latent space.
-        causal_fix (bool): Whether to take into account the different temporal scale
-            of the first frame. Default = False for backwards compatibility.
-    Returns:
-        Tensor: A tensor of pixel coordinates corresponding to the input latent coordinates.
-    """
-    pixel_coords = (
-        latent_coords
-        * torch.tensor(scale_factors, device=latent_coords.device)[None, :, None]
-    )
-    if causal_fix:
-        # Fix temporal scale for first frame to 1 due to causality
-        pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
-    return pixel_coords
+def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(
+            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
+        )
+    elif dims_to_append == 0:
+        return x
+    return x[(...,) + (None,) * dims_to_append]


 class Patchifier(ABC):
@@ -57,26 +44,29 @@ class Patchifier(ABC):
    def patch_size(self):
        return self._patch_size

-    def get_latent_coords(
-        self, latent_num_frames, latent_height, latent_width, batch_size, device
+    def get_grid(
+        self, orig_num_frames, orig_height, orig_width, batch_size, scale_grid, device
    ):
-        """
-        Return a tensor of shape [batch_size, 3, num_patches] containing the
-            top-left corner latent coordinates of each latent patch.
-        The tensor is repeated for each batch element.
-        """
-        latent_sample_coords = torch.meshgrid(
-            torch.arange(0, latent_num_frames, self._patch_size[0], device=device),
-            torch.arange(0, latent_height, self._patch_size[1], device=device),
-            torch.arange(0, latent_width, self._patch_size[2], device=device),
-            indexing="ij",
-        )
-        latent_sample_coords = torch.stack(latent_sample_coords, dim=0)
-        latent_coords = latent_sample_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
-        latent_coords = rearrange(
-            latent_coords, "b c f h w -> b c (f h w)", b=batch_size
-        )
-        return latent_coords
+        f = orig_num_frames // self._patch_size[0]
+        h = orig_height // self._patch_size[1]
+        w = orig_width // self._patch_size[2]
+        grid_h = torch.arange(h, dtype=torch.float32, device=device)
+        grid_w = torch.arange(w, dtype=torch.float32, device=device)
+        grid_f = torch.arange(f, dtype=torch.float32, device=device)
+        grid = torch.meshgrid(grid_f, grid_h, grid_w, indexing='ij')
+        grid = torch.stack(grid, dim=0)
+        grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
+
+        if scale_grid is not None:
+            for i in range(3):
+                if isinstance(scale_grid[i], Tensor):
+                    scale = append_dims(scale_grid[i], grid.ndim - 1)
+                else:
+                    scale = scale_grid[i]
+                grid[:, i, ...] = grid[:, i, ...] * scale * self._patch_size[i]
+
+        grid = rearrange(grid, "b c f h w -> b c (f h w)", b=batch_size)
+        return grid


 class SymmetricPatchifier(Patchifier):
@@ -84,8 +74,6 @@ class SymmetricPatchifier(Patchifier):
        self,
        latents: Tensor,
    ) -> Tuple[Tensor, Tensor]:
-        b, _, f, h, w = latents.shape
-        latent_coords = self.get_latent_coords(f, h, w, b, latents.device)
        latents = rearrange(
            latents,
            "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
@@ -93,7 +81,7 @@ class SymmetricPatchifier(Patchifier):
            p2=self._patch_size[1],
            p3=self._patch_size[2],
        )
-        return latents, latent_coords
+        return latents

    def unpatchify(
        self,
--- a/comfy/ldm/lightricks/vae/causal_conv3d.py
+++ b/comfy/ldm/lightricks/vae/causal_conv3d.py
@@ -15,7 +15,6 @@ class CausalConv3d(nn.Module):
        stride: Union[int, Tuple[int]] = 1,
        dilation: int = 1,
        groups: int = 1,
-        spatial_padding_mode: str = "zeros",
        **kwargs,
    ):
        super().__init__()
@@ -39,7 +38,7 @@ class CausalConv3d(nn.Module):
            stride=stride,
            dilation=dilation,
            padding=padding,
-            padding_mode=spatial_padding_mode,
+            padding_mode="zeros",
            groups=groups,
        )

--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@@ -1,15 +1,13 @@
-from __future__ import annotations
 import torch
 from torch import nn
 from functools import partial
 import math
 from einops import rearrange
-from typing import List, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 from .conv_nd_factory import make_conv_nd, make_linear_nd
 from .pixel_norm import PixelNorm
 from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
 import comfy.ops
-
 ops = comfy.ops.disable_weight_init

 class Encoder(nn.Module):
@@ -34,7 +32,7 @@ class Encoder(nn.Module):
        norm_layer (`str`, *optional*, defaults to `group_norm`):
            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
        latent_log_var (`str`, *optional*, defaults to `per_channel`):
-            The number of channels for the log variance. Can be either `per_channel`, `uniform`, `constant` or `none`.
+            The number of channels for the log variance. Can be either `per_channel`, `uniform`, or `none`.
    """

    def __init__(
@@ -42,13 +40,12 @@ class Encoder(nn.Module):
        dims: Union[int, Tuple[int, int]] = 3,
        in_channels: int = 3,
        out_channels: int = 3,
-        blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
+        blocks=[("res_x", 1)],
        base_channels: int = 128,
        norm_num_groups: int = 32,
        patch_size: Union[int, Tuple[int]] = 1,
        norm_layer: str = "group_norm",  # group_norm, pixel_norm
        latent_log_var: str = "per_channel",
-        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.patch_size = patch_size
@@ -68,7 +65,6 @@ class Encoder(nn.Module):
            stride=1,
            padding=1,
            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
        )

        self.down_blocks = nn.ModuleList([])
@@ -86,7 +82,6 @@ class Encoder(nn.Module):
                    resnet_eps=1e-6,
                    resnet_groups=norm_num_groups,
                    norm_layer=norm_layer,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "res_x_y":
                output_channel = block_params.get("multiplier", 2) * output_channel
@@ -97,7 +92,6 @@ class Encoder(nn.Module):
                    eps=1e-6,
                    groups=norm_num_groups,
                    norm_layer=norm_layer,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_time":
                block = make_conv_nd(
@@ -107,7 +101,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 1, 1),
                    causal=True,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_space":
                block = make_conv_nd(
@@ -117,7 +110,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(1, 2, 2),
                    causal=True,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_all":
                block = make_conv_nd(
@@ -127,7 +119,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 2, 2),
                    causal=True,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_all_x_y":
                output_channel = block_params.get("multiplier", 2) * output_channel
@@ -138,34 +129,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 2, 2),
                    causal=True,
-                    spatial_padding_mode=spatial_padding_mode,
-                )
-            elif block_name == "compress_all_res":
-                output_channel = block_params.get("multiplier", 2) * output_channel
-                block = SpaceToDepthDownsample(
-                    dims=dims,
-                    in_channels=input_channel,
-                    out_channels=output_channel,
-                    stride=(2, 2, 2),
-                    spatial_padding_mode=spatial_padding_mode,
-                )
-            elif block_name == "compress_space_res":
-                output_channel = block_params.get("multiplier", 2) * output_channel
-                block = SpaceToDepthDownsample(
-                    dims=dims,
-                    in_channels=input_channel,
-                    out_channels=output_channel,
-                    stride=(1, 2, 2),
-                    spatial_padding_mode=spatial_padding_mode,
-                )
-            elif block_name == "compress_time_res":
-                output_channel = block_params.get("multiplier", 2) * output_channel
-                block = SpaceToDepthDownsample(
-                    dims=dims,
-                    in_channels=input_channel,
-                    out_channels=output_channel,
-                    stride=(2, 1, 1),
-                    spatial_padding_mode=spatial_padding_mode,
                )
            else:
                raise ValueError(f"unknown block: {block_name}")
@@ -189,18 +152,10 @@ class Encoder(nn.Module):
            conv_out_channels *= 2
        elif latent_log_var == "uniform":
            conv_out_channels += 1
-        elif latent_log_var == "constant":
-            conv_out_channels += 1
        elif latent_log_var != "none":
            raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
        self.conv_out = make_conv_nd(
-            dims,
-            output_channel,
-            conv_out_channels,
-            3,
-            padding=1,
-            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
+            dims, output_channel, conv_out_channels, 3, padding=1, causal=True
        )

        self.gradient_checkpointing = False
@@ -242,15 +197,6 @@ class Encoder(nn.Module):
                sample = torch.cat([sample, repeated_last_channel], dim=1)
            else:
                raise ValueError(f"Invalid input shape: {sample.shape}")
-        elif self.latent_log_var == "constant":
-            sample = sample[:, :-1, ...]
-            approx_ln_0 = (
-                -30
-            )  # this is the minimal clamp value in DiagonalGaussianDistribution objects
-            sample = torch.cat(
-                [sample, torch.ones_like(sample, device=sample.device) * approx_ln_0],
-                dim=1,
-            )

        return sample

@@ -285,7 +231,7 @@ class Decoder(nn.Module):
        dims,
        in_channels: int = 3,
        out_channels: int = 3,
-        blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
+        blocks=[("res_x", 1)],
        base_channels: int = 128,
        layers_per_block: int = 2,
        norm_num_groups: int = 32,
@@ -293,7 +239,6 @@ class Decoder(nn.Module):
        norm_layer: str = "group_norm",
        causal: bool = True,
        timestep_conditioning: bool = False,
-        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.patch_size = patch_size
@@ -319,7 +264,6 @@ class Decoder(nn.Module):
            stride=1,
            padding=1,
            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
        )

        self.up_blocks = nn.ModuleList([])
@@ -339,7 +283,6 @@ class Decoder(nn.Module):
                    norm_layer=norm_layer,
                    inject_noise=block_params.get("inject_noise", False),
                    timestep_conditioning=timestep_conditioning,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "attn_res_x":
                block = UNetMidBlock3D(
@@ -351,7 +294,6 @@ class Decoder(nn.Module):
                    inject_noise=block_params.get("inject_noise", False),
                    timestep_conditioning=timestep_conditioning,
                    attention_head_dim=block_params["attention_head_dim"],
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "res_x_y":
                output_channel = output_channel // block_params.get("multiplier", 2)
@@ -364,21 +306,14 @@ class Decoder(nn.Module):
                    norm_layer=norm_layer,
                    inject_noise=block_params.get("inject_noise", False),
                    timestep_conditioning=False,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_time":
                block = DepthToSpaceUpsample(
-                    dims=dims,
-                    in_channels=input_channel,
-                    stride=(2, 1, 1),
-                    spatial_padding_mode=spatial_padding_mode,
+                    dims=dims, in_channels=input_channel, stride=(2, 1, 1)
                )
            elif block_name == "compress_space":
                block = DepthToSpaceUpsample(
-                    dims=dims,
-                    in_channels=input_channel,
-                    stride=(1, 2, 2),
-                    spatial_padding_mode=spatial_padding_mode,
+                    dims=dims, in_channels=input_channel, stride=(1, 2, 2)
                )
            elif block_name == "compress_all":
                output_channel = output_channel // block_params.get("multiplier", 1)
@@ -388,7 +323,6 @@ class Decoder(nn.Module):
                    stride=(2, 2, 2),
                    residual=block_params.get("residual", False),
                    out_channels_reduction_factor=block_params.get("multiplier", 1),
-                    spatial_padding_mode=spatial_padding_mode,
                )
            else:
                raise ValueError(f"unknown layer: {block_name}")
@@ -406,13 +340,7 @@ class Decoder(nn.Module):

        self.conv_act = nn.SiLU()
        self.conv_out = make_conv_nd(
-            dims,
-            output_channel,
-            out_channels,
-            3,
-            padding=1,
-            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
+            dims, output_channel, out_channels, 3, padding=1, causal=True
        )

        self.gradient_checkpointing = False
@@ -505,12 +433,6 @@ class UNetMidBlock3D(nn.Module):
        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
        resnet_groups (`int`, *optional*, defaults to 32):
            The number of groups to use in the group normalization layers of the resnet blocks.
-        norm_layer (`str`, *optional*, defaults to `group_norm`):
-            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
-        inject_noise (`bool`, *optional*, defaults to `False`):
-            Whether to inject noise into the hidden states.
-        timestep_conditioning (`bool`, *optional*, defaults to `False`):
-            Whether to condition the hidden states on the timestep.

    Returns:
        `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
@@ -529,7 +451,6 @@ class UNetMidBlock3D(nn.Module):
        norm_layer: str = "group_norm",
        inject_noise: bool = False,
        timestep_conditioning: bool = False,
-        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        resnet_groups = (
@@ -555,17 +476,13 @@ class UNetMidBlock3D(nn.Module):
                    norm_layer=norm_layer,
                    inject_noise=inject_noise,
                    timestep_conditioning=timestep_conditioning,
-                    spatial_padding_mode=spatial_padding_mode,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        causal: bool = True,
-        timestep: Optional[torch.Tensor] = None,
+        self, hidden_states: torch.FloatTensor, causal: bool = True, timestep: Optional[torch.Tensor] = None
    ) -> torch.FloatTensor:
        timestep_embed = None
        if self.timestep_conditioning:
@@ -590,62 +507,9 @@ class UNetMidBlock3D(nn.Module):
        return hidden_states


-class SpaceToDepthDownsample(nn.Module):
-    def __init__(self, dims, in_channels, out_channels, stride, spatial_padding_mode):
-        super().__init__()
-        self.stride = stride
-        self.group_size = in_channels * math.prod(stride) // out_channels
-        self.conv = make_conv_nd(
-            dims=dims,
-            in_channels=in_channels,
-            out_channels=out_channels // math.prod(stride),
-            kernel_size=3,
-            stride=1,
-            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
-        )
-
-    def forward(self, x, causal: bool = True):
-        if self.stride[0] == 2:
-            x = torch.cat(
-                [x[:, :, :1, :, :], x], dim=2
-            )  # duplicate first frames for padding
-
-        # skip connection
-        x_in = rearrange(
-            x,
-            "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
-            p1=self.stride[0],
-            p2=self.stride[1],
-            p3=self.stride[2],
-        )
-        x_in = rearrange(x_in, "b (c g) d h w -> b c g d h w", g=self.group_size)
-        x_in = x_in.mean(dim=2)
-
-        # conv
-        x = self.conv(x, causal=causal)
-        x = rearrange(
-            x,
-            "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
-            p1=self.stride[0],
-            p2=self.stride[1],
-            p3=self.stride[2],
-        )
-
-        x = x + x_in
-
-        return x
-
-
 class DepthToSpaceUpsample(nn.Module):
    def __init__(
-        self,
-        dims,
-        in_channels,
-        stride,
-        residual=False,
-        out_channels_reduction_factor=1,
-        spatial_padding_mode="zeros",
+        self, dims, in_channels, stride, residual=False, out_channels_reduction_factor=1
    ):
        super().__init__()
        self.stride = stride
@@ -659,7 +523,6 @@ class DepthToSpaceUpsample(nn.Module):
            kernel_size=3,
            stride=1,
            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
        )
        self.residual = residual
        self.out_channels_reduction_factor = out_channels_reduction_factor
@@ -695,7 +558,7 @@ class DepthToSpaceUpsample(nn.Module):
 class LayerNorm(nn.Module):
    def __init__(self, dim, eps, elementwise_affine=True) -> None:
        super().__init__()
-        self.norm = ops.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
+        self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)

    def forward(self, x):
        x = rearrange(x, "b c d h w -> b d h w c")
@@ -728,7 +591,6 @@ class ResnetBlock3D(nn.Module):
        norm_layer: str = "group_norm",
        inject_noise: bool = False,
        timestep_conditioning: bool = False,
-        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.in_channels = in_channels
@@ -755,7 +617,6 @@ class ResnetBlock3D(nn.Module):
            stride=1,
            padding=1,
            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
        )

        if inject_noise:
@@ -780,7 +641,6 @@ class ResnetBlock3D(nn.Module):
            stride=1,
            padding=1,
            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
        )

        if inject_noise:
@@ -941,44 +801,9 @@ class processor(nn.Module):
        return (x - self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)) / self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x)

 class VideoVAE(nn.Module):
-    def __init__(self, version=0, config=None):
+    def __init__(self, version=0):
        super().__init__()

-        if config is None:
-            config = self.guess_config(version)
-
-        self.timestep_conditioning = config.get("timestep_conditioning", False)
-        double_z = config.get("double_z", True)
-        latent_log_var = config.get(
-            "latent_log_var", "per_channel" if double_z else "none"
-        )
-
-        self.encoder = Encoder(
-            dims=config["dims"],
-            in_channels=config.get("in_channels", 3),
-            out_channels=config["latent_channels"],
-            blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
-            patch_size=config.get("patch_size", 1),
-            latent_log_var=latent_log_var,
-            norm_layer=config.get("norm_layer", "group_norm"),
-            spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
-        )
-
-        self.decoder = Decoder(
-            dims=config["dims"],
-            in_channels=config["latent_channels"],
-            out_channels=config.get("out_channels", 3),
-            blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
-            patch_size=config.get("patch_size", 1),
-            norm_layer=config.get("norm_layer", "group_norm"),
-            causal=config.get("causal_decoder", False),
-            timestep_conditioning=self.timestep_conditioning,
-            spatial_padding_mode=config.get("spatial_padding_mode", "reflect"),
-        )
-
-        self.per_channel_statistics = processor()
-
-    def guess_config(self, version):
        if version == 0:
            config = {
                "_class_name": "CausalVideoAutoencoder",
@@ -1005,7 +830,7 @@ class VideoVAE(nn.Module):
                "use_quant_conv": False,
                "causal_decoder": False,
            }
-        elif version == 1:
+        else:
            config = {
                "_class_name": "CausalVideoAutoencoder",
                "dims": 3,
@@ -1041,47 +866,37 @@ class VideoVAE(nn.Module):
                "causal_decoder": False,
                "timestep_conditioning": True,
            }
-        else:
-            config = {
-                "_class_name": "CausalVideoAutoencoder",
-                "dims": 3,
-                "in_channels": 3,
-                "out_channels": 3,
-                "latent_channels": 128,
-                "encoder_blocks": [
-                    ["res_x", {"num_layers": 4}],
-                    ["compress_space_res", {"multiplier": 2}],
-                    ["res_x", {"num_layers": 6}],
-                    ["compress_time_res", {"multiplier": 2}],
-                    ["res_x", {"num_layers": 6}],
-                    ["compress_all_res", {"multiplier": 2}],
-                    ["res_x", {"num_layers": 2}],
-                    ["compress_all_res", {"multiplier": 2}],
-                    ["res_x", {"num_layers": 2}]
-                ],
-                "decoder_blocks": [
-                    ["res_x", {"num_layers": 5, "inject_noise": False}],
-                    ["compress_all", {"residual": True, "multiplier": 2}],
-                    ["res_x", {"num_layers": 5, "inject_noise": False}],
-                    ["compress_all", {"residual": True, "multiplier": 2}],
-                    ["res_x", {"num_layers": 5, "inject_noise": False}],
-                    ["compress_all", {"residual": True, "multiplier": 2}],
-                    ["res_x", {"num_layers": 5, "inject_noise": False}]
-                ],
-                "scaling_factor": 1.0,
-                "norm_layer": "pixel_norm",
-                "patch_size": 4,
-                "latent_log_var": "uniform",
-                "use_quant_conv": False,
-                "causal_decoder": False,
-                "timestep_conditioning": True
-            }
-        return config
+
+        double_z = config.get("double_z", True)
+        latent_log_var = config.get(
+            "latent_log_var", "per_channel" if double_z else "none"
+        )
+
+        self.encoder = Encoder(
+            dims=config["dims"],
+            in_channels=config.get("in_channels", 3),
+            out_channels=config["latent_channels"],
+            blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
+            patch_size=config.get("patch_size", 1),
+            latent_log_var=latent_log_var,
+            norm_layer=config.get("norm_layer", "group_norm"),
+        )
+
+        self.decoder = Decoder(
+            dims=config["dims"],
+            in_channels=config["latent_channels"],
+            out_channels=config.get("out_channels", 3),
+            blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
+            patch_size=config.get("patch_size", 1),
+            norm_layer=config.get("norm_layer", "group_norm"),
+            causal=config.get("causal_decoder", False),
+            timestep_conditioning=config.get("timestep_conditioning", False),
+        )
+
+        self.timestep_conditioning = config.get("timestep_conditioning", False)
+        self.per_channel_statistics = processor()

    def encode(self, x):
-        frames_count = x.shape[2]
-        if ((frames_count - 1) % 8) != 0:
-            raise ValueError("Invalid number of frames: Encode input must have 1 + 8 * x frames (e.g., 1, 9, 17, ...). Please check your input.")
        means, logvar = torch.chunk(self.encoder(x), 2, dim=1)
        return self.per_channel_statistics.normalize(means)

--- a/comfy/ldm/lightricks/vae/conv_nd_factory.py
+++ b/comfy/ldm/lightricks/vae/conv_nd_factory.py
@@ -17,11 +17,7 @@ def make_conv_nd(
    groups=1,
    bias=True,
    causal=False,
-    spatial_padding_mode="zeros",
-    temporal_padding_mode="zeros",
 ):
-    if not (spatial_padding_mode == temporal_padding_mode or causal):
-        raise NotImplementedError("spatial and temporal padding modes must be equal")
    if dims == 2:
        return ops.Conv2d(
            in_channels=in_channels,
@@ -32,7 +28,6 @@ def make_conv_nd(
            dilation=dilation,
            groups=groups,
            bias=bias,
-            padding_mode=spatial_padding_mode,
        )
    elif dims == 3:
        if causal:
@@ -45,7 +40,6 @@ def make_conv_nd(
                dilation=dilation,
                groups=groups,
                bias=bias,
-                spatial_padding_mode=spatial_padding_mode,
            )
        return ops.Conv3d(
            in_channels=in_channels,
@@ -56,7 +50,6 @@ def make_conv_nd(
            dilation=dilation,
            groups=groups,
            bias=bias,
-            padding_mode=spatial_padding_mode,
        )
    elif dims == (2, 1):
        return DualConv3d(
@@ -66,7 +59,6 @@ def make_conv_nd(
            stride=stride,
            padding=padding,
            bias=bias,
-            padding_mode=spatial_padding_mode,
        )
    else:
        raise ValueError(f"unsupported dimensions: {dims}")
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
pythongosssss	01110de8a3	Add tests for delete & update	2025-02-21 17:54:14 +00:00
pythongosssss	785a220757	refactor, adding tests	2025-02-16 17:22:48 +00:00
pythongosssss	b6b475191d	Add sqlite db	2025-01-30 21:48:53 +00:00