From 34db804c76b75ccc32d685d7212462a5aaa2cbfc Mon Sep 17 00:00:00 2001
From: martintomov <101264514+martintomov@users.noreply.github.com>
Date: Fri, 23 Aug 2024 06:25:44 +0300
Subject: [PATCH] Modal cloud training support, fixed typo in
toolkit/scheduler.py, Schnell training support for Colab, issue #92 , issue
#114 (#115)
* issue #76, load_checkpoint_and_dispatch() 'force_hooks'
https://github.com/ostris/ai-toolkit/issues/76
* RunPod cloud config
https://github.com/ostris/ai-toolkit/issues/90
* change 2x A40 to 1x A40 and price per hour
referring to https://github.com/ostris/ai-toolkit/issues/90#issuecomment-2294894929
* include missed FLUX.1-schnell setup guide in last commit
* huggingface-cli login required auth
* #92 peft, #114 colab, schnell training in colab
* modal cloud - run_modal.py and .yaml configs
* run_modal.py mount path example
* modal_examples renamed to modal
* Training in Modal README.md setup guide
* rename run command in title for consistency
---
README.md | 72 ++++-
.../modal/modal_train_lora_flux_24gb.yaml | 96 ++++++
.../modal_train_lora_flux_schnell_24gb.yaml | 98 ++++++
notebooks/FLUX_1_dev_LoRA_Training.ipynb | 166 +++++-----
notebooks/FLUX_1_schnell_LoRA_Training.ipynb | 296 ++++++++++++++++++
requirements.txt | 1 +
run_modal.py | 175 +++++++++++
toolkit/scheduler.py | 2 +-
8 files changed, 817 insertions(+), 89 deletions(-)
create mode 100644 config/examples/modal/modal_train_lora_flux_24gb.yaml
create mode 100644 config/examples/modal/modal_train_lora_flux_schnell_24gb.yaml
create mode 100644 notebooks/FLUX_1_schnell_LoRA_Training.ipynb
create mode 100644 run_modal.py
diff --git a/README.md b/README.md
index c054b142..45c94b57 100644
--- a/README.md
+++ b/README.md
@@ -117,7 +117,7 @@ Please do not open a bug report unless it is a bug in the code. You are welcome
and ask for help there. However, please refrain from PMing me directly with general question or support. Ask in the discord
and I will answer when I can.
-### Training in RunPod cloud
+## Training in RunPod
Example RunPod template: **runpod/pytorch:2.2.0-py3.10-cuda12.1.1-devel-ubuntu22.04**
> You need a minimum of 24GB VRAM, pick a GPU by your preference.
@@ -142,26 +142,72 @@ pip install -r requirements.txt
pip install --upgrade accelerate transformers diffusers huggingface_hub #Optional, run it if you run into issues
```
### 2. Upload your dataset
-- Create a new folder in the root, name it `dataset` or whatever you like
-- Drag and drop your .jpg and .txt files inside the newly created dataset folder
+- Create a new folder in the root, name it `dataset` or whatever you like.
+- Drag and drop your .jpg, .jpeg, or .png images and .txt files inside the newly created dataset folder.
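+
+For reference, each image needs a caption .txt file with the same base name. A dataset folder might look like this (file names are just an example):
+```
+dataset/
+  img_001.jpg
+  img_001.txt
+  img_002.jpg
+  img_002.txt
+```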
### 3. Login into Hugging Face with an Access Token
-- Get a READ token from [here](https://huggingface.co/settings/tokens)
-- Run ```huggingface-cli login``` and paste your token
+- Get a READ token from [here](https://huggingface.co/settings/tokens) and request access to the FLUX.1-dev model [here](https://huggingface.co/black-forest-labs/FLUX.1-dev).
+- Run ```huggingface-cli login``` and paste your token.
### 4. Training
-- Copy an example config file located at ```config/examples``` to the config folder and rename it to ```whatever_you_want.yml```
-- Edit the config following the comments in the file
-- Change ```folder_path: "/path/to/images/folder"``` to your dataset path like ```folder_path: "/workspace/ai-toolkit/your-dataset"```
-- Run the file: ```python run.py config/whatever_you_want.yml```
+- Copy an example config file located at ```config/examples``` to the config folder and rename it to ```whatever_you_want.yml```.
+- Edit the config following the comments in the file.
+- Change ```folder_path: "/path/to/images/folder"``` to your dataset path like ```folder_path: "/workspace/ai-toolkit/your-dataset"``` (see the sketch after this list).
+- Run the file: ```python run.py config/whatever_you_want.yml```.
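+
+A minimal sketch of the edited `datasets` entry (the remaining keys follow the example config):
+```
+datasets:
+  - folder_path: "/workspace/ai-toolkit/your-dataset"
+    caption_ext: "txt"
+    resolution: [ 512, 768, 1024 ]
+```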
### Screenshot from RunPod
-
+## Training in Modal
+
+### 1. Setup
+#### ai-toolkit:
+```
+git clone https://github.com/ostris/ai-toolkit.git
+cd ai-toolkit
+git submodule update --init --recursive
+python -m venv venv
+source venv/bin/activate
+pip install torch
+pip install -r requirements.txt
+pip install --upgrade accelerate transformers diffusers huggingface_hub #Optional, run it if you run into issues
+```
+#### Modal:
+- Run `pip install modal` to install the modal Python package.
+- Run `modal setup` to authenticate (if this doesn’t work, try `python -m modal setup`).
+
+#### Hugging Face:
+- Get a READ token from [here](https://huggingface.co/settings/tokens) and request access to the FLUX.1-dev model [here](https://huggingface.co/black-forest-labs/FLUX.1-dev).
+- Run `huggingface-cli login` and paste your token.
+
+### 2. Upload your dataset
+- Drag and drop your dataset folder, containing your .jpg, .jpeg, or .png images and .txt files, into `ai-toolkit`.
+
+### 3. Configs
+- Copy an example config file located at ```config/examples/modal``` to the `config` folder and rename it to ```whatever_you_want.yml```.
+- Edit the config following the comments in the file; **be careful to keep the example `/root/ai-toolkit` paths** (see the sketch below).
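+
+A sketch of the paths to double-check, with values taken from the example configs:
+```
+training_folder: "/root/ai-toolkit/modal_output" # must match MOUNT_DIR from run_modal.py
+datasets:
+  - folder_path: "/root/ai-toolkit/your-dataset" # your dataset folder inside ai-toolkit
+```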
+
+### 4. Edit run_modal.py
+- Set the full local path to your `ai-toolkit` directory in `code_mount = modal.Mount.from_local_dir`, like:
+
+ ```
+ code_mount = modal.Mount.from_local_dir("/Users/username/ai-toolkit", remote_path="/root/ai-toolkit")
+ ```
+- Choose a `gpu` and `timeout` in `@app.function` _(the default is an A100 40GB with a 2-hour timeout)_.
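+
+For reference, the corresponding decorator in `run_modal.py` looks like this:
+```
+@app.function(
+    # request a GPU with at least 24GB VRAM, more about modal GPUs: https://modal.com/docs/guide/gpu
+    gpu="A100", # gpu="H100"
+    # more about modal timeouts: https://modal.com/docs/guide/timeouts
+    timeout=7200 # 2 hours, increase or decrease if needed
+)
+```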
+
+### 5. Training
+- Run the config file in your terminal: `modal run run_modal.py --config-file-list-str=/root/ai-toolkit/config/whatever_you_want.yml`.
+- You can monitor your training in your local terminal, or on [modal.com](https://modal.com/).
+- Models, samples, and optimizer state will be stored in `Storage > flux-lora-models`.
+
+### 6. Saving the model
+- Check the contents of the volume by running `modal volume ls flux-lora-models`.
+- Download the content by running `modal volume get flux-lora-models your-model-name`.
+- Example: `modal volume get flux-lora-models my_first_flux_lora_v1`.
+
+### Screenshot from Modal
+
+
---
diff --git a/config/examples/modal/modal_train_lora_flux_24gb.yaml b/config/examples/modal/modal_train_lora_flux_24gb.yaml
new file mode 100644
index 00000000..51873de0
--- /dev/null
+++ b/config/examples/modal/modal_train_lora_flux_24gb.yaml
@@ -0,0 +1,96 @@
+---
+job: extension
+config:
+ # this name will be the folder and filename name
+ name: "my_first_flux_lora_v1"
+ process:
+ - type: 'sd_trainer'
+ # root folder to save training sessions/samples/weights
+ training_folder: "/root/ai-toolkit/modal_output" # must match MOUNT_DIR from run_modal.py
+ # uncomment to see performance stats in the terminal every N steps
+# performance_log_every: 1000
+ device: cuda:0
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+# trigger_word: "p3r5on"
+ network:
+ type: "lora"
+ linear: 16
+ linear_alpha: 16
+ save:
+ dtype: float16 # precision to save
+ save_every: 250 # save every this many steps
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
+ datasets:
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+ # images will automatically be resized and bucketed into the resolution specified
+ # on windows, escape back slashes with another backslash so
+ # "C:\\path\\to\\images\\folder"
+        # your dataset must be placed inside your local ai-toolkit folder; /root/ai-toolkit is where Modal mounts it:
+ - folder_path: "/root/ai-toolkit/your-dataset"
+ caption_ext: "txt"
+          caption_dropout_rate: 0.05 # will drop out the caption 5% of the time
+ shuffle_tokens: false # shuffle caption order, split by commas
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
+ resolution: [ 512, 768, 1024 ] # flux enjoys multiple resolutions
+ train:
+ batch_size: 1
+        steps: 2000 # total number of steps to train. 500 - 4000 is a good range
+ gradient_accumulation_steps: 1
+ train_unet: true
+ train_text_encoder: false # probably won't work with flux
+        gradient_checkpointing: true # need this on unless you have a ton of vram
+ noise_scheduler: "flowmatch" # for training only
+ optimizer: "adamw8bit"
+ lr: 1e-4
+ # uncomment this to skip the pre training sample
+# skip_first_sample: true
+ # uncomment to completely disable sampling
+# disable_sampling: true
+        # uncomment to use new bell curved weighting. Experimental but may produce better results
+# linear_timesteps: true
+
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
+ ema_config:
+ use_ema: true
+ ema_decay: 0.99
+
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
+ dtype: bf16
+ model:
+ # huggingface model name or path
+ # if you get an error, or get stuck while downloading,
+ # check https://github.com/ostris/ai-toolkit/issues/84, download the model locally and
+ # place it like "/root/ai-toolkit/FLUX.1-dev"
+ name_or_path: "black-forest-labs/FLUX.1-dev"
+ is_flux: true
+ quantize: true # run 8bit mixed precision
+# low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
+ sample:
+ sampler: "flowmatch" # must match train.noise_scheduler
+ sample_every: 250 # sample every this many steps
+ width: 1024
+ height: 1024
+ prompts:
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+# - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+ - "a bear building a log cabin in the snow covered mountains"
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+ - "hipster man with a beard, building a chair, in a wood shop"
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+ - "a man holding a sign that says, 'this is a sign'"
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+ neg: "" # not used on flux
+ seed: 42
+ walk_seed: true
+ guidance_scale: 4
+ sample_steps: 20
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+ name: "[name]"
+ version: '1.0'
diff --git a/config/examples/modal/modal_train_lora_flux_schnell_24gb.yaml b/config/examples/modal/modal_train_lora_flux_schnell_24gb.yaml
new file mode 100644
index 00000000..6d1e964f
--- /dev/null
+++ b/config/examples/modal/modal_train_lora_flux_schnell_24gb.yaml
@@ -0,0 +1,98 @@
+---
+job: extension
+config:
+ # this name will be the folder and filename name
+ name: "my_first_flux_lora_v1"
+ process:
+ - type: 'sd_trainer'
+ # root folder to save training sessions/samples/weights
+ training_folder: "/root/ai-toolkit/modal_output" # must match MOUNT_DIR from run_modal.py
+ # uncomment to see performance stats in the terminal every N steps
+# performance_log_every: 1000
+ device: cuda:0
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+# trigger_word: "p3r5on"
+ network:
+ type: "lora"
+ linear: 16
+ linear_alpha: 16
+ save:
+ dtype: float16 # precision to save
+ save_every: 250 # save every this many steps
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
+ datasets:
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+ # images will automatically be resized and bucketed into the resolution specified
+ # on windows, escape back slashes with another backslash so
+ # "C:\\path\\to\\images\\folder"
+        # your dataset must be placed inside your local ai-toolkit folder; /root/ai-toolkit is where Modal mounts it:
+ - folder_path: "/root/ai-toolkit/your-dataset"
+ caption_ext: "txt"
+          caption_dropout_rate: 0.05 # will drop out the caption 5% of the time
+ shuffle_tokens: false # shuffle caption order, split by commas
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
+ resolution: [ 512, 768, 1024 ] # flux enjoys multiple resolutions
+ train:
+ batch_size: 1
+        steps: 2000 # total number of steps to train. 500 - 4000 is a good range
+ gradient_accumulation_steps: 1
+ train_unet: true
+ train_text_encoder: false # probably won't work with flux
+        gradient_checkpointing: true # need this on unless you have a ton of vram
+ noise_scheduler: "flowmatch" # for training only
+ optimizer: "adamw8bit"
+ lr: 1e-4
+ # uncomment this to skip the pre training sample
+# skip_first_sample: true
+ # uncomment to completely disable sampling
+# disable_sampling: true
+        # uncomment to use new bell curved weighting. Experimental but may produce better results
+# linear_timesteps: true
+
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
+ ema_config:
+ use_ema: true
+ ema_decay: 0.99
+
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
+ dtype: bf16
+ model:
+ # huggingface model name or path
+ # if you get an error, or get stuck while downloading,
+ # check https://github.com/ostris/ai-toolkit/issues/84, download the models locally and
+ # place them like "/root/ai-toolkit/FLUX.1-schnell" and "/root/ai-toolkit/FLUX.1-schnell-training-adapter"
+ name_or_path: "black-forest-labs/FLUX.1-schnell"
+ assistant_lora_path: "ostris/FLUX.1-schnell-training-adapter" # Required for flux schnell training
+ is_flux: true
+ quantize: true # run 8bit mixed precision
+        # low_vram is painfully slow to fuse in the adapter; avoid it unless absolutely necessary
+# low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
+ sample:
+ sampler: "flowmatch" # must match train.noise_scheduler
+ sample_every: 250 # sample every this many steps
+ width: 1024
+ height: 1024
+ prompts:
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+# - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+ - "a bear building a log cabin in the snow covered mountains"
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+ - "hipster man with a beard, building a chair, in a wood shop"
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+ - "a man holding a sign that says, 'this is a sign'"
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+ neg: "" # not used on flux
+ seed: 42
+ walk_seed: true
+ guidance_scale: 1 # schnell does not do guidance
+ sample_steps: 4 # 1 - 4 works well
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+ name: "[name]"
+ version: '1.0'
diff --git a/notebooks/FLUX_1_dev_LoRA_Training.ipynb b/notebooks/FLUX_1_dev_LoRA_Training.ipynb
index d3174813..8cfcd1fe 100644
--- a/notebooks/FLUX_1_dev_LoRA_Training.ipynb
+++ b/notebooks/FLUX_1_dev_LoRA_Training.ipynb
@@ -1,53 +1,45 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "provenance": [],
- "machine_shape": "hm",
- "gpuType": "A100"
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- },
- "language_info": {
- "name": "python"
- },
- "accelerator": "GPU"
- },
"cells": [
{
"cell_type": "markdown",
- "source": [
- "# AI Toolkit by Ostris\n",
- "## FLUX.1 Training\n"
- ],
"metadata": {
"collapsed": false,
"id": "zl-S0m3pkQC5"
- }
+ },
+ "source": [
+ "# AI Toolkit by Ostris\n",
+ "## FLUX.1-dev Training\n"
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
- "!git clone https://github.com/ostris/ai-toolkit\n",
- "!mkdir -p /content/dataset"
- ],
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "BvAG0GKAh59G"
},
- "execution_count": null,
- "outputs": []
+ "outputs": [],
+ "source": [
+ "!git clone https://github.com/ostris/ai-toolkit\n",
+ "!mkdir -p /content/dataset"
+ ]
},
{
"cell_type": "markdown",
- "source": [
- "Put your image dataset in the `/content/dataset` folder"
- ],
"metadata": {
"id": "UFUW4ZMmnp1V"
- }
+ },
+ "source": [
+ "Put your image dataset in the `/content/dataset` folder"
+ ]
},
{
"cell_type": "code",
@@ -62,6 +54,9 @@
},
{
"cell_type": "markdown",
+ "metadata": {
+ "id": "OV0HnOI6o8V6"
+ },
"source": [
"## Model License\n",
"Training currently only works with FLUX.1-dev. Which means anything you train will inherit the non-commercial license. It is also a gated model, so you need to accept the license on HF before using it. Otherwise, this will fail. Here are the required steps to setup a license.\n",
@@ -69,13 +64,15 @@
"Sign into HF and accept the model access here [black-forest-labs/FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev)\n",
"\n",
"[Get a READ key from huggingface](https://huggingface.co/settings/tokens/new?) and place it in the next cell after running it."
- ],
- "metadata": {
- "id": "OV0HnOI6o8V6"
- }
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "3yZZdhFRoj2m"
+ },
+ "outputs": [],
"source": [
"import getpass\n",
"import os\n",
@@ -87,15 +84,15 @@
"os.environ['HF_TOKEN'] = hf_token\n",
"\n",
"print(\"HF_TOKEN environment variable has been set.\")"
- ],
- "metadata": {
- "id": "3yZZdhFRoj2m"
- },
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "9gO2EzQ1kQC8"
+ },
+ "outputs": [],
"source": [
"import os\n",
"import sys\n",
@@ -105,26 +102,26 @@
"from PIL import Image\n",
"import os\n",
"os.environ[\"HF_HUB_ENABLE_HF_TRANSFER\"] = \"1\""
- ],
- "metadata": {
- "id": "9gO2EzQ1kQC8"
- },
- "outputs": [],
- "execution_count": null
+ ]
},
{
"cell_type": "markdown",
+ "metadata": {
+ "id": "N8UUFzVRigbC"
+ },
"source": [
"## Setup\n",
"\n",
"This is your config. It is documented pretty well. Normally you would do this as a yaml file, but for colab, this will work. This will run as is without modification, but feel free to edit as you want."
- ],
- "metadata": {
- "id": "N8UUFzVRigbC"
- }
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "_t28QURYjRQO"
+ },
+ "outputs": [],
"source": [
"from collections import OrderedDict\n",
"\n",
@@ -169,7 +166,7 @@
" ]),\n",
" ('train', OrderedDict([\n",
" ('batch_size', 1),\n",
- " ('steps', 4000), # total number of steps to train 500 - 4000 is a good range\n",
+ " ('steps', 2000), # total number of steps to train 500 - 4000 is a good range\n",
" ('gradient_accumulation_steps', 1),\n",
" ('train_unet', True),\n",
" ('train_text_encoder', False), # probably won't work with flux\n",
@@ -177,9 +174,16 @@
" ('gradient_checkpointing', True), # need the on unless you have a ton of vram\n",
" ('noise_scheduler', 'flowmatch'), # for training only\n",
" ('optimizer', 'adamw8bit'),\n",
- " ('lr', 4e-4),\n",
+ " ('lr', 1e-4),\n",
+ "\n",
" # uncomment this to skip the pre training sample\n",
- " #('skip_first_sample', True),\n",
+ " # ('skip_first_sample', True),\n",
+ "\n",
+ " # uncomment to completely disable sampling\n",
+ " # ('disable_sampling', True),\n",
+ "\n",
+    "                    # uncomment to use new bell curved weighting. Experimental but may produce better results\n",
+ " # ('linear_timesteps', True),\n",
"\n",
" # ema will smooth out learning, but could slow it down. Recommended to leave on.\n",
" ('ema_config', OrderedDict([\n",
@@ -231,45 +235,57 @@
" ('version', '1.0')\n",
" ]))\n",
"])\n"
- ],
- "metadata": {
- "id": "_t28QURYjRQO"
- },
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
+ "metadata": {
+ "id": "h6F1FlM2Wb3l"
+ },
"source": [
"## Run it\n",
"\n",
"Below does all the magic. Check your folders to the left. Items will be in output/LoRA/your_name_v1 In the samples folder, there are preiodic sampled. This doesnt work great with colab. They will be in /content/output"
- ],
- "metadata": {
- "id": "h6F1FlM2Wb3l"
- }
+ ]
},
{
"cell_type": "code",
- "source": [
- "run_job(job_to_run)\n"
- ],
+ "execution_count": null,
"metadata": {
"id": "HkajwI8gteOh"
},
- "execution_count": null,
- "outputs": []
+ "outputs": [],
+ "source": [
+ "run_job(job_to_run)\n"
+ ]
},
{
"cell_type": "markdown",
+ "metadata": {
+ "id": "Hblgb5uwW5SD"
+ },
"source": [
"## Done\n",
"\n",
"Check your ourput dir and get your slider\n"
- ],
- "metadata": {
- "id": "Hblgb5uwW5SD"
- }
+ ]
}
- ]
-}
\ No newline at end of file
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "A100",
+ "machine_shape": "hm",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/notebooks/FLUX_1_schnell_LoRA_Training.ipynb b/notebooks/FLUX_1_schnell_LoRA_Training.ipynb
new file mode 100644
index 00000000..652d8ccc
--- /dev/null
+++ b/notebooks/FLUX_1_schnell_LoRA_Training.ipynb
@@ -0,0 +1,296 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false,
+ "id": "zl-S0m3pkQC5"
+ },
+ "source": [
+ "# AI Toolkit by Ostris\n",
+ "## FLUX.1-schnell Training\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "3cokMT-WC6rG"
+ },
+ "outputs": [],
+ "source": [
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "id": "BvAG0GKAh59G"
+ },
+ "outputs": [],
+ "source": [
+ "!git clone https://github.com/ostris/ai-toolkit\n",
+ "!mkdir -p /content/dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UFUW4ZMmnp1V"
+ },
+ "source": [
+ "Put your image dataset in the `/content/dataset` folder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "id": "XGZqVER_aQJW"
+ },
+ "outputs": [],
+ "source": [
+ "!cd ai-toolkit && git submodule update --init --recursive && pip install -r requirements.txt\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "OV0HnOI6o8V6"
+ },
+ "source": [
+ "## Model License\n",
+ "Training currently only works with FLUX.1-dev. Which means anything you train will inherit the non-commercial license. It is also a gated model, so you need to accept the license on HF before using it. Otherwise, this will fail. Here are the required steps to setup a license.\n",
+    "FLUX.1-schnell is released under the Apache 2.0 license and is not gated, so LoRAs you train with it do not inherit the FLUX.1-dev non-commercial license. Schnell training does require the assistant adapter `ostris/FLUX.1-schnell-training-adapter`, which is already set in the config below.\n",
+    "\n",
+ "[Get a READ key from huggingface](https://huggingface.co/settings/tokens/new?) and place it in the next cell after running it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "3yZZdhFRoj2m"
+ },
+ "outputs": [],
+ "source": [
+ "import getpass\n",
+ "import os\n",
+ "\n",
+ "# Prompt for the token\n",
+ "hf_token = getpass.getpass('Enter your HF access token and press enter: ')\n",
+ "\n",
+ "# Set the environment variable\n",
+ "os.environ['HF_TOKEN'] = hf_token\n",
+ "\n",
+ "print(\"HF_TOKEN environment variable has been set.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "id": "9gO2EzQ1kQC8"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "sys.path.append('/content/ai-toolkit')\n",
+ "from toolkit.job import run_job\n",
+ "from collections import OrderedDict\n",
+ "from PIL import Image\n",
+ "import os\n",
+ "os.environ[\"HF_HUB_ENABLE_HF_TRANSFER\"] = \"1\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "N8UUFzVRigbC"
+ },
+ "source": [
+ "## Setup\n",
+ "\n",
+ "This is your config. It is documented pretty well. Normally you would do this as a yaml file, but for colab, this will work. This will run as is without modification, but feel free to edit as you want."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "_t28QURYjRQO"
+ },
+ "outputs": [],
+ "source": [
+ "from collections import OrderedDict\n",
+ "\n",
+ "job_to_run = OrderedDict([\n",
+ " ('job', 'extension'),\n",
+ " ('config', OrderedDict([\n",
+ " # this name will be the folder and filename name\n",
+ " ('name', 'my_first_flux_lora_v1'),\n",
+ " ('process', [\n",
+ " OrderedDict([\n",
+ " ('type', 'sd_trainer'),\n",
+ " # root folder to save training sessions/samples/weights\n",
+ " ('training_folder', '/content/output'),\n",
+ " # uncomment to see performance stats in the terminal every N steps\n",
+ " #('performance_log_every', 1000),\n",
+ " ('device', 'cuda:0'),\n",
+ " # if a trigger word is specified, it will be added to captions of training data if it does not already exist\n",
+ " # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word\n",
+ " # ('trigger_word', 'image'),\n",
+ " ('network', OrderedDict([\n",
+ " ('type', 'lora'),\n",
+ " ('linear', 16),\n",
+ " ('linear_alpha', 16)\n",
+ " ])),\n",
+ " ('save', OrderedDict([\n",
+ " ('dtype', 'float16'), # precision to save\n",
+ " ('save_every', 250), # save every this many steps\n",
+ " ('max_step_saves_to_keep', 4) # how many intermittent saves to keep\n",
+ " ])),\n",
+ " ('datasets', [\n",
+ " # datasets are a folder of images. captions need to be txt files with the same name as the image\n",
+ " # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently\n",
+ " # images will automatically be resized and bucketed into the resolution specified\n",
+ " OrderedDict([\n",
+ " ('folder_path', '/content/dataset'),\n",
+ " ('caption_ext', 'txt'),\n",
+    "                        ('caption_dropout_rate', 0.05), # will drop out the caption 5% of the time\n",
+ " ('shuffle_tokens', False), # shuffle caption order, split by commas\n",
+ " ('cache_latents_to_disk', True), # leave this true unless you know what you're doing\n",
+ " ('resolution', [512, 768, 1024]) # flux enjoys multiple resolutions\n",
+ " ])\n",
+ " ]),\n",
+ " ('train', OrderedDict([\n",
+ " ('batch_size', 1),\n",
+ " ('steps', 2000), # total number of steps to train 500 - 4000 is a good range\n",
+ " ('gradient_accumulation_steps', 1),\n",
+ " ('train_unet', True),\n",
+ " ('train_text_encoder', False), # probably won't work with flux\n",
+    "                    ('gradient_checkpointing', True), # need this on unless you have a ton of vram\n",
+ " ('noise_scheduler', 'flowmatch'), # for training only\n",
+ " ('optimizer', 'adamw8bit'),\n",
+ " ('lr', 1e-4),\n",
+ "\n",
+ " # uncomment this to skip the pre training sample\n",
+ " # ('skip_first_sample', True),\n",
+ "\n",
+ " # uncomment to completely disable sampling\n",
+ " # ('disable_sampling', True),\n",
+ "\n",
+    "                    # uncomment to use new bell curved weighting. Experimental but may produce better results\n",
+ " # ('linear_timesteps', True),\n",
+ "\n",
+ " # ema will smooth out learning, but could slow it down. Recommended to leave on.\n",
+ " ('ema_config', OrderedDict([\n",
+ " ('use_ema', True),\n",
+ " ('ema_decay', 0.99)\n",
+ " ])),\n",
+ "\n",
+ " # will probably need this if gpu supports it for flux, other dtypes may not work correctly\n",
+ " ('dtype', 'bf16')\n",
+ " ])),\n",
+ " ('model', OrderedDict([\n",
+ " # huggingface model name or path\n",
+ " ('name_or_path', 'black-forest-labs/FLUX.1-schnell'),\n",
+ " ('assistant_lora_path', 'ostris/FLUX.1-schnell-training-adapter'), # Required for flux schnell training\n",
+ " ('is_flux', True),\n",
+ " ('quantize', True), # run 8bit mixed precision\n",
+    "                    # low_vram is painfully slow to fuse in the adapter; avoid it unless absolutely necessary\n",
+ " #('low_vram', True), # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.\n",
+ " ])),\n",
+ " ('sample', OrderedDict([\n",
+ " ('sampler', 'flowmatch'), # must match train.noise_scheduler\n",
+ " ('sample_every', 250), # sample every this many steps\n",
+ " ('width', 1024),\n",
+ " ('height', 1024),\n",
+ " ('prompts', [\n",
+ " # you can add [trigger] to the prompts here and it will be replaced with the trigger word\n",
+ " #'[trigger] holding a sign that says \\'I LOVE PROMPTS!\\'',\n",
+ " 'woman with red hair, playing chess at the park, bomb going off in the background',\n",
+ " 'a woman holding a coffee cup, in a beanie, sitting at a cafe',\n",
+ " 'a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini',\n",
+ " 'a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background',\n",
+ " 'a bear building a log cabin in the snow covered mountains',\n",
+ " 'woman playing the guitar, on stage, singing a song, laser lights, punk rocker',\n",
+ " 'hipster man with a beard, building a chair, in a wood shop',\n",
+ " 'photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop',\n",
+ " 'a man holding a sign that says, \\'this is a sign\\'',\n",
+ " 'a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle'\n",
+ " ]),\n",
+ " ('neg', ''), # not used on flux\n",
+ " ('seed', 42),\n",
+ " ('walk_seed', True),\n",
+ " ('guidance_scale', 1), # schnell does not do guidance\n",
+ " ('sample_steps', 4) # 1 - 4 works well\n",
+ " ]))\n",
+ " ])\n",
+ " ])\n",
+ " ])),\n",
+ " # you can add any additional meta info here. [name] is replaced with config name at top\n",
+ " ('meta', OrderedDict([\n",
+ " ('name', '[name]'),\n",
+ " ('version', '1.0')\n",
+ " ]))\n",
+ "])\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "h6F1FlM2Wb3l"
+ },
+ "source": [
+ "## Run it\n",
+ "\n",
+    "Below does all the magic. Check your folders to the left. Outputs will be in `/content/output/my_first_flux_lora_v1`, and periodic samples will be written to its samples folder (viewing them doesn't work great in Colab)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "HkajwI8gteOh"
+ },
+ "outputs": [],
+ "source": [
+ "run_job(job_to_run)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Hblgb5uwW5SD"
+ },
+ "source": [
+ "## Done\n",
+ "\n",
+    "Check your output dir and get your LoRA.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "A100",
+ "machine_shape": "hm",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/requirements.txt b/requirements.txt
index 119eeee0..1b03df7e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,3 +29,4 @@ pytorch_fid
optimum-quanto
sentencepiece
huggingface_hub
+peft
\ No newline at end of file
diff --git a/run_modal.py b/run_modal.py
new file mode 100644
index 00000000..4675c1cb
--- /dev/null
+++ b/run_modal.py
@@ -0,0 +1,175 @@
+'''
+
+ostris/ai-toolkit on https://modal.com
+Run training with the following command:
+modal run run_modal.py --config-file-list-str=/root/ai-toolkit/config/whatever_you_want.yml
+
+'''
+
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+import sys
+import modal
+from dotenv import load_dotenv
+# Load the .env file if it exists
+load_dotenv()
+
+sys.path.insert(0, "/root/ai-toolkit")
+# must come before ANY torch or fastai imports
+# import toolkit.cuda_malloc
+
+# turn off diffusers telemetry until I can figure out how to make it opt-in
+os.environ['DISABLE_TELEMETRY'] = 'YES'
+
+# define the volume for storing model outputs, using "creating volumes lazily": https://modal.com/docs/guide/volumes
+# you will find your model, samples and optimizer stored in: https://modal.com/storage/your-username/main/flux-lora-models
+model_volume = modal.Volume.from_name("flux-lora-models", create_if_missing=True)
+
+# modal_output, due to "cannot mount volume on non-empty path" requirement
+MOUNT_DIR = "/root/ai-toolkit/modal_output"
+
+# define modal app
+image = (
+ modal.Image.debian_slim(python_version="3.11")
+ # install required system and pip packages, more about this modal approach: https://modal.com/docs/examples/dreambooth_app
+ .apt_install("libgl1", "libglib2.0-0")
+ .pip_install(
+ "python-dotenv",
+ "torch",
+ "diffusers[torch]",
+ "transformers",
+ "ftfy",
+ "torchvision",
+ "oyaml",
+ "opencv-python",
+ "albumentations",
+ "safetensors",
+ "lycoris-lora==1.8.3",
+ "flatten_json",
+ "pyyaml",
+ "tensorboard",
+ "kornia",
+ "invisible-watermark",
+ "einops",
+ "accelerate",
+ "toml",
+ "pydantic",
+ "omegaconf",
+ "k-diffusion",
+ "open_clip_torch",
+ "timm",
+ "prodigyopt",
+ "controlnet_aux==0.0.7",
+ "bitsandbytes",
+ "hf_transfer",
+ "lpips",
+ "pytorch_fid",
+ "optimum-quanto",
+ "sentencepiece",
+ "huggingface_hub",
+ "peft"
+ )
+)
+
+# mount for the entire ai-toolkit directory
+# example: "/Users/username/ai-toolkit" is the local directory, "/root/ai-toolkit" is the remote directory
+code_mount = modal.Mount.from_local_dir("/Users/username/ai-toolkit", remote_path="/root/ai-toolkit")
+
+# create the Modal app with the necessary mounts and volumes
+app = modal.App(name="flux-lora-training", image=image, mounts=[code_mount], volumes={MOUNT_DIR: model_volume})
+
+# Check if we have DEBUG_TOOLKIT in env
+if os.environ.get("DEBUG_TOOLKIT", "0") == "1":
+ # Set torch to trace mode
+ import torch
+ torch.autograd.set_detect_anomaly(True)
+
+import argparse
+from toolkit.job import get_job
+
+def print_end_message(jobs_completed, jobs_failed):
+ failure_string = f"{jobs_failed} failure{'' if jobs_failed == 1 else 's'}" if jobs_failed > 0 else ""
+ completed_string = f"{jobs_completed} completed job{'' if jobs_completed == 1 else 's'}"
+
+ print("")
+ print("========================================")
+ print("Result:")
+ if len(completed_string) > 0:
+ print(f" - {completed_string}")
+ if len(failure_string) > 0:
+ print(f" - {failure_string}")
+ print("========================================")
+
+
+@app.function(
+ # request a GPU with at least 24GB VRAM
+ # more about modal GPU's: https://modal.com/docs/guide/gpu
+ gpu="A100", # gpu="H100"
+ # more about modal timeouts: https://modal.com/docs/guide/timeouts
+ timeout=7200 # 2 hours, increase or decrease if needed
+)
+def main(config_file_list_str: str, recover: bool = False, name: str = None):
+ # convert the config file list from a string to a list
+ config_file_list = config_file_list_str.split(",")
+
+ jobs_completed = 0
+ jobs_failed = 0
+
+ print(f"Running {len(config_file_list)} job{'' if len(config_file_list) == 1 else 's'}")
+
+ for config_file in config_file_list:
+ try:
+ job = get_job(config_file, name)
+
+ job.config['process'][0]['training_folder'] = MOUNT_DIR
+ os.makedirs(MOUNT_DIR, exist_ok=True)
+ print(f"Training outputs will be saved to: {MOUNT_DIR}")
+
+ # run the job
+ job.run()
+
+ # commit the volume after training
+ model_volume.commit()
+
+ job.cleanup()
+ jobs_completed += 1
+ except Exception as e:
+ print(f"Error running job: {e}")
+ jobs_failed += 1
+ if not recover:
+ print_end_message(jobs_completed, jobs_failed)
+ raise e
+
+ print_end_message(jobs_completed, jobs_failed)
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ # require at least one config file
+ parser.add_argument(
+ 'config_file_list',
+ nargs='+',
+ type=str,
+        help='Name of config file (eg: person_v1 for config/person_v1.json/yaml), or full path if it is not in the config folder. You can pass multiple config files to run them all sequentially.'
+ )
+
+ # flag to continue if a job fails
+ parser.add_argument(
+ '-r', '--recover',
+ action='store_true',
+ help='Continue running additional jobs even if a job fails'
+ )
+
+ # optional name replacement for config file
+ parser.add_argument(
+ '-n', '--name',
+ type=str,
+ default=None,
+ help='Name to replace [name] tag in config file, useful for shared config file'
+ )
+ args = parser.parse_args()
+
+ # convert list of config files to a comma-separated string for Modal compatibility
+ config_file_list_str = ",".join(args.config_file_list)
+
+ main.call(config_file_list_str=config_file_list_str, recover=args.recover, name=args.name)
diff --git a/toolkit/scheduler.py b/toolkit/scheduler.py
index 95ae36d8..f6f8f61a 100644
--- a/toolkit/scheduler.py
+++ b/toolkit/scheduler.py
@@ -26,7 +26,7 @@ def get_lr_scheduler(
optimizer, **kwargs
)
elif name == "constant":
- if 'facor' not in kwargs:
+ if 'factor' not in kwargs:
kwargs['factor'] = 1.0
return torch.optim.lr_scheduler.ConstantLR(optimizer, **kwargs)