# ai-toolkit/config/examples/train_slider.example.yml

---
# Slider training example config. It is written in YAML, but JSON is
# accepted too if you prefer it; YAML is easier to write and allows
# comments, which is nice for documentation.
# These are the settings I use for my own sliders; they are solid and tested.
job: train
config:
  # Used as the name of the folder created inside the output folder, and
  # substituted for every [name] token in the rest of this config.
  name: pet_slider_v1
  # Parent folder in which the folder named above is created.
  # May be relative to the project root, or absolute.
  training_folder: "output/LoRA"
  # Device to run on: cpu, cuda:0, etc.
  device: "cuda:0"
  # Tensorboard log location; a subfolder is made for this job.
  log_dir: "output/.tensorboard"
  # Processes can be stacked to run other jobs, but stacking is untested
  # with sliders, so just use one for now.
  process:
    # Tells the runner to run the slider process.
    - type: slider
      # The LoRA network used for the slider. I recommend leaving these as is.
      network:
        # "lierla" is traditional LoRA: linear layers only, works everywhere.
        type: "lierla"
        # Rank (dim) of the network. Bigger is not always better,
        # especially for sliders; 8 is a good value.
        rank: 8
        # Just leave it.
        alpha: 1.0

      # Training config.
      train:
        # Noise scheduler; it is also used in sampling.
        # Options: "ddpm", "lms", "euler_a".
        # Stick with ddpm unless you know what you are doing.
        noise_scheduler: "ddpm"
        # How many steps to train. More is not always better;
        # I rarely go over 1000.
        steps: 500
        # I have had good results with 1e-4 to 4e-4 at 500 steps.
        # Keep the decimal point in the value: YAML 1.1 resolvers (e.g. stock
        # PyYAML) read a bare 1e-4 as the string "1e-4" instead of a float.
        lr: 1.0e-4
        # Enables gradient checkpointing. It saves VRAM; leave it on.
        gradient_checkpointing: true
        # Train the unet. I recommend leaving this true.
        train_unet: true
        # Train the text encoder. I don't recommend this unless you have a
        # special use case: for sliders we are adjusting the representation of
        # the concept (unet), not the description of it (text encoder).
        train_text_encoder: false

        # Just leave this unless you know what you are doing.
        # "dadaptation" is also supported (set lr to 1 if you use it),
        # but it learns too fast and I don't recommend it.
        optimizer: "adamw"
        # Only constant for now.
        lr_scheduler: "constant"
        # While training we denoise a random number of steps, from 1 up to
        # this number. Just leave it.
        max_denoising_steps: 40
        # Works great at 1. I do 1 even with my 4090.
        batch_size: 1
        # One of fp32, bf16, fp16.
        # bf16 works best if your GPU supports it (modern).
        dtype: bf16
        # If you have it, use it. It is faster and better.
        xformers: true
        # I don't recommend using this unless you are trying to make a darker
        # LoRA; then use 0.1 MAX. That said, the way we train sliders is
        # comparative, so it probably won't work anyway.
        noise_offset: 0.0

      # The model the LoRA network is trained on.
      model:
        # Hugging Face name, a path relative to the project, or an absolute
        # path to a .safetensors or .ckpt file.
        name_or_path: "runwayml/stable-diffusion-v1-5"
        # For v2 models.
        is_v2: false
        # For v-prediction models (most v2 models).
        is_v_pred: false
        # For SDXL models.
        is_xl: false

      # Saving config.
      save:
        # Precision to save in. I recommend float16.
        dtype: float16
        # Save every this many steps.
        save_every: 50

      # Sampling config.
      sample:
        # Must match train.noise_scheduler. It is not used here, but may be
        # in the future and in other processes.
        sampler: "ddpm"
        # Sample every this many steps.
        sample_every: 20
        # Image size.
        width: 512
        height: 512
        # Prompts to use for sampling. Use as many as you want, but each one
        # slows down training. Pick the ones that best represent the concept
        # you are trying to adjust. Some flags are allowed after the prompt:
        #   --m [number]  network multiplier (LoRA weight). -3 for the negative
        #                 slide and 3 for the positive slide are good tests.
        #                 Inherits sample.network_multiplier if not set.
        #   --n [string]  negative prompt. Inherits sample.neg if not set.
        #
        # Only 75 tokens are allowed currently.
        # Our example is an animal slider: neg is dog, pos is cat.
        prompts:
          - "a golden retriever --m -5"
          - "a golden retriever --m -3"
          - "a golden retriever --m 3"
          - "a golden retriever --m 5"
          - "calico cat --m -5"
          - "calico cat --m -3"
          - "calico cat --m 3"
          - "calico cat --m 5"
          - "an elephant --m -5"
          - "an elephant --m -3"
          - "an elephant --m 3"
          - "an elephant --m 5"
        # Default negative prompt, applied to every prompt above that does
        # not carry its own.
        neg: "cartoon, fake, drawing, illustration, cgi, animated, anime, monochrome"
        # Seed for sampling. 42 is the answer for everything.
        seed: 42
        # Walks the seed, so s1 is 42, s2 is 43, s3 is 44, etc. It starts over
        # on the next sample_every, so s1 always uses the seed itself.
        # Works well if you use the same prompt but want different results.
        walk_seed: false
        # CFG scale (4 to 10 is good).
        guidance_scale: 7
        # Sampler steps (20 to 30 is good).
        sample_steps: 20
        # Default network multiplier for all prompts. Since we are training a
        # slider, I recommend overriding this with --m [number] in the prompts
        # above so that both sides of the slider are sampled.
        network_multiplier: 1.0

      # Logging information.
      logging:
        # Log every this many steps.
        log_every: 10
        # Not supported yet.
        use_wandb: false
        # Probably not needed unless you are debugging.
        verbose: false

      # Slider training config; saved the best for last.
      slider:
        # Resolutions to train on, as [width, height]. This matters less for
        # sliders because we are not teaching the model anything it doesn't
        # already know, but each must be a size the model understands:
        # [512, 512] for sd_v1.5 and [768, 768] for sd_v2.1.
        # You can list as many as you want here.
        resolutions:
          - [512, 512]
          # - [512, 768]
          # - [768, 768]
        # The concepts to train on. You can list as many as you want, but they
        # can conflict with or outweigh each other. Other than for
        # experimenting, I recommend using just one for good results.
        targets:
          # target_class is the base concept whose representation we are
          # adjusting. For example, to adjust the representation of a person
          # use "person"; to adjust the representation of a cat use "cat".
          # It is not necessarily a keyword, but what the model understands
          # the concept to represent. "person" will affect men, women,
          # children, etc. but will not affect cats, dogs, etc. It is the
          # model's base general understanding of the concept and everything
          # it represents.
          - target_class: "animal"
            # positive is the prompt for the positive side of the slider: the
            # concept that is excited and amplified in the model when the
            # slider moves to the positive side, and forgotten / inverted when
            # it moves to the negative side. It is generally best to include
            # the target_class in the prompt, and to make it the extreme of
            # what you want to train on. For example, to train on fat people
            # you would use "an extremely fat, morbidly obese person" as the
            # prompt, not just "fat person".
            positive: "cat"
            # negative is the prompt for the negative side of the slider and
            # works the same way as positive. It does not necessarily work the
            # same as a negative prompt when generating images.
            negative: "dog"
            # The loss for this target is multiplied by this number. With more
            # than one target it may be good to set the less important ones to
            # a lower number like 0.1 so they don't outweigh the primary one.
            weight: 1.0

        # Anchors are prompts that we try to hold on to while training the
        # slider. They should generate an image very similar to the
        # target_class without directly overlapping it. For example, if you
        # are training on a person smiling, you would use "a person with a
        # face mask" as an anchor: it is a person, and the image is the same
        # regardless of whether they are smiling or not.
        anchors:
          # NOTE(review): the original note here reads "only positive prompt
          # for now", yet every anchor also sets neg_prompt -- confirm whether
          # neg_prompt is actually used.
          - prompt: "a woman"
            neg_prompt: "animal"
            # The multiplier applied to the LoRA when this anchor is run.
            # Higher gives it more weight, but also helps keep the LoRA from
            # collapsing.
            multiplier: 8.0
          - prompt: "a man"
            neg_prompt: "animal"
            multiplier: 8.0
          - prompt: "a person"
            neg_prompt: "animal"
            multiplier: 8.0

# Anything you put here is saved in the model. The entries below are only an
# example -- you could store your grocery list if you wanted -- but be aware
# that it all ends up in the model. The software automatically includes this
# plus some other information for you.
meta:
  # [name] is replaced with the name set above.
  name: "[name]"
  # version: '1.0'
  # creator:
  #   name: Your Name
  #   email: your@gmail.com
  #   website: https://your.website