diff --git a/run.py b/run.py
index 16d31312..91b69bc8 100644
--- a/run.py
+++ b/run.py
@@ -21,17 +21,6 @@ def print_end_message(jobs_completed, jobs_failed):
     print("========================================")
 
 
-def run_job(
-        config: Union[str, dict, OrderedDict],
-        name=None
-):
-    from toolkit.job import get_job
-
-    job = get_job(config, name)
-    job.run()
-    job.cleanup()
-
-
 def main():
     parser = argparse.ArgumentParser()
 
diff --git a/test.py b/test.py
new file mode 100644
index 00000000..f69c116b
--- /dev/null
+++ b/test.py
@@ -0,0 +1,211 @@
+from collections import OrderedDict
+
+job_to_run = OrderedDict({
+    # This is the config I use on my sliders. It is solid and tested.
+    'job': 'train',
+    'config': {
+        # the name will be used to create a folder in the output folder
+        # it will also replace any [name] token in the rest of this config
+        'name': 'detail_slider_v1',
+        # a folder with the name above will be created in the folder below
+        # it can be relative to the project root or absolute
+        'training_folder': "output/LoRA",
+        'device': 'cuda',  # cpu, cuda:0, etc
+        # for tensorboard logging, we will make a subfolder for this job
+        'log_dir': "output/.tensorboard",
+        # you can stack processes for other jobs, but it is not tested with sliders,
+        # so just use one for now
+        'process': {
+            'type': 'slider',  # tells the runner to run the slider process
+            # network is the LoRA network for a slider. I recommend leaving this as is
+            'network': {
+                'type': "lora",
+                # rank / dim of the network. Bigger is not always better, especially for sliders. 8 is good
+                'linear': 8,  # "rank" or "dim"
+                'linear_alpha': 4,  # "alpha", do about half of rank
+                # 'conv': 4,  # for convolutional layers "locon"
+                # 'conv_alpha': 4,  # "alpha", do about half of conv
+            },
+            # training config
+            'train': {
+                # this is also used in sampling. Stick with ddpm unless you know what you are doing
+                'noise_scheduler': "ddpm",  # also supports "lms", "euler_a"
+                # how many steps to train. More is not always better. I rarely go over 1000
+                'steps': 100,
+                # I have had good results with 4e-4 to 1e-4 at 500 steps
+                'lr': 2e-4,
+                # enables gradient checkpointing, saves vram, leave it on
+                'gradient_checkpointing': True,
+                # train the unet. I recommend leaving this true
+                'train_unet': True,
+                # train the text encoder. I don't recommend this unless you have a special use case:
+                # for sliders we are adjusting the representation of the concept (unet),
+                # not the description of it (text encoder)
+                'train_text_encoder': False,
+
+                # just leave this unless you know what you are doing.
+                # "dadaptation" is also supported, but set lr to 1 if you use it;
+                # it learns too fast and I don't recommend it
+                'optimizer': "adamw",
+                # only constant for now
+                'lr_scheduler': "constant",
+                # while training, we denoise a random number of steps from 1 to this number.
+                # Just leave it
+                'max_denoising_steps': 40,
+                # works great at 1. I do 1 even with my 4090.
+                # higher may not work right with the newer single-batch stacking code anyway
+                'batch_size': 1,
+                # bf16 works best if your GPU supports it (modern)
+                'dtype': 'bf16',  # fp32, bf16, fp16
+                # I don't recommend using this unless you are trying to make a darker lora.
+                # Then do 0.1 MAX.
+                # although, the way we train sliders is comparative, so it probably won't work anyway
+                'noise_offset': 0.0,
+            },
+
+            # the model to train the LoRA network on
+            'model': {
+                # huggingface name, path relative to the project root, or absolute path to .safetensors or .ckpt
+                'name_or_path': "runwayml/stable-diffusion-v1-5",
+                'is_v2': False,  # for v2 models
+                'is_v_pred': False,  # for v-prediction models (most v2 models)
+                # has some issues with the dual text encoder and the way we train sliders;
+                # it works, but weights probably need to be higher to see the effect
+                'is_xl': False,  # for SDXL models
+            },
+
+            # saving config
+            'save': {
+                'dtype': 'float16',  # precision to save. I recommend float16
+                'save_every': 50,  # save every this many steps
+                # this will remove step saves beyond this number;
+                # allows you to save more often in case of a crash without filling up your drive
+                'max_step_saves_to_keep': 2,
+            },
+
+            # sampling config
+            'sample': {
+                # must match train.noise_scheduler. It is not used here,
+                # but may be in the future and in other processes
+                'sampler': "ddpm",
+                # sample every this many steps
+                'sample_every': 20,
+                # image size
+                'width': 512,
+                'height': 512,
+                # prompts to use for sampling. Do as many as you want, but it slows down training.
+                # pick ones that will best represent the concept you are trying to adjust.
+                # some flags are allowed after the prompt:
+                # --m [number]  # network multiplier (LoRA weight). -3 for the negative slide and 3 for the
+                #               # positive slide are good tests. Inherits sample.network_multiplier if not set
+                # --n [string]  # negative prompt. Inherits sample.neg if not set
+                # only 75 tokens allowed currently.
+                # I like to do a wide positive and negative spread so I can see a good range and stop
+                # early if the network is breaking down
+                'prompts': [
+                    "a woman in a coffee shop, black hat, blonde hair, blue jacket --m -5",
+                    "a woman in a coffee shop, black hat, blonde hair, blue jacket --m -3",
+                    "a woman in a coffee shop, black hat, blonde hair, blue jacket --m 3",
+                    "a woman in a coffee shop, black hat, blonde hair, blue jacket --m 5",
+                    "a golden retriever sitting on a leather couch --m -5",
+                    "a golden retriever sitting on a leather couch --m -3",
+                    "a golden retriever sitting on a leather couch --m 3",
+                    "a golden retriever sitting on a leather couch --m 5",
+                    "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -5",
+                    "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -3",
+                    "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 3",
+                    "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 5",
+                ],
+                # negative prompt used as the default for all prompts above if they don't set one
+                'neg': "cartoon, fake, drawing, illustration, cgi, animated, anime, monochrome",
+                # seed for sampling.
+                # 42 is the answer for everything
+                'seed': 42,
+                # walks the seed so s1 is 42, s2 is 43, s3 is 44, etc.
+                # It will start over on the next sample_every, so s1 is always seed.
+                # works well if you use the same prompt but want different results
+                'walk_seed': False,
+                # cfg scale (4 to 10 is good)
+                'guidance_scale': 7,
+                # sampler steps (20 to 30 is good)
+                'sample_steps': 20,
+                # default network multiplier for all prompts.
+                # since we are training a slider, I recommend overriding this with --m [number]
+                # in the prompts above to get both sides of the slider
+                'network_multiplier': 1.0,
+            },
+
+            # logging information
+            'logging': {
+                'log_every': 10,  # log every this many steps
+                'use_wandb': False,  # not supported yet
+                'verbose': False,  # you probably don't need this unless you are debugging
+            },
+
+            # slider training config, best for last
+            'slider': {
+                # resolutions to train on as [width, height]. This is less important for sliders,
+                # as we are not teaching the model anything it doesn't already know,
+                # but it must be a size the model understands: [512, 512] for sd_v1.5,
+                # [768, 768] for sd_v2.1, and [1024, 1024] for sd_xl.
+                # you can do as many as you want here
+                'resolutions': [
+                    [512, 512],
+                    # [512, 768],
+                    # [768, 768],
+                ],
+                # slider training uses 4 combined steps for a single round. This will do it in one gradient
+                # step. It is highly optimized and shouldn't take any more vram than doing it without,
+                # since we break down batches for gradient accumulation now. So just leave it on
+                'batch_full_slide': True,
+                # these are the concepts to train on. You can do as many as you want here,
+                # but they can conflict with and outweigh each other. Other than for experimenting,
+                # I recommend just doing one for good results
+                'targets': [
+                    # target_class is the base concept we are adjusting the representation of.
+                    # for example, if we are adjusting the representation of a person, we would use "person";
+                    # if we are adjusting the representation of a cat, we would use "cat". It is not
+                    # necessarily a keyword but what the model understands the concept to represent.
+                    # "person" will affect men, women, children, etc. but will not affect cats, dogs, etc.
+                    # it is the model's base general understanding of the concept and everything it represents.
+                    # you can leave it blank to affect everything. In this example, we are adjusting
+                    # detail, so we will leave it blank to affect everything
+                    {
+                        'target_class': "",
+                        # positive is the prompt for the positive side of the slider.
+                        # It is the concept that will be excited and amplified in the model when we slide
+                        # to the positive side, and forgotten / inverted when we slide
+                        # to the negative side. It is generally best to include the target_class in
+                        # the prompt. You want it to be the extreme of what you want to train on. For example,
+                        # if you want to train on fat people, you would use "an extremely fat, morbidly obese person"
+                        # as the prompt, not just "fat person".
+                        # max 75 tokens for now
+                        'positive': "high detail, 8k, intricate, detailed, high resolution, high res, high quality",
+                        # negative is the prompt for the negative side of the slider and works the same as positive.
+                        # it does not necessarily work the same as a negative prompt when generating images.
+                        # these need to be polar opposites.
+                        # max 75 tokens for now
+                        'negative': "blurry, boring, fuzzy, low detail, low resolution, low res, low quality",
+                        # the loss for this target is multiplied by this number.
+                        # if you are doing more than one target, it may be good to set less important ones
+                        # to a lower number like 0.1 so they don't outweigh the primary target
+                        'weight': 1.0,
+                    },
+                ],
+            },
+        },
+    },
+
+    # You can put any information you want here, and it will be saved in the model.
+    # The below is an example, but you can put your grocery list in it if you want.
+    # It is saved in the model, so be aware of that. The software will include this
+    # plus some other information for you automatically
+    'meta': {
+        # [name] gets replaced with the name above
+        'name': "[name]",
+        'version': '1.0',
+        # 'creator': {
+        #     'name': 'your name',
+        #     'email': 'your@gmail.com',
+        #     'website': 'https://your.website'
+        # }
+    }
+})
\ No newline at end of file
diff --git a/toolkit/job.py b/toolkit/job.py
index 0741b952..dc274fb7 100644
--- a/toolkit/job.py
+++ b/toolkit/job.py
@@ -33,3 +33,12 @@ def get_job(
         # return TrainJob(config)
     else:
         raise ValueError(f'Unknown job type {job}')
+
+
+def run_job(
+        config: Union[str, dict, OrderedDict],
+        name=None
+):
+    job = get_job(config, name)
+    job.run()
+    job.cleanup()
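
Note that test.py only defines job_to_run; nothing in this diff executes it. A minimal driver sketch, assuming test.py sits at the project root and is importable, using the run_job this diff relocates into toolkit/job.py:

    # hypothetical usage, not part of this diff
    from test import job_to_run      # the OrderedDict config defined above
    from toolkit.job import run_job  # wraps get_job(...), job.run(), job.cleanup()

    if __name__ == '__main__':
        # run_job accepts a config path string, a dict, or an OrderedDict
        run_job(job_to_run)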