diff --git a/extensions-builtin/forge_legacy_preprocessors/annotator/clipvision/__init__.py b/extensions-builtin/forge_legacy_preprocessors/annotator/clipvision/__init__.py
deleted file mode 100644
index 0a9b1cea..00000000
--- a/extensions-builtin/forge_legacy_preprocessors/annotator/clipvision/__init__.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import os
-import cv2
-import torch
-
-from modules import devices
-from modules.modelloader import load_file_from_url
-from annotator.annotator_path import models_path
-from transformers import CLIPVisionModelWithProjection, CLIPVisionConfig, CLIPImageProcessor
-
-
-config_clip_g = {
-    "attention_dropout": 0.0,
-    "dropout": 0.0,
-    "hidden_act": "gelu",
-    "hidden_size": 1664,
-    "image_size": 224,
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 8192,
-    "layer_norm_eps": 1e-05,
-    "model_type": "clip_vision_model",
-    "num_attention_heads": 16,
-    "num_channels": 3,
-    "num_hidden_layers": 48,
-    "patch_size": 14,
-    "projection_dim": 1280,
-    "torch_dtype": "float32"
-}
-
-config_clip_h = {
-    "attention_dropout": 0.0,
-    "dropout": 0.0,
-    "hidden_act": "gelu",
-    "hidden_size": 1280,
-    "image_size": 224,
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 5120,
-    "layer_norm_eps": 1e-05,
-    "model_type": "clip_vision_model",
-    "num_attention_heads": 16,
-    "num_channels": 3,
-    "num_hidden_layers": 32,
-    "patch_size": 14,
-    "projection_dim": 1024,
-    "torch_dtype": "float32"
-}
-
-config_clip_vitl = {
-    "attention_dropout": 0.0,
-    "dropout": 0.0,
-    "hidden_act": "quick_gelu",
-    "hidden_size": 1024,
-    "image_size": 224,
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 4096,
-    "layer_norm_eps": 1e-05,
-    "model_type": "clip_vision_model",
-    "num_attention_heads": 16,
-    "num_channels": 3,
-    "num_hidden_layers": 24,
-    "patch_size": 14,
-    "projection_dim": 768,
-    "torch_dtype": "float32"
-}
-
-configs = {
-    'clip_g': config_clip_g,
-    'clip_h': config_clip_h,
-    'clip_vitl': config_clip_vitl,
-}
-
-downloads = {
-    'clip_vitl': 'https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/pytorch_model.bin',
-    'clip_g': 'https://huggingface.co/lllyasviel/Annotators/resolve/main/clip_g.pth',
-    'clip_h': 'https://huggingface.co/h94/IP-Adapter/resolve/main/models/image_encoder/pytorch_model.bin'
-}
-
-
-clip_vision_h_uc = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'clip_vision_h_uc.data')
-clip_vision_h_uc = torch.load(clip_vision_h_uc, map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))['uc']
-
-clip_vision_vith_uc = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'clip_vision_vith_uc.data')
-clip_vision_vith_uc = torch.load(clip_vision_vith_uc, map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))['uc']
-
-
-class ClipVisionDetector:
-    def __init__(self, config, low_vram: bool):
-        assert config in downloads
-        self.download_link = downloads[config]
-        self.model_path = os.path.join(models_path, 'clip_vision')
-        self.file_name = config + '.pth'
-        self.config = configs[config]
-        self.device = (
-            torch.device("cpu") if low_vram else
-            devices.get_device_for("controlnet")
-        )
-        os.makedirs(self.model_path, exist_ok=True)
-        file_path = os.path.join(self.model_path, self.file_name)
-        if not os.path.exists(file_path):
-            load_file_from_url(url=self.download_link, model_dir=self.model_path, file_name=self.file_name)
-        config = CLIPVisionConfig(**self.config)
-
-        self.model = CLIPVisionModelWithProjection(config)
-        self.processor = CLIPImageProcessor(crop_size=224,
-                                            do_center_crop=True,
-                                            do_convert_rgb=True,
-                                            do_normalize=True,
-                                            do_resize=True,
-                                            image_mean=[0.48145466, 0.4578275, 0.40821073],
-                                            image_std=[0.26862954, 0.26130258, 0.27577711],
-                                            resample=3,
-                                            size=224)
-        sd = torch.load(file_path, map_location=self.device)
-        self.model.load_state_dict(sd, strict=False)
-        del sd
-        self.model.to(self.device)
-        self.model.eval()
-
-    def unload_model(self):
-        if self.model is not None:
-            self.model.to('meta')
-
-    def __call__(self, input_image):
-        with torch.no_grad():
-            input_image = cv2.resize(input_image, (224, 224), interpolation=cv2.INTER_AREA)
-            feat = self.processor(images=input_image, return_tensors="pt")
-            feat['pixel_values'] = feat['pixel_values'].to(self.device)
-            result = self.model(**feat, output_hidden_states=True)
-            result['hidden_states'] = [v.to(self.device) for v in result['hidden_states']]
-            result = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in result.items()}
-            return result
diff --git a/extensions-builtin/forge_legacy_preprocessors/annotator/clipvision/clip_vision_h_uc.data b/extensions-builtin/forge_legacy_preprocessors/annotator/clipvision/clip_vision_h_uc.data
deleted file mode 100644
index 70c4a7bc..00000000
Binary files a/extensions-builtin/forge_legacy_preprocessors/annotator/clipvision/clip_vision_h_uc.data and /dev/null differ
diff --git a/extensions-builtin/forge_legacy_preprocessors/annotator/clipvision/clip_vision_vith_uc.data b/extensions-builtin/forge_legacy_preprocessors/annotator/clipvision/clip_vision_vith_uc.data
deleted file mode 100644
index 0c0a61af..00000000
Binary files a/extensions-builtin/forge_legacy_preprocessors/annotator/clipvision/clip_vision_vith_uc.data and /dev/null differ
diff --git a/extensions-builtin/forge_legacy_preprocessors/legacy_preprocessors/preprocessor_compiled.py b/extensions-builtin/forge_legacy_preprocessors/legacy_preprocessors/preprocessor_compiled.py
index e704be17..746d6896 100644
--- a/extensions-builtin/forge_legacy_preprocessors/legacy_preprocessors/preprocessor_compiled.py
+++ b/extensions-builtin/forge_legacy_preprocessors/legacy_preprocessors/preprocessor_compiled.py
@@ -393,86 +393,86 @@ legacy_preprocessors = {
             "Instant_ID"
         ]
     },
-    "ip-adapter_clip_sd15": {
-        "label": "ip-adapter_clip_sd15",
-        "call_function": functools.partial(clip, config='clip_h'),
-        "unload_function": functools.partial(unload_clip, config='clip_h'),
-        "managed_model": "unknown",
-        "model_free": False,
-        "no_control_mode": True,
-        "resolution": None,
-        "slider_1": None,
-        "slider_2": None,
-        "slider_3": None,
-        "priority": 100,
-        "tags": [
-            "IP-Adapter"
-        ]
-    },
-    "ip-adapter_clip_sdxl": {
-        "label": "ip-adapter_clip_sdxl",
-        "call_function": functools.partial(clip, config='clip_g'),
-        "unload_function": functools.partial(unload_clip, config='clip_g'),
-        "managed_model": "unknown",
-        "model_free": False,
-        "no_control_mode": True,
-        "resolution": None,
-        "slider_1": None,
-        "slider_2": None,
-        "slider_3": None,
-        "priority": 0,
-        "tags": [
-            "IP-Adapter"
-        ]
-    },
-    "ip-adapter_clip_sdxl_plus_vith": {
-        "label": "ip-adapter_clip_sdxl_plus_vith",
-        "call_function": functools.partial(clip, config='clip_h'),
-        "unload_function": functools.partial(unload_clip, config='clip_h'),
-        "managed_model": "unknown",
-        "model_free": False,
-        "no_control_mode": True,
-        "resolution": None,
-        "slider_1": None,
-        "slider_2": None,
-        "slider_3": None,
-        "priority": 0,
-        "tags": [
-            "IP-Adapter"
-        ]
-    },
-    "ip-adapter_face_id": {
"ip-adapter_face_id", - "call_function": g_insight_face_model.run_model, - "unload_function": None, - "managed_model": "g_insight_face_model", - "model_free": False, - "no_control_mode": True, - "resolution": None, - "slider_1": None, - "slider_2": None, - "slider_3": None, - "priority": 0, - "tags": [ - "IP-Adapter" - ] - }, - "ip-adapter_face_id_plus": { - "label": "ip-adapter_face_id_plus", - "call_function": face_id_plus, - "unload_function": functools.partial(unload_clip, config='clip_h'), - "managed_model": "unknown", - "model_free": False, - "no_control_mode": True, - "resolution": None, - "slider_1": None, - "slider_2": None, - "slider_3": None, - "priority": 0, - "tags": [ - "IP-Adapter" - ] - }, + # "ip-adapter_clip_sd15": { + # "label": "ip-adapter_clip_sd15", + # "call_function": functools.partial(clip, config='clip_h'), + # "unload_function": functools.partial(unload_clip, config='clip_h'), + # "managed_model": "unknown", + # "model_free": False, + # "no_control_mode": True, + # "resolution": None, + # "slider_1": None, + # "slider_2": None, + # "slider_3": None, + # "priority": 100, + # "tags": [ + # "IP-Adapter" + # ] + # }, + # "ip-adapter_clip_sdxl": { + # "label": "ip-adapter_clip_sdxl", + # "call_function": functools.partial(clip, config='clip_g'), + # "unload_function": functools.partial(unload_clip, config='clip_g'), + # "managed_model": "unknown", + # "model_free": False, + # "no_control_mode": True, + # "resolution": None, + # "slider_1": None, + # "slider_2": None, + # "slider_3": None, + # "priority": 0, + # "tags": [ + # "IP-Adapter" + # ] + # }, + # "ip-adapter_clip_sdxl_plus_vith": { + # "label": "ip-adapter_clip_sdxl_plus_vith", + # "call_function": functools.partial(clip, config='clip_h'), + # "unload_function": functools.partial(unload_clip, config='clip_h'), + # "managed_model": "unknown", + # "model_free": False, + # "no_control_mode": True, + # "resolution": None, + # "slider_1": None, + # "slider_2": None, + # "slider_3": None, + # "priority": 0, + # "tags": [ + # "IP-Adapter" + # ] + # }, + # "ip-adapter_face_id": { + # "label": "ip-adapter_face_id", + # "call_function": g_insight_face_model.run_model, + # "unload_function": None, + # "managed_model": "g_insight_face_model", + # "model_free": False, + # "no_control_mode": True, + # "resolution": None, + # "slider_1": None, + # "slider_2": None, + # "slider_3": None, + # "priority": 0, + # "tags": [ + # "IP-Adapter" + # ] + # }, + # "ip-adapter_face_id_plus": { + # "label": "ip-adapter_face_id_plus", + # "call_function": face_id_plus, + # "unload_function": functools.partial(unload_clip, config='clip_h'), + # "managed_model": "unknown", + # "model_free": False, + # "no_control_mode": True, + # "resolution": None, + # "slider_1": None, + # "slider_2": None, + # "slider_3": None, + # "priority": 0, + # "tags": [ + # "IP-Adapter" + # ] + # }, "lineart_anime": { "label": "lineart_anime", "call_function": lineart_anime, @@ -1193,22 +1193,22 @@ legacy_preprocessors = { "T2I-Adapter" ] }, - "t2ia_style_clipvision": { - "label": "t2ia_style_clipvision", - "call_function": functools.partial(clip, config='clip_vitl'), - "unload_function": functools.partial(unload_clip, config='clip_vitl'), - "managed_model": "unknown", - "model_free": False, - "no_control_mode": True, - "resolution": None, - "slider_1": None, - "slider_2": None, - "slider_3": None, - "priority": 0, - "tags": [ - "T2I-Adapter" - ] - }, + # "t2ia_style_clipvision": { + # "label": "t2ia_style_clipvision", + # "call_function": functools.partial(clip, 
+    #     "call_function": functools.partial(clip, config='clip_vitl'),
+    #     "unload_function": functools.partial(unload_clip, config='clip_vitl'),
+    #     "managed_model": "unknown",
+    #     "model_free": False,
+    #     "no_control_mode": True,
+    #     "resolution": None,
+    #     "slider_1": None,
+    #     "slider_2": None,
+    #     "slider_3": None,
+    #     "priority": 0,
+    #     "tags": [
+    #         "T2I-Adapter"
+    #     ]
+    # },
     "threshold": {
         "label": "threshold",
         "call_function": threshold,
diff --git a/extensions-builtin/forge_preprocessor_clipvision/scripts/preprocessor_clipvision.py b/extensions-builtin/forge_preprocessor_clipvision/scripts/preprocessor_clipvision.py
index 74f3de22..248ca124 100644
--- a/extensions-builtin/forge_preprocessor_clipvision/scripts/preprocessor_clipvision.py
+++ b/extensions-builtin/forge_preprocessor_clipvision/scripts/preprocessor_clipvision.py
@@ -1,11 +1,51 @@
-from modules_forge.supported_preprocessor import Preprocessor, PreprocessorParameter
+from modules_forge.supported_preprocessor import Preprocessor
 from modules_forge.shared import preprocessor_dir, add_supported_preprocessor
 from modules.modelloader import load_file_from_url
+from modules_forge.forge_util import numpy_to_pytorch
+
+import ldm_patched.modules.clip_vision
 
 
 class PreprocessorClipVision(Preprocessor):
-    def __init__(self):
+    def __init__(self, name, url, filename):
         super().__init__()
+        self.name = name
+        self.url = url
+        self.filename = filename
+        self.tags = ['IP-Adapter']
+        self.corp_image_with_a1111_mask_when_in_img2img_inpaint_tab = False
+        self.show_control_mode = False
+        self.sorting_priority = 1
+        self.clipvision = None
+
+    def __call__(self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, **kwargs):
+        if self.clipvision is None:
+            ckpt_path = load_file_from_url(
+                url=self.url,
+                model_dir=preprocessor_dir,
+                file_name=self.filename
+            )
+            self.clipvision = ldm_patched.modules.clip_vision.load(ckpt_path)
+
+        input_image = numpy_to_pytorch(input_image).to(self.clipvision.patcher.current_device)
+
+        return self.clipvision.encode_image(input_image)
 
 
-add_supported_preprocessor(PreprocessorClipVision())
+add_supported_preprocessor(PreprocessorClipVision(
+    name='CLIP-ViT-H',
+    url='https://huggingface.co/h94/IP-Adapter/resolve/main/models/image_encoder/model.safetensors',
+    filename='CLIP-ViT-H-14.safetensors'
+))
+
+add_supported_preprocessor(PreprocessorClipVision(
+    name='CLIP-ViT-bigG',
+    url='https://huggingface.co/h94/IP-Adapter/resolve/main/models/image_encoder/model.safetensors',
+    filename='CLIP-ViT-bigG.safetensors'
+))
+
+add_supported_preprocessor(PreprocessorClipVision(
+    name='CLIP-ViT-L',
+    url='https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/pytorch_model.bin',
+    filename='CLIP-ViT-bigG.safetensors'
+))
diff --git a/modules_forge/patch_basic.py b/modules_forge/patch_basic.py
index 44f534e5..21ef134e 100644
--- a/modules_forge/patch_basic.py
+++ b/modules_forge/patch_basic.py
@@ -1,5 +1,7 @@
 import torch
+import os
 import time
+import safetensors
 import ldm_patched.modules.samplers
 
 from ldm_patched.modules.controlnet import ControlBase
@@ -193,6 +195,40 @@ def patched_load_models_gpu(*args, **kwargs):
     return y
 
 
+def build_loaded(module, loader_name):
+    original_loader_name = loader_name + '_origin'
+
+    if not hasattr(module, original_loader_name):
+        setattr(module, original_loader_name, getattr(module, loader_name))
+
+    original_loader = getattr(module, original_loader_name)
+
+    def loader(*args, **kwargs):
+        result = None
+        try:
+            result = original_loader(*args, **kwargs)
+        except Exception as e:
+            result = None
+            exp = str(e) + '\n'
+            for path in list(args) + list(kwargs.values()):
+                if isinstance(path, str):
+                    if os.path.exists(path):
+                        exp += f'File corrupted: {path} \n'
+                        corrupted_backup_file = path + '.corrupted'
+                        if os.path.exists(corrupted_backup_file):
+                            os.remove(corrupted_backup_file)
+                        os.replace(path, corrupted_backup_file)
+                        if os.path.exists(path):
+                            os.remove(path)
+                        exp += f'Forge has tried to move the corrupted file to {corrupted_backup_file} \n'
+                        exp += f'You may try again now and Fooocus will download models again. \n'
+            raise ValueError(exp)
+        return result
+
+    setattr(module, loader_name, loader)
+    return
+
+
 def patch_all_basics():
     if not hasattr(model_management, 'load_models_gpu_origin'):
         model_management.load_models_gpu_origin = model_management.load_models_gpu
@@ -201,4 +237,7 @@
 
     ControlBase.control_merge = patched_control_merge
     ldm_patched.modules.samplers.calc_cond_uncond_batch = patched_calc_cond_uncond_batch
+
+    build_loaded(safetensors.torch, 'load_file')
+    build_loaded(torch, 'load')
     return
diff --git a/modules_forge/supported_preprocessor.py b/modules_forge/supported_preprocessor.py
index 7724f16d..09721f00 100644
--- a/modules_forge/supported_preprocessor.py
+++ b/modules_forge/supported_preprocessor.py
@@ -26,7 +26,7 @@ class Preprocessor:
         self.model_patcher: ModelPatcher = None
         self.show_control_mode = True
         self.do_not_need_model = False
-        self.sorting_priority = 0.0  # higher goes to top in the list
+        self.sorting_priority = 0  # higher goes to top in the list
         self.corp_image_with_a1111_mask_when_in_img2img_inpaint_tab = True
 
     def setup_model_patcher(self, model, load_device=None, offload_device=None, dtype=torch.float32, **kwargs):