From 4c4a10d439afc72f29797974b836f91964a5cdb5 Mon Sep 17 00:00:00 2001 From: Jaret Burkett Date: Wed, 6 Aug 2025 11:40:02 -0600 Subject: [PATCH] Remove vision model from qwen text encoder as it is not needed for image generation currently --- extensions_built_in/diffusion_models/qwen_image/qwen_image.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/extensions_built_in/diffusion_models/qwen_image/qwen_image.py b/extensions_built_in/diffusion_models/qwen_image/qwen_image.py index f7607ccb..bcd42ed3 100644 --- a/extensions_built_in/diffusion_models/qwen_image/qwen_image.py +++ b/extensions_built_in/diffusion_models/qwen_image/qwen_image.py @@ -130,6 +130,10 @@ class QwenImageModel(BaseModel): text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained( base_model_path, subfolder="text_encoder", torch_dtype=dtype ) + + # remove the visual model as it is not needed for image generation + text_encoder.model.visual = None + text_encoder.to(self.device_torch, dtype=dtype) flush()