diff --git a/README.md b/README.md
index 6d9b50e..ed0496f 100644
--- a/README.md
+++ b/README.md
@@ -23,19 +23,18 @@ Download the following models from HuggingFace into `models` directory:
 
 This code also works for [Stability AI SD3 Medium](https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/sd3_medium.safetensors).
 
 #### ControlNets
-Optionally, download [SD3.5 ControlNets](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets):
-(a) [Blur ControlNet](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets/resolve/main/sd3.5_large_controlnet_blur.safetensors)
-(b) [Canny ControlNet](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets/resolve/main/sd3.5_large_controlnet_canny.safetensors)
-(c) [Depth ControlNet](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets/resolve/main/sd3.5_large_controlnet_depth.safetensors)
-For example:
+
+Optionally, download [SD3.5 Large ControlNets](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets):
+(a) [Blur ControlNet](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets/resolve/main/sd3.5_large_controlnet_blur.safetensors)
+(b) [Canny ControlNet](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets/resolve/main/sd3.5_large_controlnet_canny.safetensors)
+(c) [Depth ControlNet](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets/resolve/main/sd3.5_large_controlnet_depth.safetensors)
+
 ```py
 from huggingface_hub import hf_hub_download
+hf_hub_download("stabilityai/stable-diffusion-3.5-controlnets", "sd3.5_large_controlnet_blur.safetensors", local_dir="models")
 hf_hub_download("stabilityai/stable-diffusion-3.5-controlnets", "sd3.5_large_controlnet_canny.safetensors", local_dir="models")
+hf_hub_download("stabilityai/stable-diffusion-3.5-controlnets", "sd3.5_large_controlnet_depth.safetensors", local_dir="models")
 ```
-
 ### Install
@@ -77,9 +76,18 @@ python3 sd3_infer.py --prompt path/to/my_prompts.txt --model models/sd3.5_medium
 
 #### ControlNets
 
-To use ControlNets, also download your chosen ControlNet model from the [model repository](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets), then run inference, like so:
+To use SD3.5 Large ControlNets, additionally download your chosen ControlNet model from the [model repository](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets), then run inference, like so:
+(a) Blur:
+```sh
+python sd3_infer.py --model models/sd3.5_large.safetensors --controlnet_ckpt models/sd3.5_large_controlnet_blur.safetensors --controlnet_cond_image inputs/blur.png --prompt "generated ai art, a tiny, lost rubber ducky in an action shot close-up, surfing the humongous waves, inside the tube, in the style of Kelly Slater"
+```
+(b) Canny:
+```sh
+python sd3_infer.py --model models/sd3.5_large.safetensors --controlnet_ckpt models/sd3.5_large_controlnet_canny.safetensors --controlnet_cond_image inputs/canny.png --prompt "A Night time photo taken by Leica M11, portrait of a Japanese woman in a kimono, looking at the camera, Cherry blossoms"
+```
+(c) Depth:
 ```sh
-python sd3_infer.py --controlnet_ckpt models/sd3.5_large_controlnet_canny.safetensors --controlnet_cond_image inputs/canny.png --prompt "A Night time photo taken by Leica M11, portrait of a Japanese woman in a loose and half worn kimono showing her bare shoulders, looking at the camera, Cherry blossoms"
+python sd3_infer.py --model models/sd3.5_large.safetensors --controlnet_ckpt models/sd3.5_large_controlnet_depth.safetensors --controlnet_cond_image inputs/depth.png --prompt "photo of woman, presumably in her mid-thirties, striking a balanced yoga pose on a rocky outcrop during dusk or dawn. She wears a light gray t-shirt and dark leggings. Her pose is dynamic, with one leg extended backward and the other bent at the knee, holding the moon close to her hand."
 ```
 
 For details on preprocessing for each of the ControlNets, and examples, please review the [model card](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets).
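The model card linked above describes the recommended preprocessing for each ControlNet. Purely as an illustration (this is not the model card's exact procedure; the source path and thresholds are placeholders), a Canny condition image for `--controlnet_cond_image` could be prepared along these lines with OpenCV and Pillow:

```py
# Hypothetical sketch: turn any photo into a Canny edge map for --controlnet_cond_image.
# Thresholds (100, 200) are illustrative; see the model card for recommended preprocessing.
import cv2
from PIL import Image

gray = cv2.imread("inputs/photo.jpg", cv2.IMREAD_GRAYSCALE)  # placeholder source image
edges = cv2.Canny(gray, 100, 200)  # uint8 edge map, white edges on black
Image.fromarray(edges).convert("RGB").save("inputs/canny.png")
```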
--prompt "photo of woman, presumably in her mid-thirties, striking a balanced yoga pose on a rocky outcrop during dusk or dawn. She wears a light gray t-shirt and dark leggings. Her pose is dynamic, with one leg extended backward and the other bent at the knee, holding the moon close to her hand." ``` For details on preprocessing for each of the ControlNets, and examples, please review the [model card](https://huggingface.co/stabilityai/stable-diffusion-3.5-controlnets). diff --git a/sd3_infer.py b/sd3_infer.py index a30b2d0..96480aa 100644 --- a/sd3_infer.py +++ b/sd3_infer.py @@ -14,13 +14,11 @@ import fire import numpy as np +import sd3_impls import torch +from other_impls import SD3Tokenizer, SDClipModel, SDXLClipG, T5XXLModel from PIL import Image from safetensors import safe_open -from tqdm import tqdm - -import sd3_impls -from other_impls import SD3Tokenizer, SDClipModel, SDXLClipG, T5XXLModel from sd3_impls import ( SDVAE, BaseModel, @@ -28,6 +26,7 @@ SD3LatentFormat, SkipLayerCFGDenoiser, ) +from tqdm import tqdm ################################################################################################# ### Wrappers for model parts @@ -236,7 +235,7 @@ def __init__(self, model, dtype: torch.dtype = torch.float16): # ControlNet CONTROLNET_COND_IMAGE = None # If init_image is given, this is the percentage of denoising steps to run (1.0 = full denoise, 0.0 = no denoise at all) -DENOISE = 0.6 +DENOISE = 0.8 # Output file path OUTDIR = "outputs" # SAMPLER @@ -396,7 +395,7 @@ def vae_encode( image_torch = torch.from_numpy(batch_images).cuda() if using_2b_controlnet: image_torch = image_torch * 2.0 - 1.0 - elif controlnet_type == 1: # canny + elif controlnet_type == 1: # canny image_torch = image_torch * 255 * 0.5 + 0.5 else: image_torch = 2.0 * image_torch - 1.0 @@ -503,14 +502,14 @@ def gen_image( CONFIGS = { "sd3_medium": { "shift": 1.0, - "cfg": 5.0, "steps": 50, + "cfg": 5.0, "sampler": "dpmpp_2m", }, "sd3.5_medium": { "shift": 3.0, - "cfg": 5.0, "steps": 50, + "cfg": 5.0, "sampler": "dpmpp_2m", "skip_layer_config": { "scale": 2.5, @@ -522,11 +521,29 @@ def gen_image( }, "sd3.5_large": { "shift": 3.0, - "cfg": 4.5, "steps": 40, + "cfg": 4.5, "sampler": "dpmpp_2m", }, "sd3.5_large_turbo": {"shift": 3.0, "cfg": 1.0, "steps": 4, "sampler": "euler"}, + "sd3.5_large_controlnet_blur": { + "shift": 3.0, + "steps": 60, + "cfg": 3.5, + "sampler": "euler", + }, + "sd3.5_large_controlnet_canny": { + "shift": 3.0, + "steps": 60, + "cfg": 3.5, + "sampler": "euler", + }, + "sd3.5_large_controlnet_depth": { + "shift": 3.0, + "steps": 60, + "cfg": 3.5, + "sampler": "euler", + }, } @@ -556,18 +573,13 @@ def main( **kwargs, ): assert not kwargs, f"Unknown arguments: {kwargs}" - steps = steps or CONFIGS.get(os.path.splitext(os.path.basename(model))[0], {}).get( - "steps", 50 - ) - cfg = cfg or CONFIGS.get(os.path.splitext(os.path.basename(model))[0], {}).get( - "cfg", 5 - ) - shift = shift or CONFIGS.get(os.path.splitext(os.path.basename(model))[0], {}).get( - "shift", 3 - ) - sampler = sampler or CONFIGS.get( - os.path.splitext(os.path.basename(model))[0], {} - ).get("sampler", "dpmpp_2m") + + config = CONFIGS.get(os.path.splitext(os.path.basename(model))[0], {}) + _shift = shift or config.get("shift", 3) + _steps = steps or config.get("steps", 50) + _cfg = cfg or config.get("cfg", 5) + _sampler = sampler or config.get("sampler", "dpmpp_2m") + if skip_layer_cfg: skip_layer_config = CONFIGS.get( os.path.splitext(os.path.basename(model))[0], {} @@ -576,12 +588,21 @@ def main( else: 
         skip_layer_config = {}
 
+    if controlnet_ckpt is not None:
+        controlnet_config = CONFIGS.get(
+            os.path.splitext(os.path.basename(controlnet_ckpt))[0], {}
+        )
+        _shift = shift or controlnet_config.get("shift", shift)
+        _steps = steps or controlnet_config.get("steps", steps)
+        _cfg = cfg or controlnet_config.get("cfg", cfg)
+        _sampler = sampler or controlnet_config.get("sampler", sampler)
+
     inferencer = SD3Inferencer()
 
     inferencer.load(
         model,
         vae,
-        shift,
+        _shift,
         controlnet_ckpt,
         model_folder,
         text_encoder_device,
@@ -616,9 +637,9 @@
         prompts,
         width,
         height,
-        steps,
-        cfg,
-        sampler,
+        _steps,
+        _cfg,
+        _sampler,
         seed,
         seed_type,
         out_dir,
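Taken together, the changes to `main()` resolve the sampling settings with a fixed precedence: an explicit CLI argument wins, otherwise a ControlNet config (keyed by the checkpoint's basename) overrides the base model's config, otherwise the built-in defaults apply. The sketch below restates that logic as a standalone function so the precedence is easy to see; the helper name `resolve_settings` and the trimmed `CONFIGS` excerpt are illustrative, not part of the patch.

```py
import os

# Trimmed excerpt of CONFIGS from sd3_infer.py, copied here so the sketch runs standalone.
CONFIGS = {
    "sd3.5_large": {"shift": 3.0, "steps": 40, "cfg": 4.5, "sampler": "dpmpp_2m"},
    "sd3.5_large_controlnet_canny": {"shift": 3.0, "steps": 60, "cfg": 3.5, "sampler": "euler"},
}


def resolve_settings(model, controlnet_ckpt=None, shift=None, steps=None, cfg=None, sampler=None):
    """CLI argument > ControlNet config > model config > built-in default."""

    def key(path):
        return os.path.splitext(os.path.basename(path))[0]

    config = CONFIGS.get(key(model), {})
    _shift = shift or config.get("shift", 3)
    _steps = steps or config.get("steps", 50)
    _cfg = cfg or config.get("cfg", 5)
    _sampler = sampler or config.get("sampler", "dpmpp_2m")
    if controlnet_ckpt is not None:
        # As in the patch, the fallback is the raw CLI value; every ControlNet entry in
        # CONFIGS defines all four keys, so these fallbacks are not hit in practice.
        controlnet_config = CONFIGS.get(key(controlnet_ckpt), {})
        _shift = shift or controlnet_config.get("shift", shift)
        _steps = steps or controlnet_config.get("steps", steps)
        _cfg = cfg or controlnet_config.get("cfg", cfg)
        _sampler = sampler or controlnet_config.get("sampler", sampler)
    return _shift, _steps, _cfg, _sampler


# The ControlNet config overrides the base model's steps/cfg/sampler ...
print(resolve_settings("models/sd3.5_large.safetensors",
                       "models/sd3.5_large_controlnet_canny.safetensors"))
# (3.0, 60, 3.5, 'euler')

# ... but an explicit CLI value still wins.
print(resolve_settings("models/sd3.5_large.safetensors",
                       "models/sd3.5_large_controlnet_canny.safetensors", steps=28))
# (3.0, 28, 3.5, 'euler')
```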