From 11fa71445814ac382f0ec481852bcc0d0cf6846f Mon Sep 17 00:00:00 2001 From: Thomas Mello Date: Thu, 6 Oct 2022 20:59:22 +0300 Subject: [PATCH 01/14] docs: add info about docker hub images (#1447) --- docs/3.docker-guide.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/docs/3.docker-guide.md b/docs/3.docker-guide.md index 6f9d3b6..662b8d4 100644 --- a/docs/3.docker-guide.md +++ b/docs/3.docker-guide.md @@ -19,6 +19,34 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . -->
+## Running the prebuilt image
+
+The easiest way to run Stable Diffusion WebUI is to use the prebuilt image from Docker Hub.
+
+```bash
+docker pull hlky/sd-webui:runpod
+```
+This image has all the necessary models baked in. It is quite large, but it streamlines the process of managing the various models and simplifies the user experience.
+
+Alternatively, you can pull:
+```bash
+docker pull hlky/sd-webui:latest
+```
+This image includes only the barebones environment needed to run the Web UI. The models will be downloaded during the installation process, and you will have to provide a volume for the `sd/models` directory yourself.
+
+
+It is recommended that you run the `runpod` version.
+You can run the image using the following command:
+```bash
+docker container run --rm -d -p 8501:8501 -e STREAMLIT_SERVER_HEADLESS=true -e "WEBUI_SCRIPT=webui_streamlit.py" -e "VALIDATE_MODELS=false" -v "${PWD}/outputs:/sd/outputs" --gpus all hlky/sd-webui:runpod
+```
+
+> Note: if you are running it on RunPod, it only supports one volume mount, which is used for your outputs.
+
+> Note: if you are running it on your local machine, the output directory will be created in the current directory from which you run this command.
+
+## Building the image
+
 This Docker environment is intended to speed up development and testing of Stable Diffusion WebUI features. Use of a container image format allows for packaging and isolation of Stable Diffusion / WebUI's dependencies separate from the Host environment. You can use this Dockerfile to build a Docker image and run Stable Diffusion WebUI locally. From d4823e89f8d65293fb3fa1cd6b6f95766c2f462a Mon Sep 17 00:00:00 2001 From: ZeroCool940711 Date: Thu, 6 Oct 2022 12:21:54 -0700 Subject: [PATCH 02/14] Fixed wrong links to gfpgan v1.3 on the Readme.md instead of using the v1.4 links. 
--- README.md | 54 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index ccb49e2..67f2ce3 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ ## [Visit sd-webui's Discord Server](https://discord.gg/gyXNe4NySY) [![Discord Server](https://user-images.githubusercontent.com/5977640/190528254-9b5b4423-47ee-4f24-b4f9-fd13fba37518.png)](https://discord.gg/gyXNe4NySY) ## Installation instructions for: + - **[Windows](https://sd-webui.github.io/stable-diffusion-webui/docs/1.windows-installation.html)** - **[Linux](https://sd-webui.github.io/stable-diffusion-webui/docs/2.linux-installation.html)** @@ -20,7 +21,8 @@ Come to our [Discord Server](https://discord.gg/gyXNe4NySY) or use [Discussions] Check the [Contribution Guide](CONTRIBUTING.md) -[sd-webui](https://github.com/sd-webui) is: +[sd-webui](https://github.com/sd-webui) main devs: + * ![hlky's avatar](https://avatars.githubusercontent.com/u/106811348?s=40&v=4) [hlky](https://github.com/hlky) * ![ZeroCool940711's avatar](https://avatars.githubusercontent.com/u/5977640?s=40&v=4)[ZeroCool940711](https://github.com/ZeroCool940711) * ![codedealer's avatar](https://avatars.githubusercontent.com/u/4258136?s=40&v=4)[codedealer](https://github.com/codedealer) @@ -28,13 +30,21 @@ Check the [Contribution Guide](CONTRIBUTING.md) ### Project Features: * Two great Web UI's to choose from: Streamlit or Gradio + * No more manually typing parameters, now all you have to do is write your prompt and adjust sliders + * Built-in image enhancers and upscalers, including GFPGAN and realESRGAN + * Run additional upscaling models on CPU to save VRAM + * Textual inversion 🔥: [info](https://textual-inversion.github.io/) - requires enabling, see [here](https://github.com/hlky/sd-enable-textual-inversion), script works as usual without it enabled + * Advanced img2img editor with Mask and crop capabilities + * Mask painting 🖌️: Powerful tool for re-generating only specific parts of an image you want to change (currently Gradio only) + * More diffusion samplers 🔥🔥: A great collection of samplers to use, including: + - `k_euler` (Default) - `k_lms` - `k_euler_a` @@ -45,21 +55,33 @@ Check the [Contribution Guide](CONTRIBUTING.md) - `DDIM` * Loopback ➿: Automatically feed the last generated sample back into img2img + * Prompt Weighting 🏋️: Adjust the strength of different terms in your prompt + * Selectable GPU usage with `--gpu ` + * Memory Monitoring 🔥: Shows VRAM usage and generation time after outputting + * Word Seeds 🔥: Use words instead of seed numbers + * CFG: Classifier free guidance scale, a feature for fine-tuning your output + * Automatic Launcher: Activate conda and run Stable Diffusion with a single command + * Lighter on VRAM: 512x512 Text2Image & Image2Image tested working on 4GB + * Prompt validation: If your prompt is too long, you will get a warning in the text output field + * Copy-paste generation parameters: A text output provides generation parameters in an easy to copy-paste form for easy sharing. + * Correct seeds for batches: If you use a seed of 1000 to generate two batches of two images each, four generated images will have seeds: `1000, 1001, 1002, 1003`. + * Prompt matrix: Separate multiple prompts using the `|` character, and the system will produce an image for every combination of them. + * Loopback for Image2Image: A checkbox for img2img allowing to automatically feed output image as input for the next batch. 
Equivalent to saving output image, and replacing input image with it.
-
 # Stable Diffusion Web UI
+
 A fully-integrated and easy way to work with Stable Diffusion right from a browser window.
 
 ## Streamlit
 
@@ -67,6 +89,7 @@ A fully-integrated and easy way to work with Stable Diffusion right from a brows
 ![](images/streamlit/streamlit-t2i.png)
 
 **Features:**
+
 - Clean UI with an easy to use design, with support for widescreen displays.
 - Dynamic live preview of your generations
 - Easily customizable presets right from the WebUI (Coming Soon!)
@@ -80,12 +103,12 @@ A fully-integrated and easy way to work with Stable Diffusion right from a brows
 
 Please see the [Streamlit Documentation](docs/4.streamlit-interface.md) to learn more.
 
-
 ## Gradio
 
 ![](images/gradio/gradio-t2i.png)
 
 **Features:**
+
 - Older UI design that is fully functional and feature complete.
 - Has access to all upscaling models, including LDSR.
 - Dynamic prompt entry automatically changes your generation settings based on `--params` in a prompt.
@@ -94,7 +117,6 @@ Please see the [Streamlit Documentation](docs/4.streamlit-interface.md) to learn
 
 Please see the [Gradio Documentation](docs/5.gradio-interface.md) to learn more.
 
-
 ## Image Upscalers
 
 ---
@@ -106,7 +128,7 @@ Please see the [Gradio Documentation](docs/5.gradio-interface.md) to learn more.
 Lets you improve faces in pictures using the GFPGAN model. There is a checkbox in every tab to use GFPGAN at 100%, and also a separate tab that just allows you to use GFPGAN on any picture, with a slider that controls how strong the effect is. If you want to use GFPGAN to improve generated faces, you need to install it separately.
-Download [GFPGANv1.3.pth](https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth) and put it
+Download [GFPGANv1.4.pth](https://github.com/TencentARC/GFPGAN/releases/download/v1.3.4/GFPGANv1.4.pth) and put it
 into the `/stable-diffusion-webui/models/gfpgan` directory.
 
 ### RealESRGAN
@@ -119,18 +141,26 @@ There is also a separate tab for using RealESRGAN on any picture.
 Download [RealESRGAN_x4plus.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth) and [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth).
 Put them into the `stable-diffusion-webui/models/realesrgan` directory.
 
-### GoBig, LSDR, and GoLatent *(Currently Gradio Only)*
+
+
+### LDSR
+
+Download **LDSR** [project.yaml](https://heibox.uni-heidelberg.de/f/31a76b13ea27482981b4/?dl=1) and [model last.ckpt](https://heibox.uni-heidelberg.de/f/578df07c8fc04ffbadf3/?dl=1). Rename last.ckpt to model.ckpt and place both under `stable-diffusion-webui/models/ldsr/`.
+
+### GoBig and GoLatent *(Currently on the Gradio version only)*
 
 More powerful upscalers that use a separate Latent Diffusion model to more cleanly upscale images.
 
-Download **LDSR** [project.yaml](https://heibox.uni-heidelberg.de/f/31a76b13ea27482981b4/?dl=1) and [ model last.cpkt](https://heibox.uni-heidelberg.de/f/578df07c8fc04ffbadf3/?dl=1). Rename last.ckpt to model.ckpt and place both under stable-diffusion-webui/models/ldsr/
+
 Please see the [Image Enhancers Documentation](docs/5.image_enhancers.md) to learn more. 
----- ### *Original Information From The Stable Diffusion Repo* + # Stable Diffusion + *Stable Diffusion was made possible thanks to a collaboration with [Stability AI](https://stability.ai/) and [Runway](https://runwayml.com/) and builds upon our previous work:* [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://ommer-lab.com/research/latent-diffusion-models/)
@@ -144,7 +174,6 @@ Please see the [Image Enhancers Documentation](docs/5.image_enhancers.md) to lea which is available on [GitHub](https://github.com/CompVis/latent-diffusion). PDF at [arXiv](https://arxiv.org/abs/2112.10752). Please also visit our [Project page](https://ommer-lab.com/research/latent-diffusion-models/). - [Stable Diffusion](#stable-diffusion-v1) is a latent text-to-image diffusion model. Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database. @@ -164,15 +193,14 @@ then finetuned on 512x512 images. in its training data. Details on the training procedure and data, as well as the intended use of the model can be found in the corresponding [model card](https://huggingface.co/CompVis/stable-diffusion). -## Comments +## Comments - Our codebase for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion) -and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch). -Thanks for open-sourcing! + and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch). + Thanks for open-sourcing! - The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories). - ## BibTeX ``` @@ -186,5 +214,3 @@ Thanks for open-sourcing! } ``` - - From 849b0f92a9821ad733f67acc6361ee48bd906522 Mon Sep 17 00:00:00 2001 From: ZeroCool940711 Date: Thu, 6 Oct 2022 17:57:11 -0700 Subject: [PATCH 03/14] Removed trailing spaces on the config file. --- configs/webui/webui_streamlit.yaml | 105 ++++++++++++++--------------- 1 file changed, 52 insertions(+), 53 deletions(-) diff --git a/configs/webui/webui_streamlit.yaml b/configs/webui/webui_streamlit.yaml index a0870af..f826b4d 100644 --- a/configs/webui/webui_streamlit.yaml +++ b/configs/webui/webui_streamlit.yaml @@ -12,7 +12,7 @@ # GNU Affero General Public License for more details. # You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . +# along with this program. If not, see . # UI defaults configuration file. It is automatically loaded if located at configs/webui/webui_streamlit.yaml. # Any changes made here will be available automatically on the web app without having to stop it. 
@@ -24,7 +24,7 @@ general: huggingface_token: "" gpu: 0 outdir: outputs - default_model: "Stable Diffusion v1.4" + default_model: "Stable Diffusion v1.4" default_model_config: "configs/stable-diffusion/v1-inference.yaml" default_model_path: "models/ldm/stable-diffusion-v1/model.ckpt" use_sd_concepts_library: True @@ -69,13 +69,13 @@ txt2img: min_value: 64 max_value: 2048 step: 64 - + height: value: 512 min_value: 64 max_value: 2048 step: 64 - + cfg_scale: value: 7.5 min_value: 1.0 @@ -85,16 +85,16 @@ txt2img: seed: "" batch_count: value: 1 - + batch_size: value: 1 - + sampling_steps: value: 30 min_value: 10 max_value: 250 step: 10 - + LDSR_config: sampling_steps: 50 preDownScale: 1 @@ -115,16 +115,16 @@ txt2img: use_LDSR: False RealESRGAN_model: "RealESRGAN_x4plus" use_upscaling: False - + variant_amount: value: 0.0 min_value: 0.0 max_value: 1.0 step: 0.01 - + variant_seed: "" write_info_files: True - + txt2vid: default_model: "CompVis/stable-diffusion-v1-4" custom_models_list: ["CompVis/stable-diffusion-v1-4"] @@ -134,37 +134,37 @@ txt2vid: min_value: 64 max_value: 2048 step: 64 - + height: value: 512 min_value: 64 max_value: 2048 step: 64 - + cfg_scale: value: 7.5 min_value: 1.0 max_value: 30.0 step: 0.5 - + batch_count: value: 1 - + batch_size: value: 1 - + sampling_steps: value: 30 min_value: 10 max_value: 250 step: 10 - + num_inference_steps: value: 200 min_value: 10 max_value: 500 step: 10 - + seed: "" default_sampler: "k_euler" scheduler_name: "klms" @@ -188,36 +188,36 @@ txt2vid: min_value: 0.0 max_value: 1.0 step: 0.01 - + variant_seed: "" - + beta_start: value: 0.00085 min_value: 0.0001 max_value: 0.0300 step: 0.0001 format: "%.5f" - + beta_end: value: 0.012 min_value: 0.0001 max_value: 0.0300 step: 0.0001 format: "%.5f" - + beta_scheduler_type: "scaled_linear" max_frames: 100 - + LDSR_config: sampling_steps: 50 preDownScale: 1 postDownScale: 1 downsample_method: "Lanczos" - + img2img: - prompt: + prompt: sampler_name: "k_euler" - denoising_strength: + denoising_strength: value: 0.75 min_value: 0.0 max_value: 1.0 @@ -238,49 +238,49 @@ img2img: min_value: 64 max_value: 2048 step: 64 - + height: value: 512 min_value: 64 max_value: 2048 step: 64 - + cfg_scale: value: 7.5 min_value: 1.0 max_value: 30.0 step: 0.5 - + batch_count: value: 1 - + batch_size: value: 1 - + sampling_steps: value: 30 min_value: 10 max_value: 250 step: 10 - + num_inference_steps: value: 200 min_value: 10 max_value: 500 step: 10 - + find_noise_steps: value: 100 min_value: 0 max_value: 500 step: 10 - + LDSR_config: sampling_steps: 50 preDownScale: 1 postDownScale: 1 downsample_method: "Lanczos" - + loopback: True random_seed_loopback: True separate_prompts: False @@ -298,36 +298,36 @@ img2img: variant_amount: 0.0 variant_seed: "" write_info_files: True - + img2txt: batch_size: 420 blip_image_eval_size: 512 keep_all_models_loaded: False - + concepts_library: concepts_per_page: 12 - + gfpgan: strength: 100 textual_inversion: pretrained_model_name_or_path: "models/diffusers/stable-diffusion-v1-4" tokenizer_name: "models/clip-vit-large-patch14" - - + + daisi_app: running_on_daisi_io: False - -model_manager: + +model_manager: models: - stable_diffusion: + stable_diffusion: model_name: "Stable Diffusion v1.4" save_location: "./models/ldm/stable-diffusion-v1" files: model_ckpt: file_name: "model.ckpt" download_link: "https://www.googleapis.com/storage/v1/b/aai-blog-files/o/sd-v1-4.ckpt?alt=media" - + gfpgan: model_name: "GFPGAN" save_location: "./models/gfpgan" @@ -343,8 +343,8 @@ model_manager: file_name: 
"parsing_parsenet.pth" save_location: "./gfpgan/weights" download_link: "https://github.com/xinntao/facexlib/releases/download/v0.2.2/parsing_parsenet.pth" - - + + realesrgan: model_name: "RealESRGAN" save_location: "./models/realesrgan" @@ -355,8 +355,8 @@ model_manager: x4plus_anime_6b: file_name: "RealESRGAN_x4plus_anime_6B.pth" download_link: "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth" - - + + waifu_diffusion: model_name: "Waifu Diffusion v1.2" save_location: "./models/custom" @@ -365,7 +365,7 @@ model_manager: file_name: "waifu-diffusion.ckpt" download_link: "https://huggingface.co/crumb/pruned-waifu-diffusion/resolve/main/model-pruned.ckpt" - + trinart_stable_diffusion: model_name: "TrinArt Stable Diffusion v2" save_location: "./models/custom" @@ -373,7 +373,7 @@ model_manager: trinart: file_name: "trinart.ckpt" download_link: "https://huggingface.co/naclbit/trinart_stable_diffusion_v2/resolve/main/trinart2_step95000.ckpt" - + stable_diffusion_concept_library: model_name: "Stable Diffusion Concept Library" save_location: "./models/custom/sd-concepts-library/" @@ -381,7 +381,7 @@ model_manager: concept_library: file_name: "" download_link: "https://github.com/sd-webui/sd-concepts-library" - + blip_model: model_name: "Blip Model" save_location: "./models/blip" @@ -389,7 +389,7 @@ model_manager: blip: file_name: "model__base_caption.pth" download_link: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth" - + ldsr: model_name: "Latent Diffusion Super Resolution (LDSR)" save_location: "./models/ldsr" @@ -397,8 +397,7 @@ model_manager: project_yaml: file_name: "project.yaml" download_link: "https://heibox.uni-heidelberg.de/f/31a76b13ea27482981b4/?dl=1" - + ldsr_model: file_name: "model.ckpt" download_link: "https://heibox.uni-heidelberg.de/f/578df07c8fc04ffbadf3/?dl=1" - \ No newline at end of file From c8740fadb4fc01fccb2444c1870ee4fe36799f7e Mon Sep 17 00:00:00 2001 From: cstueckrath Date: Fri, 7 Oct 2022 08:12:28 +0200 Subject: [PATCH 04/14] fix type error --- scripts/Settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/Settings.py b/scripts/Settings.py index 4aa9f0c..8730f69 100644 --- a/scripts/Settings.py +++ b/scripts/Settings.py @@ -270,7 +270,7 @@ def layout(): value=st.session_state['defaults'].txt2img.sampling_steps.min_value, help="Set the default minimum value for the sampling steps slider. Default is: 1") - st.session_state["defaults"].txt2img.sampling_steps.step = st.text_input("Sampling Slider Steps", + st.session_state["defaults"].txt2img.sampling_steps.step = st.number_input("Sampling Slider Steps", value=st.session_state['defaults'].txt2img.sampling_steps.step, help="Set the default value for the number of steps on the sampling steps slider. 
Default is: 10") @@ -827,4 +827,4 @@ def layout(): toml.dump(st.session_state["streamlit_config"], toml_file) if reset_button: - st.session_state["defaults"] = OmegaConf.load("configs/webui/webui_streamlit.yaml") \ No newline at end of file + st.session_state["defaults"] = OmegaConf.load("configs/webui/webui_streamlit.yaml") From 0a54ea679121472073c3c1384dc204f5045ec04e Mon Sep 17 00:00:00 2001 From: ZeroCool940711 Date: Thu, 6 Oct 2022 23:32:54 -0700 Subject: [PATCH 05/14] Changed yaml.load() for yaml.safe_load() in daisi_app.py --- daisi_app.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/daisi_app.py b/daisi_app.py index 81293fd..5f66521 100644 --- a/daisi_app.py +++ b/daisi_app.py @@ -5,14 +5,14 @@ print (os.getcwd) try: with open("environment.yaml") as file_handle: - environment_data = yaml.load(file_handle, Loader=yaml.FullLoader) + environment_data = yaml.safe_load(file_handle, Loader=yaml.FullLoader) except FileNotFoundError: try: with open(os.path.join("..", "environment.yaml")) as file_handle: - environment_data = yaml.load(file_handle, Loader=yaml.FullLoader) + environment_data = yaml.safe_load(file_handle, Loader=yaml.FullLoader) except: pass - + try: for dependency in environment_data["dependencies"]: package_name, package_version = dependency.split("=") @@ -21,6 +21,6 @@ except: pass try: - subprocess.run(['python', '-m', 'streamlit', "run" ,os.path.join("..","scripts/webui_streamlit.py"), "--theme.base dark"], stdout=subprocess.DEVNULL) + subprocess.run(['python', '-m', 'streamlit', "run" ,os.path.join("..","scripts/webui_streamlit.py"), "--theme.base dark"], stdout=subprocess.DEVNULL) except FileExistsError: - subprocess.run(['python', '-m', 'streamlit', "run" ,"scripts/webui_streamlit.py", "--theme.base dark"], stdout=subprocess.DEVNULL) \ No newline at end of file + subprocess.run(['python', '-m', 'streamlit', "run" ,"scripts/webui_streamlit.py", "--theme.base dark"], stdout=subprocess.DEVNULL) \ No newline at end of file From e95385e27e546ab1dbed18ca2afd9394ceda3600 Mon Sep 17 00:00:00 2001 From: ZeroCool940711 Date: Thu, 6 Oct 2022 23:45:55 -0700 Subject: [PATCH 06/14] Fixed use of multiple spaces after some operators. --- scripts/sd_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/sd_utils.py b/scripts/sd_utils.py index 2b12425..6460006 100644 --- a/scripts/sd_utils.py +++ b/scripts/sd_utils.py @@ -588,7 +588,7 @@ def get_matched_noise(_np_src_image, np_mask_rgb, noise_q, color_variation): noise_window = _get_gaussian_window(width, height, mode=1) # start with simple gaussian noise noise_rgb = np.random.random_sample((width, height, num_channels)) noise_grey = (np.sum(noise_rgb, axis=2)/3.) - noise_rgb *= color_variation # the colorfulness of the starting noise is blended to greyscale with a parameter + noise_rgb *= color_variation # the colorfulness of the starting noise is blended to greyscale with a parameter for c in range(num_channels): noise_rgb[:,:,c] += (1. - color_variation) * noise_grey @@ -2471,7 +2471,7 @@ def process_images( else: grid = image_grid(output_images, batch_size) - if grid and (batch_size > 1 or n_iter > 1): + if grid and (batch_size > 1 or n_iter > 1): output_images.insert(0, grid) grid_count = get_next_sequence_number(outpath, 'grid-') From d9b49144066d9b6d8fe8b4213f830c5c6d451749 Mon Sep 17 00:00:00 2001 From: ZeroCool940711 Date: Fri, 7 Oct 2022 12:40:06 -0700 Subject: [PATCH 07/14] Moved the pip dependencies from the environment.yaml to the requirements.txt file. 
--- environment.yaml | 50 +------------------------------------------- requirements.txt | 54 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 67 deletions(-) diff --git a/environment.yaml b/environment.yaml index b966e4a..f629219 100644 --- a/environment.yaml +++ b/environment.yaml @@ -29,53 +29,5 @@ dependencies: - scikit-image=0.19.2 - torchvision=0.12.0 - pip: - - -e . - - -e git+https://github.com/CompVis/taming-transformers#egg=taming-transformers - - -e git+https://github.com/openai/CLIP#egg=clip - - -e git+https://github.com/hlky/k-diffusion-sd#egg=k_diffusion - - -e git+https://github.com/devilismyfriend/latent-diffusion#egg=latent-diffusion - - accelerate==0.12.0 - - albumentations==0.4.3 - - basicsr>=1.3.4.0 - - diffusers==0.3.0 - - einops==0.3.1 - - facexlib>=0.2.3 - - ftfy==6.1.1 - - fairscale==0.4.4 - - gradio==3.1.6 - - gfpgan==1.3.8 - - hydralit_components==1.0.10 - - hydralit==1.0.14 - - imageio-ffmpeg==0.4.2 - - imageio==2.9.0 - - kornia==0.6 - - loguru - - omegaconf==2.1.1 - - opencv-python-headless==4.6.0.66 - - open-clip-torch==2.0.2 - - pandas==1.4.3 - - piexif==1.1.3 - - pudb==2019.2 - - pynvml==11.4.1 - - python-slugify>=6.1.2 - - pytorch-lightning==1.4.2 - - retry>=0.9.2 - - regex - - realesrgan==0.3.0 - - streamlit==1.13.0 - - streamlit-on-Hover-tabs==1.0.1 - - streamlit-option-menu==0.3.2 - - streamlit_nested_layout - - streamlit-server-state==0.14.2 - - streamlit-tensorboard==0.0.2 - - test-tube>=0.7.5 - - tensorboard==2.10.1 - - timm==0.6.7 - - torch-fidelity==0.3.0 - - torchmetrics==0.6.0 - - transformers==4.19.2 - - tensorflow==2.10.0 - - tqdm==4.64.0 - - stqdm==0.0.4 - - wget + - -r requirements.txt diff --git a/requirements.txt b/requirements.txt index c4dc50e..d8708a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,31 +1,28 @@ -transformers==4.19.2 # do not change -diffusers==0.3.0 -invisible-watermark==0.1.5 -pytorch_lightning==1.7.7 -open-clip-torch -loguru -taming-transformers-rom1504==0.0.6 # required by ldm -wget +-e . 
+ # See: https://github.com/CompVis/taming-transformers/issues/176 # -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers # required by ldm # Note: taming package needs to be installed with -e option +-e git+https://github.com/CompVis/taming-transformers#egg=taming-transformers +invisible-watermark==0.1.5 +taming-transformers-rom1504==0.0.6 # required by ldm +# Note: K-diffusion brings in CLIP 1.0 as a dependency automatically; will create a dependency resolution conflict when explicitly specified together +git+https://github.com/openai/CLIP.git@main#egg=clip git+https://github.com/crowsonkb/k-diffusion.git -# Note: K-diffusion brings in CLIP 1.0 as a dependency automatically; will create a dependency resolution conflict when explicitly specified together -# git+https://github.com/openai/CLIP.git@main#egg=clip +# git+https://github.com/hlky/k-diffusion-sd#egg=k_diffusion # Dependencies required for Stable Diffusion UI pynvml==11.4.1 omegaconf==2.2.3 -Jinja2==3.1.2 # Jinja2 is required by Gradio # Note: Jinja2 3.x major version required due to breaking changes found in markupsafe==2.1.1; 2.0.1 is incompatible with other upstream dependencies # see https://github.com/pallets/markupsafe/issues/304 - +Jinja2==3.1.2 # Jinja2 is required by Gradio # Environment Dependencies for WebUI (gradio) -gradio==3.4 +gradio==3.1.6 # Environment Dependencies for WebUI (streamlit) streamlit==1.13.0 @@ -36,6 +33,7 @@ streamlit-server-state==0.14.2 streamlit-tensorboard==0.0.2 hydralit==1.0.14 hydralit_components==1.0.10 +stqdm==0.0.4 # Img2text ftfy==6.1.1 @@ -47,9 +45,30 @@ tensorboard==2.10.1 # Other -retry==0.9.2 # used by sdutils -python-slugify==6.1.2 # used by sdutils -piexif==1.1.3 # used by sdutils +retry==0.9.2 # used by sd_utils +python-slugify==6.1.2 # used by sd_utils +piexif==1.1.3 # used by sd_utils + +accelerate==0.12.0 +albumentations==0.4.3 +diffusers==0.3.0 +einops==0.3.1 +facexlib>=0.2.3 +imageio-ffmpeg==0.4.2 +imageio==2.9.0 +kornia==0.6 +loguru +opencv-python-headless==4.6.0.66 +open-clip-torch==2.0.2 +pandas==1.4.3 +pudb==2019.2 +pytorch-lightning==1.7.7 +realesrgan==0.3.0 +test-tube>=0.7.5 +timm==0.6.7 +torch-fidelity==0.3.0 +transformers==4.19.2 # do not change +wget # Optional packages commonly used with Stable Diffusion workflow @@ -57,11 +76,10 @@ piexif==1.1.3 # used by sdutils basicsr==1.4.2 # required by RealESRGAN gfpgan==1.3.8 # GFPGAN realesrgan==0.3.0 # RealESRGAN brings in GFPGAN as a requirement --e git+https://github.com/devilismyfriend/latent-diffusion#egg=latent-diffusion #ldsr +git+https://github.com/CompVis/latent-diffusion ## for monocular depth estimation tensorflow==2.10.0 - # Orphaned Packages: No usage found From 45976077c0d724b7459fa8d563a5ee208081b2b5 Mon Sep 17 00:00:00 2001 From: ZeroCool940711 Date: Fri, 7 Oct 2022 16:59:29 -0700 Subject: [PATCH 08/14] Changed the image preview frequency field to only allow positive values. --- scripts/Settings.py | 18 +++++++++++------- scripts/img2img.py | 8 +++++--- scripts/txt2img.py | 8 +++++--- scripts/txt2vid.py | 10 ++++++---- 4 files changed, 27 insertions(+), 17 deletions(-) diff --git a/scripts/Settings.py b/scripts/Settings.py index 8730f69..eab7a43 100644 --- a/scripts/Settings.py +++ b/scripts/Settings.py @@ -157,9 +157,11 @@ def layout(): #in steps will be shown, this is helpful to reduce the negative effect this option has on performance. 
\ #Default: True") st.session_state["defaults"].general.update_preview = True - st.session_state["defaults"].general.update_preview_frequency = st.number_input("Update Preview Frequency", value=st.session_state['defaults'].general.update_preview_frequency, - help="Specify the frequency at which the image is updated in steps, this is helpful to reduce the \ - negative effect updating the preview image has on performance. Default: 10") + st.session_state["defaults"].general.update_preview_frequency = st.number_input("Update Preview Frequency", + min_value=1, + value=st.session_state['defaults'].general.update_preview_frequency, + help="Specify the frequency at which the image is updated in steps, this is helpful to reduce the \ + negative effect updating the preview image has on performance. Default: 10") with col3: st.title("Others") @@ -326,8 +328,9 @@ def layout(): st.session_state["defaults"].txt2img.update_preview = True st.session_state["defaults"].txt2img.update_preview_frequency = st.number_input("Preview Image Update Frequency", - value=st.session_state['defaults'].txt2img.update_preview_frequency, - help="Set the default value for the frrquency of the preview image updates. Default is: 10") + min_value=1, + value=st.session_state['defaults'].txt2img.update_preview_frequency, + help="Set the default value for the frrquency of the preview image updates. Default is: 10") with col5: st.title("Variation Parameters") @@ -526,8 +529,9 @@ def layout(): st.session_state["defaults"].img2img.update_preview = True st.session_state["defaults"].img2img.update_preview_frequency = st.number_input("Img2Img Preview Image Update Frequency", - value=st.session_state['defaults'].img2img.update_preview_frequency, - help="Set the default value for the frrquency of the preview image updates. Default is: 10") + min_value=1, + value=st.session_state['defaults'].img2img.update_preview_frequency, + help="Set the default value for the frrquency of the preview image updates. Default is: 10") st.title("Variation Parameters") diff --git a/scripts/img2img.py b/scripts/img2img.py index 6241267..4eca0e0 100644 --- a/scripts/img2img.py +++ b/scripts/img2img.py @@ -446,9 +446,11 @@ def layout(): with st.expander("Preview Settings"): st.session_state["update_preview"] = st.session_state["defaults"].general.update_preview - st.session_state["update_preview_frequency"] = st.text_input("Update Image Preview Frequency", value=st.session_state['defaults'].img2img.update_preview_frequency, - help="Frequency in steps at which the the preview image is updated. By default the frequency \ - is set to 1 step.") + st.session_state["update_preview_frequency"] = st.number_input("Update Image Preview Frequency", + min_value=1, + value=st.session_state['defaults'].img2img.update_preview_frequency, + help="Frequency in steps at which the the preview image is updated. By default the frequency \ + is set to 1 step.") # with st.expander("Advanced"): with st.expander("Output Settings"): diff --git a/scripts/txt2img.py b/scripts/txt2img.py index 283245d..105788a 100644 --- a/scripts/txt2img.py +++ b/scripts/txt2img.py @@ -222,9 +222,11 @@ def layout(): with st.expander("Preview Settings"): st.session_state["update_preview"] = st.session_state["defaults"].general.update_preview - st.session_state["update_preview_frequency"] = st.text_input("Update Image Preview Frequency", value=st.session_state['defaults'].txt2img.update_preview_frequency, - help="Frequency in steps at which the the preview image is updated. 
By default the frequency \ - is set to 10 step.") + st.session_state["update_preview_frequency"] = st.number_input("Update Image Preview Frequency", + min_value=1, + value=st.session_state['defaults'].txt2img.update_preview_frequency, + help="Frequency in steps at which the the preview image is updated. By default the frequency \ + is set to 10 step.") with col2: preview_tab, gallery_tab = st.tabs(["Preview", "Gallery"]) diff --git a/scripts/txt2vid.py b/scripts/txt2vid.py index 773b87e..ec38208 100644 --- a/scripts/txt2vid.py +++ b/scripts/txt2vid.py @@ -155,7 +155,7 @@ def diffuse( st.session_state["previous_chunk_speed_list"], st.session_state['defaults'].txt2vid.update_preview_frequency, st.session_state["update_preview_frequency_list"]) - + #scale and decode the image latents with vae cond_latents_2 = 1 / 0.18215 * cond_latents image = pipe.vae.decode(cond_latents_2) @@ -613,9 +613,11 @@ def layout(): #By default this is enabled and the frequency is set to 1 step.") st.session_state["update_preview"] = st.session_state["defaults"].general.update_preview - st.session_state["update_preview_frequency"] = st.text_input("Update Image Preview Frequency", value=st.session_state['defaults'].txt2vid.update_preview_frequency, - help="Frequency in steps at which the the preview image is updated. By default the frequency \ - is set to 1 step.") + st.session_state["update_preview_frequency"] = st.number_input("Update Image Preview Frequency", + min_value=1, + value=st.session_state['defaults'].txt2vid.update_preview_frequency, + help="Frequency in steps at which the the preview image is updated. By default the frequency \ + is set to 1 step.") # From 55b39b53bce83ad0afccd6e22dd578e3a55992e1 Mon Sep 17 00:00:00 2001 From: ZeroCool940711 Date: Fri, 7 Oct 2022 17:57:06 -0700 Subject: [PATCH 09/14] Changed some more text_input to number_input and made sure they return the right value type. --- scripts/Settings.py | 4 ++-- scripts/img2img.py | 20 ++++++++++---------- scripts/txt2img.py | 20 ++++++++++---------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/scripts/Settings.py b/scripts/Settings.py index eab7a43..eab0f57 100644 --- a/scripts/Settings.py +++ b/scripts/Settings.py @@ -754,7 +754,7 @@ def layout(): st.session_state["defaults"].txt2vid.beta_start.step = st.number_input("txt2vid Beta Start Slider Steps", value=st.session_state['defaults'].txt2vid.beta_start.step, help="Set the default value for the number of steps on the variation slider. Default is: 1") - st.session_state["defaults"].txt2vid.beta_start.format = st.text_input("Default txt2vid Beta Start Format", value=st.session_state['defaults'].txt2vid.beta_start.format, + st.session_state["defaults"].txt2vid.beta_start.format = st.number_input("Default txt2vid Beta Start Format", value=st.session_state['defaults'].txt2vid.beta_start.format, help="Set the default Beta Start Format. Default is: %.5\f") # Beta End @@ -770,7 +770,7 @@ def layout(): st.session_state["defaults"].txt2vid.beta_end.step = st.number_input("txt2vid Beta End Slider Steps", value=st.session_state['defaults'].txt2vid.beta_end.step, help="Set the default value for the number of steps on the variation slider. 
Default is: 1") - st.session_state["defaults"].txt2vid.beta_end.format = st.text_input("Default txt2vid Beta End Format", value=st.session_state['defaults'].txt2vid.beta_start.format, + st.session_state["defaults"].txt2vid.beta_end.format = st.number_input("Default txt2vid Beta End Format", value=st.session_state['defaults'].txt2vid.beta_start.format, help="Set the default Beta Start Format. Default is: %.5\f") with image_processing: diff --git a/scripts/img2img.py b/scripts/img2img.py index 4eca0e0..75c801c 100644 --- a/scripts/img2img.py +++ b/scripts/img2img.py @@ -436,13 +436,13 @@ def layout(): step=st.session_state['defaults'].img2img.find_noise_steps.step) with st.expander("Batch Options"): - st.session_state["batch_count"] = int(st.text_input("Batch count.", value=st.session_state['defaults'].img2img.batch_count.value, - help="How many iterations or batches of images to generate in total.")) + st.session_state["batch_count"] = st.number_input("Batch count.", value=st.session_state['defaults'].img2img.batch_count.value, + help="How many iterations or batches of images to generate in total.") - st.session_state["batch_size"] = int(st.text_input("Batch size", value=st.session_state.defaults.img2img.batch_size.value, + st.session_state["batch_size"] = st.number_input("Batch size", value=st.session_state.defaults.img2img.batch_size.value, help="How many images are at once in a batch.\ It increases the VRAM usage a lot but if you have enough VRAM it can reduce the time it takes to finish generation as more images are generated at once.\ - Default: 1")) + Default: 1") with st.expander("Preview Settings"): st.session_state["update_preview"] = st.session_state["defaults"].general.update_preview @@ -546,14 +546,14 @@ def layout(): st.session_state["LDSR_model"] = st.selectbox("LDSR model", st.session_state["LDSR_models"], index=st.session_state["LDSR_models"].index(st.session_state['defaults'].general.LDSR_model)) - st.session_state["ldsr_sampling_steps"] = int(st.text_input("Sampling Steps", value=st.session_state['defaults'].img2img.LDSR_config.sampling_steps, - help="")) + st.session_state["ldsr_sampling_steps"] = st.number_input("Sampling Steps", value=st.session_state['defaults'].img2img.LDSR_config.sampling_steps, + help="") - st.session_state["preDownScale"] = int(st.text_input("PreDownScale", value=st.session_state['defaults'].img2img.LDSR_config.preDownScale, - help="")) + st.session_state["preDownScale"] = st.number_input("PreDownScale", value=st.session_state['defaults'].img2img.LDSR_config.preDownScale, + help="") - st.session_state["postDownScale"] = int(st.text_input("postDownScale", value=st.session_state['defaults'].img2img.LDSR_config.postDownScale, - help="")) + st.session_state["postDownScale"] = st.number_input("postDownScale", value=st.session_state['defaults'].img2img.LDSR_config.postDownScale, + help="") downsample_method_list = ['Nearest', 'Lanczos'] st.session_state["downsample_method"] = st.selectbox("Downsample Method", downsample_method_list, diff --git a/scripts/txt2img.py b/scripts/txt2img.py index 105788a..79a2cf1 100644 --- a/scripts/txt2img.py +++ b/scripts/txt2img.py @@ -210,14 +210,14 @@ def layout(): #It increases the VRAM usage a lot but if you have enough VRAM it can reduce the time it takes to finish generation as more images are generated at once.\ #Default: 1") - st.session_state["batch_count"] = int(st.text_input("Batch count.", value=st.session_state['defaults'].txt2img.batch_count.value, - help="How many iterations or batches of images to generate 
in total.")) + st.session_state["batch_count"] = st.number_input("Batch count.", value=st.session_state['defaults'].txt2img.batch_count.value, + help="How many iterations or batches of images to generate in total.") - st.session_state["batch_size"] = int(st.text_input("Batch size", value=st.session_state.defaults.txt2img.batch_size.value, + st.session_state["batch_size"] = st.number_input("Batch size", value=st.session_state.defaults.txt2img.batch_size.value, help="How many images are at once in a batch.\ It increases the VRAM usage a lot but if you have enough VRAM it can reduce the time it takes \ to finish generation as more images are generated at once.\ - Default: 1") ) + Default: 1") with st.expander("Preview Settings"): @@ -368,14 +368,14 @@ def layout(): st.session_state["LDSR_model"] = st.selectbox("LDSR model", st.session_state["LDSR_models"], index=st.session_state["LDSR_models"].index(st.session_state['defaults'].general.LDSR_model)) - st.session_state["ldsr_sampling_steps"] = int(st.text_input("Sampling Steps", value=st.session_state['defaults'].txt2img.LDSR_config.sampling_steps, - help="")) + st.session_state["ldsr_sampling_steps"] = st.number_input("Sampling Steps", value=st.session_state['defaults'].txt2img.LDSR_config.sampling_steps, + help="") - st.session_state["preDownScale"] = int(st.text_input("PreDownScale", value=st.session_state['defaults'].txt2img.LDSR_config.preDownScale, - help="")) + st.session_state["preDownScale"] = st.number_input("PreDownScale", value=st.session_state['defaults'].txt2img.LDSR_config.preDownScale, + help="") - st.session_state["postDownScale"] = int(st.text_input("postDownScale", value=st.session_state['defaults'].txt2img.LDSR_config.postDownScale, - help="")) + st.session_state["postDownScale"] = st.number_input("postDownScale", value=st.session_state['defaults'].txt2img.LDSR_config.postDownScale, + help="") downsample_method_list = ['Nearest', 'Lanczos'] st.session_state["downsample_method"] = st.selectbox("Downsample Method", downsample_method_list, From ac2d348f04245ee654e08d0c3ee218ee665a36a0 Mon Sep 17 00:00:00 2001 From: ZeroCool940711 Date: Fri, 7 Oct 2022 20:32:02 -0700 Subject: [PATCH 10/14] Fixed "expected scalar type Half but found Float" on txt2vid. --- scripts/txt2vid.py | 97 +++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 48 deletions(-) diff --git a/scripts/txt2vid.py b/scripts/txt2vid.py index ec38208..569e007 100644 --- a/scripts/txt2vid.py +++ b/scripts/txt2vid.py @@ -155,7 +155,7 @@ def diffuse( st.session_state["previous_chunk_speed_list"], st.session_state['defaults'].txt2vid.update_preview_frequency, st.session_state["update_preview_frequency_list"]) - + #scale and decode the image latents with vae cond_latents_2 = 1 / 0.18215 * cond_latents image = pipe.vae.decode(cond_latents_2) @@ -266,31 +266,31 @@ def load_diffusers_model(weights_path,torch_device): # def txt2vid( # -------------------------------------- - # args you probably want to change + # args you probably want to change prompts = ["blueberry spaghetti", "strawberry spaghetti"], # prompt to dream about gpu:int = st.session_state['defaults'].general.gpu, # id of the gpu to run on #name:str = 'test', # name of this project, for the output directory #rootdir:str = st.session_state['defaults'].general.outdir, num_steps:int = 200, # number of steps between each pair of sampled points - max_frames:int = 10000, # number of frames to write and then exit the script - num_inference_steps:int = 50, # more (e.g. 
100, 200 etc) can create slightly better images - cfg_scale:float = 5.0, # can depend on the prompt. usually somewhere between 3-10 is good - do_loop = False, - use_lerp_for_text = False, - seeds = None, - quality:int = 100, # for jpeg compression of the output images - eta:float = 0.0, - width:int = 256, - height:int = 256, - weights_path = "CompVis/stable-diffusion-v1-4", - scheduler="klms", # choices: default, ddim, klms - disable_tqdm = False, - #----------------------------------------------- - beta_start = 0.0001, - beta_end = 0.00012, - beta_schedule = "scaled_linear", - starting_image=None - ): + max_frames:int = 10000, # number of frames to write and then exit the script + num_inference_steps:int = 50, # more (e.g. 100, 200 etc) can create slightly better images + cfg_scale:float = 5.0, # can depend on the prompt. usually somewhere between 3-10 is good + do_loop = False, + use_lerp_for_text = False, + seeds = None, + quality:int = 100, # for jpeg compression of the output images + eta:float = 0.0, + width:int = 256, + height:int = 256, + weights_path = "CompVis/stable-diffusion-v1-4", + scheduler="klms", # choices: default, ddim, klms + disable_tqdm = False, + #----------------------------------------------- + beta_start = 0.0001, + beta_end = 0.00012, + beta_schedule = "scaled_linear", + starting_image=None + ): """ prompt = ["blueberry spaghetti", "strawberry spaghetti"], # prompt to dream about gpu:int = st.session_state['defaults'].general.gpu, # id of the gpu to run on @@ -344,29 +344,29 @@ def txt2vid( if st.session_state.write_info_files: with open(os.path.join(full_path , f'{slugify(str(seeds))}_config.json' if len(prompts) > 1 else "prompts_config.json"), "w") as outfile: outfile.write(json.dumps( - dict( - prompts = prompts, - gpu = gpu, - num_steps = num_steps, - max_frames = max_frames, - num_inference_steps = num_inference_steps, - cfg_scale = cfg_scale, - do_loop = do_loop, - use_lerp_for_text = use_lerp_for_text, - seeds = seeds, - quality = quality, - eta = eta, - width = width, - height = height, - weights_path = weights_path, - scheduler=scheduler, - disable_tqdm = disable_tqdm, - beta_start = beta_start, - beta_end = beta_end, - beta_schedule = beta_schedule - ), - indent=2, - sort_keys=False, + dict( + prompts = prompts, + gpu = gpu, + num_steps = num_steps, + max_frames = max_frames, + num_inference_steps = num_inference_steps, + cfg_scale = cfg_scale, + do_loop = do_loop, + use_lerp_for_text = use_lerp_for_text, + seeds = seeds, + quality = quality, + eta = eta, + width = width, + height = height, + weights_path = weights_path, + scheduler=scheduler, + disable_tqdm = disable_tqdm, + beta_start = beta_start, + beta_end = beta_end, + beta_schedule = beta_schedule + ), + indent=2, + sort_keys=False, )) #print(scheduler) @@ -413,9 +413,10 @@ def txt2vid( #prompts.append(prompts) #seeds.append(first_seed) - # get the conditional text embeddings based on the prompt - text_input = server_state["pipe"].tokenizer(prompts, padding="max_length", max_length=server_state["pipe"].tokenizer.model_max_length, truncation=True, return_tensors="pt") - cond_embeddings = server_state["pipe"].text_encoder(text_input.input_ids.to(torch_device))[0] # shape [1, 77, 768] + with torch.autocast('cuda'): + # get the conditional text embeddings based on the prompt + text_input = server_state["pipe"].tokenizer(prompts, padding="max_length", max_length=server_state["pipe"].tokenizer.model_max_length, truncation=True, return_tensors="pt") + cond_embeddings = 
server_state["pipe"].text_encoder(text_input.input_ids.to(torch_device) )[0] # if st.session_state.defaults.general.use_sd_concepts_library: @@ -604,7 +605,7 @@ def layout(): #It increases the VRAM usage a lot but if you have enough VRAM it can reduce the time it takes to finish generation as more images are generated at once.\ #Default: 1") - st.session_state["max_frames"] = int(st.text_input("Max Frames:", value=st.session_state['defaults'].txt2vid.max_frames, help="Specify the max number of frames you want to generate.")) + st.session_state["max_frames"] = st.number_input("Max Frames:", value=st.session_state['defaults'].txt2vid.max_frames, help="Specify the max number of frames you want to generate.") with st.expander("Preview Settings"): #st.session_state["update_preview"] = st.checkbox("Update Image Preview", value=st.session_state['defaults'].txt2vid.update_preview, From 97bbf089ea8b0de733820062eecc72aa53b11fc6 Mon Sep 17 00:00:00 2001 From: ZeroCool940711 Date: Fri, 7 Oct 2022 23:43:18 -0700 Subject: [PATCH 11/14] Added save_video_on_stop option to config, this option is used to save the txt2vid generated video to disk when we hit the stop button mid generation. --- configs/webui/webui_streamlit.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/webui/webui_streamlit.yaml b/configs/webui/webui_streamlit.yaml index f826b4d..92dc639 100644 --- a/configs/webui/webui_streamlit.yaml +++ b/configs/webui/webui_streamlit.yaml @@ -175,6 +175,7 @@ txt2vid: normalize_prompt_weights: True save_individual_images: True save_video: True + save_video_on_stop: False group_by_prompt: True write_info_files: True do_loop: False From 63b2ff22c6664e01d38a551dcd4e6fcce344f026 Mon Sep 17 00:00:00 2001 From: ZeroCool940711 Date: Fri, 7 Oct 2022 23:46:23 -0700 Subject: [PATCH 12/14] Added the ability to save the video during mid generation when we hit the stop button. - Fixed GFPGAN not working on txt2vid. --- scripts/txt2vid.py | 221 ++++++++++++++++++++++++++------------------- 1 file changed, 129 insertions(+), 92 deletions(-) diff --git a/scripts/txt2vid.py b/scripts/txt2vid.py index 569e007..2428dd0 100644 --- a/scripts/txt2vid.py +++ b/scripts/txt2vid.py @@ -113,88 +113,93 @@ def diffuse( if "update_preview_frequency_list" not in st.session_state: st.session_state["update_preview_frequency_list"] = [0] - st.session_state["update_preview_frequency_list"].append(st.session_state['defaults'].txt2vid.update_preview_frequency) + st.session_state["update_preview_frequency_list"].append(st.session_state["update_preview_frequency"]) - # diffuse! - for i, t in enumerate(pipe.scheduler.timesteps): - start = timeit.default_timer() + try: + # diffuse! 
+ for i, t in enumerate(pipe.scheduler.timesteps): + start = timeit.default_timer() - #status_text.text(f"Running step: {step_counter}{total_number_steps} {percent} | {duration:.2f}{speed}") + #status_text.text(f"Running step: {step_counter}{total_number_steps} {percent} | {duration:.2f}{speed}") - # expand the latents for classifier free guidance - latent_model_input = torch.cat([cond_latents] * 2) - if isinstance(pipe.scheduler, LMSDiscreteScheduler): - sigma = pipe.scheduler.sigmas[i] - latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5) + # expand the latents for classifier free guidance + latent_model_input = torch.cat([cond_latents] * 2) + if isinstance(pipe.scheduler, LMSDiscreteScheduler): + sigma = pipe.scheduler.sigmas[i] + latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5) - # predict the noise residual - noise_pred = pipe.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"] + # predict the noise residual + noise_pred = pipe.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"] - # cfg - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + cfg_scale * (noise_pred_text - noise_pred_uncond) + # cfg + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + cfg_scale * (noise_pred_text - noise_pred_uncond) - # compute the previous noisy sample x_t -> x_t-1 - if isinstance(pipe.scheduler, LMSDiscreteScheduler): - cond_latents = pipe.scheduler.step(noise_pred, i, cond_latents, **extra_step_kwargs)["prev_sample"] - else: - cond_latents = pipe.scheduler.step(noise_pred, t, cond_latents, **extra_step_kwargs)["prev_sample"] + # compute the previous noisy sample x_t -> x_t-1 + if isinstance(pipe.scheduler, LMSDiscreteScheduler): + cond_latents = pipe.scheduler.step(noise_pred, i, cond_latents, **extra_step_kwargs)["prev_sample"] + else: + cond_latents = pipe.scheduler.step(noise_pred, t, cond_latents, **extra_step_kwargs)["prev_sample"] - #print (st.session_state["update_preview_frequency"]) - #update the preview image if it is enabled and the frequency matches the step_counter - if st.session_state['defaults'].txt2vid.update_preview: - step_counter += 1 - if st.session_state['defaults'].txt2vid.update_preview_frequency == step_counter or step_counter == st.session_state.sampling_steps: - if st.session_state.dynamic_preview_frequency: - st.session_state["current_chunk_speed"], - st.session_state["previous_chunk_speed_list"], - st.session_state['defaults'].txt2vid.update_preview_frequency, - st.session_state["avg_update_preview_frequency"] = optimize_update_preview_frequency(st.session_state["current_chunk_speed"], - st.session_state["previous_chunk_speed_list"], - st.session_state['defaults'].txt2vid.update_preview_frequency, - st.session_state["update_preview_frequency_list"]) + #update the preview image if it is enabled and the frequency matches the step_counter + if st.session_state["update_preview"]: + step_counter += 1 - #scale and decode the image latents with vae - cond_latents_2 = 1 / 0.18215 * cond_latents - image = pipe.vae.decode(cond_latents_2) + if st.session_state["update_preview_frequency"] == step_counter or step_counter == st.session_state.sampling_steps: + if st.session_state.dynamic_preview_frequency: + st.session_state["current_chunk_speed"], + st.session_state["previous_chunk_speed_list"], + st.session_state["update_preview_frequency"], + st.session_state["avg_update_preview_frequency"] = 
optimize_update_preview_frequency(st.session_state["current_chunk_speed"], + st.session_state["previous_chunk_speed_list"], + st.session_state["update_preview_frequency"], + st.session_state["update_preview_frequency_list"]) - # generate output numpy image as uint8 - image = torch.clamp((image["sample"] + 1.0) / 2.0, min=0.0, max=1.0) - image2 = transforms.ToPILImage()(image.squeeze_(0)) + #scale and decode the image latents with vae + cond_latents_2 = 1 / 0.18215 * cond_latents + image = pipe.vae.decode(cond_latents_2) - st.session_state["preview_image"].image(image2) + # generate output numpy image as uint8 + image = torch.clamp((image["sample"] + 1.0) / 2.0, min=0.0, max=1.0) + image2 = transforms.ToPILImage()(image.squeeze_(0)) - step_counter = 0 + st.session_state["preview_image"].image(image2) - duration = timeit.default_timer() - start + step_counter = 0 - st.session_state["current_chunk_speed"] = duration + duration = timeit.default_timer() - start - if duration >= 1: - speed = "s/it" - else: - speed = "it/s" - duration = 1 / duration + st.session_state["current_chunk_speed"] = duration - if i > st.session_state.sampling_steps: - inference_counter += 1 - inference_percent = int(100 * float(inference_counter + 1 if inference_counter < num_inference_steps else num_inference_steps)/float(num_inference_steps)) - inference_progress = f"{inference_counter + 1 if inference_counter < num_inference_steps else num_inference_steps}/{num_inference_steps} {inference_percent}% " - else: - inference_progress = "" + if duration >= 1: + speed = "s/it" + else: + speed = "it/s" + duration = 1 / duration - percent = int(100 * float(i+1 if i+1 < st.session_state.sampling_steps else st.session_state.sampling_steps)/float(st.session_state.sampling_steps)) - frames_percent = int(100 * float(st.session_state.current_frame if st.session_state.current_frame < st.session_state.max_frames else st.session_state.max_frames)/float(st.session_state.max_frames)) + if i > st.session_state.sampling_steps: + inference_counter += 1 + inference_percent = int(100 * float(inference_counter + 1 if inference_counter < num_inference_steps else num_inference_steps)/float(num_inference_steps)) + inference_progress = f"{inference_counter + 1 if inference_counter < num_inference_steps else num_inference_steps}/{num_inference_steps} {inference_percent}% " + else: + inference_progress = "" - st.session_state["progress_bar_text"].text( - f"Running step: {i+1 if i+1 < st.session_state.sampling_steps else st.session_state.sampling_steps}/{st.session_state.sampling_steps} " - f"{percent if percent < 100 else 100}% {inference_progress}{duration:.2f}{speed} | " - f"Frame: {st.session_state.current_frame + 1 if st.session_state.current_frame < st.session_state.max_frames else st.session_state.max_frames}/{st.session_state.max_frames} " - f"{frames_percent if frames_percent < 100 else 100}% {st.session_state.frame_duration:.2f}{st.session_state.frame_speed}" - ) - st.session_state["progress_bar"].progress(percent if percent < 100 else 100) + percent = int(100 * float(i+1 if i+1 < st.session_state.sampling_steps else st.session_state.sampling_steps)/float(st.session_state.sampling_steps)) + frames_percent = int(100 * float(st.session_state.current_frame if st.session_state.current_frame < st.session_state.max_frames else st.session_state.max_frames)/float( + st.session_state.max_frames)) + + st.session_state["progress_bar_text"].text( + f"Running step: {i+1 if i+1 < st.session_state.sampling_steps else 
st.session_state.sampling_steps}/{st.session_state.sampling_steps} " + f"{percent if percent < 100 else 100}% {inference_progress}{duration:.2f}{speed} | " + f"Frame: {st.session_state.current_frame + 1 if st.session_state.current_frame < st.session_state.max_frames else st.session_state.max_frames}/{st.session_state.max_frames} " + f"{frames_percent if frames_percent < 100 else 100}% {st.session_state.frame_duration:.2f}{st.session_state.frame_speed}" + ) + st.session_state["progress_bar"].progress(percent if percent < 100 else 100) + + except KeyError: + raise StopException #scale and decode the image latents with vae cond_latents_2 = 1 / 0.18215 * cond_latents @@ -262,7 +267,23 @@ def load_diffusers_model(weights_path,torch_device): "You need a huggingface token in order to use the Text to Video tab. Use the Settings page from the sidebar on the left to add your token." ) raise OSError("You need a huggingface token in order to use the Text to Video tab. Use the Settings page from the sidebar on the left to add your token.") +# +def save_video_to_disk(frames, seeds, sanitized_prompt, fps=6,save_video=True, outdir='outputs'): + if save_video: + # write video to memory + #output = io.BytesIO() + #writer = imageio.get_writer(os.path.join(os.getcwd(), st.session_state['defaults'].general.outdir, "txt2vid"), im, extension=".mp4", fps=30) + #try: + video_path = os.path.join(os.getcwd(), outdir, "txt2vid",f"{seeds}_{sanitized_prompt}.mp4") + writer = imageio.get_writer(video_path, fps=fps) + for frame in frames: + writer.append_data(frame) + writer.close() + #except: + # print("Can't save video, skipping.") + + return video_path # def txt2vid( # -------------------------------------- @@ -275,6 +296,9 @@ def txt2vid( max_frames:int = 10000, # number of frames to write and then exit the script num_inference_steps:int = 50, # more (e.g. 100, 200 etc) can create slightly better images cfg_scale:float = 5.0, # can depend on the prompt. usually somewhere between 3-10 is good + save_video = True, + save_video_on_stop = False, + outdir='outputs', do_loop = False, use_lerp_for_text = False, seeds = None, @@ -332,11 +356,11 @@ def txt2vid( # init the output dir sanitized_prompt = slugify(prompts) - full_path = os.path.join(os.getcwd(), st.session_state['defaults'].general.outdir, "txt2vid-samples", "samples", sanitized_prompt) + full_path = os.path.join(os.getcwd(), st.session_state['defaults'].general.outdir, "txt2vid", "samples", sanitized_prompt) if len(full_path) > 220: sanitized_prompt = sanitized_prompt[:220-len(full_path)] - full_path = os.path.join(os.getcwd(), st.session_state['defaults'].general.outdir, "txt2vid-samples", "samples", sanitized_prompt) + full_path = os.path.join(os.getcwd(), st.session_state['defaults'].general.outdir, "txt2vid", "samples", sanitized_prompt) os.makedirs(full_path, exist_ok=True) @@ -512,11 +536,12 @@ def txt2vid( #append the frames to the frames list so we can use them later. frames.append(np.asarray(gfpgan_image)) - - st.session_state["preview_image"].image(gfpgan_image) - #except AttributeError: + try: + st.session_state["preview_image"].image(gfpgan_image) + except KeyError: + print ("Cant get session_state, skipping image preview.") + #except (AttributeError, KeyError): #print("Cant perform GFPGAN, skipping.") - #pass #increase frame_index counter. frame_index += 1 @@ -536,23 +561,18 @@ def txt2vid( init1 = init2 + # save the video after the generation is done. 
+ video_path = save_video_to_disk(frames, seeds, sanitized_prompt, save_video=save_video, outdir=outdir)
+
 except StopException:
- pass
+ if save_video_on_stop:
+ print ("Streamlit Stop Exception Received. Saving video")
+ video_path = save_video_to_disk(frames, seeds, sanitized_prompt, save_video=save_video, outdir=outdir)
+ else:
+ video_path = None
- if st.session_state['save_video']:
- # write video to memory
- #output = io.BytesIO()
- #writer = imageio.get_writer(os.path.join(os.getcwd(), st.session_state['defaults'].general.outdir, "txt2vid-samples"), im, extension=".mp4", fps=30)
- try:
- video_path = os.path.join(os.getcwd(), st.session_state['defaults'].general.outdir, "txt2vid-samples",f"{seeds}_{sanitized_prompt}.mp4")
- writer = imageio.get_writer(video_path, fps=6)
- for frame in frames:
- writer.append_data(frame)
- writer.close()
- except:
- print("Can't save video, skipping.")
-
+ if video_path and "preview_video" in st.session_state:
 # show video preview on the UI
 st.session_state["preview_video"].video(open(video_path, 'rb').read())
@@ -620,6 +640,11 @@ def layout():
 help="Frequency in steps at which the preview image is updated. By default the frequency \
 is set to 1 step.")
+ st.session_state["dynamic_preview_frequency"] = st.checkbox("Dynamic Preview Frequency", value=st.session_state['defaults'].txt2vid.dynamic_preview_frequency,
+ help="This option tries to find the best value at which we can update \
+ the preview image during generation while minimizing the impact it has on performance. Default: True")
+
+
 #
@@ -644,6 +669,7 @@ def layout():
 #generate_video = st.empty()
 st.session_state["preview_video"] = st.empty()
+ preview_video = st.session_state["preview_video"]
 message = st.empty()
@@ -702,19 +728,23 @@ def layout():
 help="Separate multiple prompts using the `|` character, and get all combinations of them.")
 st.session_state["normalize_prompt_weights"] = st.checkbox("Normalize Prompt Weights.", value=st.session_state['defaults'].txt2vid.normalize_prompt_weights,
 help="Ensure the sum of all weights add up to 1.0")
+
 st.session_state["save_individual_images"] = st.checkbox("Save individual images.", value=st.session_state['defaults'].txt2vid.save_individual_images,
 help="Save each image generated before any filter or enhancement is applied.")
+
 st.session_state["save_video"] = st.checkbox("Save video",value=st.session_state['defaults'].txt2vid.save_video,
 help="Save a video with all the images generated as frames at the end of the generation.")
+ save_video_on_stop = st.checkbox("Save video on Stop",value=st.session_state['defaults'].txt2vid.save_video_on_stop,
+ help="Save a video with all the images generated as frames when we hit the stop button during a generation.")
+
 st.session_state["group_by_prompt"] = st.checkbox("Group results by prompt", value=st.session_state['defaults'].txt2vid.group_by_prompt,
- help="Saves all the images with the same prompt into the same folder. When using a prompt matrix each prompt combination will have its own folder.")
+ help="Saves all the images with the same prompt into the same folder. When using a prompt \
+ matrix each prompt combination will have its own folder.")
+
 st.session_state["write_info_files"] = st.checkbox("Write Info file", value=st.session_state['defaults'].txt2vid.write_info_files,
 help="Save a file next to the image with information about the generation.")
- st.session_state["dynamic_preview_frequency"] = st.checkbox("Dynamic Preview Frequency", value=st.session_state['defaults'].txt2vid.dynamic_preview_frequency,
- help="This option tries to find the best value at which we can update \
- the preview image during generation while minimizing the impact it has in performance. Default: True")
 st.session_state["do_loop"] = st.checkbox("Do Loop", value=st.session_state['defaults'].txt2vid.do_loop,
 help="Do loop")
 st.session_state["save_as_jpg"] = st.checkbox("Save samples as jpg", value=st.session_state['defaults'].txt2vid.save_as_jpg,
 help="Saves the images as jpg instead of png.")
@@ -830,7 +860,7 @@ def layout():
 #load_models(False, st.session_state["use_GFPGAN"], True, st.session_state["RealESRGAN_model"])
 if st.session_state["use_GFPGAN"]:
- if "GFPGAN" in st.session_state:
+ if "GFPGAN" in server_state:
 print("GFPGAN already loaded")
 else:
 with col2:
@@ -838,28 +868,35 @@
 # Load GFPGAN
 if os.path.exists(st.session_state["defaults"].general.GFPGAN_dir):
 try:
- server_state["GFPGAN"] = load_GFPGAN()
+ load_GFPGAN()
 print("Loaded GFPGAN")
 except Exception:
 import traceback
 print("Error loading GFPGAN:", file=sys.stderr)
 print(traceback.format_exc(), file=sys.stderr)
 else:
- if "GFPGAN" in st.session_state:
+ if "GFPGAN" in server_state:
 del server_state["GFPGAN"]
 #try:
 # run video generation
 video, seed, info, stats = txt2vid(prompts=prompt, gpu=st.session_state["defaults"].general.gpu,
- num_steps=st.session_state.sampling_steps, max_frames=int(st.session_state.max_frames),
+ num_steps=st.session_state.sampling_steps, max_frames=st.session_state.max_frames,
 num_inference_steps=st.session_state.num_inference_steps,
- cfg_scale=cfg_scale,do_loop=st.session_state["do_loop"],
+ cfg_scale=cfg_scale, save_video_on_stop=save_video_on_stop,
+ outdir=st.session_state["defaults"].general.outdir,
+ do_loop=st.session_state["do_loop"],
 seeds=seed, quality=100, eta=0.0, width=width,
 height=height, weights_path=custom_model, scheduler=scheduler_name,
 disable_tqdm=False, beta_start=st.session_state['defaults'].txt2vid.beta_start.value,
 beta_end=st.session_state['defaults'].txt2vid.beta_end.value,
 beta_schedule=beta_scheduler_type, starting_image=None)
+ if video and save_video_on_stop:
+ # show video preview on the UI after we hit the stop button
+ # currently not working as session_state is cleared on StopException
+ preview_video.video(open(video, 'rb').read())
+
 #message.success('Done!', icon="✅")
 message.success('Render Complete: ' + info + '; Stats: ' + stats, icon="✅")

From fe6e72fde79214aa848d47afe74487d9855b8643 Mon Sep 17 00:00:00 2001
From: ZeroCool940711
Date: Fri, 7 Oct 2022 23:46:52 -0700
Subject: [PATCH 13/14] Added a save_video_on_stop option to the Settings page;
 it saves the txt2vid video generated so far to disk when the stop button is
 hit mid-generation.
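
The checkbox added here only exposes, in the Settings page, the behaviour introduced in the txt2vid.py changes of the previous patch. A minimal sketch of that save-on-stop flow is shown below; `frames`, `save_video_to_disk`, and `StopException` follow the diff above, while the simplified bodies and the `run_txt2vid`/`generate_frames` wrappers are illustrative assumptions rather than the actual implementation:

```python
import os
import imageio


class StopException(Exception):
    """Raised by the UI when the user presses Stop (mirrors sd_utils.StopException)."""


def save_video_to_disk(frames, seeds, sanitized_prompt, fps=6, save_video=True, outdir='outputs'):
    # Mirrors the helper added in the previous patch: dump the collected frames to an mp4.
    video_path = None
    if save_video and frames:
        video_path = os.path.join(os.getcwd(), outdir, "txt2vid", f"{seeds}_{sanitized_prompt}.mp4")
        os.makedirs(os.path.dirname(video_path), exist_ok=True)
        writer = imageio.get_writer(video_path, fps=fps)
        for frame in frames:
            writer.append_data(frame)
        writer.close()
    return video_path


def run_txt2vid(generate_frames, seeds, sanitized_prompt,
                save_video=True, save_video_on_stop=False, outdir='outputs'):
    frames = []
    try:
        for frame in generate_frames():  # yields uint8 numpy arrays, one per video frame
            frames.append(frame)
        return save_video_to_disk(frames, seeds, sanitized_prompt,
                                  save_video=save_video, outdir=outdir)
    except StopException:
        # New behaviour: when the option is enabled, keep whatever was generated so far
        # instead of discarding the partial run.
        if save_video_on_stop:
            print("Stop received, saving partial video.")
            return save_video_to_disk(frames, seeds, sanitized_prompt,
                                      save_video=save_video, outdir=outdir)
        return None
```

Since txt2vid runs can take a long time, saving the partial result on Stop avoids losing every frame generated up to that point.
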
--- requirements.txt | 2 +- scripts/Settings.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d8708a5..1c2f68e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,6 +34,7 @@ streamlit-tensorboard==0.0.2 hydralit==1.0.14 hydralit_components==1.0.10 stqdm==0.0.4 +diffusers==0.4.1 # Img2text ftfy==6.1.1 @@ -51,7 +52,6 @@ piexif==1.1.3 # used by sd_utils accelerate==0.12.0 albumentations==0.4.3 -diffusers==0.3.0 einops==0.3.1 facexlib>=0.2.3 imageio-ffmpeg==0.4.2 diff --git a/scripts/Settings.py b/scripts/Settings.py index eab0f57..a8bfe69 100644 --- a/scripts/Settings.py +++ b/scripts/Settings.py @@ -685,6 +685,10 @@ def layout(): st.session_state["defaults"].txt2vid.save_video = st.checkbox("Save Txt2Vid Video", value=st.session_state['defaults'].txt2vid.save_video, help="Choose to save the Txt2Vid video. Default: True") + st.session_state["defaults"].txt2vid.save_video_on_stop = st.checkbox("Save video on Stop",value=st.session_state['defaults'].txt2vid.save_video_on_stop, + help="Save a video with all the images generated as frames when we hit the stop button \ + during a generation.") + st.session_state["defaults"].txt2vid.group_by_prompt = st.checkbox("Group By txt2vid Prompt", value=st.session_state['defaults'].txt2vid.group_by_prompt, help="Choose to save images grouped by their prompt. Default: False") From 84479d88b88eb07b9d164a96b313eb3f683818e5 Mon Sep 17 00:00:00 2001 From: ZeroCool940711 Date: Sun, 9 Oct 2022 03:21:06 -0700 Subject: [PATCH 14/14] Changed the prompt text_input for a text_area and made it similar in size to the text input. --- frontend/css/streamlit.main.css | 8 +++++++- requirements.txt | 4 ++++ scripts/img2img.py | 2 +- scripts/txt2img.py | 2 +- scripts/txt2vid.py | 2 +- 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/frontend/css/streamlit.main.css b/frontend/css/streamlit.main.css index 8bb5db7..fb6d5fe 100644 --- a/frontend/css/streamlit.main.css +++ b/frontend/css/streamlit.main.css @@ -143,4 +143,10 @@ div.gallery:hover { } .css-jn99sy { display: none - } \ No newline at end of file + } + +/* Make the text area widget have a similar height as the text input field*/ +.st-ex{ + height: 54px; + min-height: 25px; +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1c2f68e..7ab31cf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -81,5 +81,9 @@ git+https://github.com/CompVis/latent-diffusion ## for monocular depth estimation tensorflow==2.10.0 +# Unused Packages: No current usage but will be used in the future. + + # Orphaned Packages: No usage found + diff --git a/scripts/img2img.py b/scripts/img2img.py index 75c801c..a0ddb3d 100644 --- a/scripts/img2img.py +++ b/scripts/img2img.py @@ -365,7 +365,7 @@ def layout(): img2img_input_col, img2img_generate_col = st.columns([10,1]) with img2img_input_col: #prompt = st.text_area("Input Text","") - prompt = st.text_input("Input Text","", placeholder="A corgi wearing a top hat as an oil painting.") + prompt = st.text_area("Input Text","", placeholder="A corgi wearing a top hat as an oil painting.") # Every form must have a submit button, the extra blank spaces is a temp way to align it with the input field. Needs to be done in CSS or some other way. 
img2img_generate_col.write("") diff --git a/scripts/txt2img.py b/scripts/txt2img.py index 79a2cf1..2b55b73 100644 --- a/scripts/txt2img.py +++ b/scripts/txt2img.py @@ -183,7 +183,7 @@ def layout(): with input_col1: #prompt = st.text_area("Input Text","") - prompt = st.text_input("Input Text","", placeholder="A corgi wearing a top hat as an oil painting.") + prompt = st.text_area("Input Text","", placeholder="A corgi wearing a top hat as an oil painting.") # creating the page layout using columns col1, col2, col3 = st.columns([1,2,1], gap="large") diff --git a/scripts/txt2vid.py b/scripts/txt2vid.py index 2428dd0..d2e5332 100644 --- a/scripts/txt2vid.py +++ b/scripts/txt2vid.py @@ -596,7 +596,7 @@ def layout(): input_col1, generate_col1 = st.columns([10,1]) with input_col1: #prompt = st.text_area("Input Text","") - prompt = st.text_input("Input Text","", placeholder="A corgi wearing a top hat as an oil painting.") + prompt = st.text_area("Input Text","", placeholder="A corgi wearing a top hat as an oil painting.") # Every form must have a submit button, the extra blank spaces is a temp way to align it with the input field. Needs to be done in CSS or some other way. generate_col1.write("")
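
For clarity, the widget change applied to the three `layout()` functions above boils down to the following pattern. This is a minimal sketch: the column ratios and placeholder text come from the diffs, while the Generate button handling is omitted.

```python
import streamlit as st

input_col1, generate_col1 = st.columns([10, 1])

with input_col1:
    # A text_area lets the prompt wrap over several lines; the new .st-ex rule in
    # frontend/css/streamlit.main.css keeps its height close to the old text_input.
    prompt = st.text_area("Input Text", "",
                          placeholder="A corgi wearing a top hat as an oil painting.")

# Blank writes are still used as a rough vertical spacer for the Generate button.
generate_col1.write("")
```

Note that `.st-ex` is an auto-generated Streamlit class name, so the CSS override may break across Streamlit upgrades; passing an explicit `height=` to `st.text_area` could be a less brittle alternative, though Streamlit may enforce a larger minimum height than the 54px used here.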