Reverted txt2vid to use the StableDiffusionPipeline instead of StableDiffusionWalkPipeline when loading the model. (#1630)
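In practical terms, txt2vid now builds the stock diffusers pipeline when loading the model rather than the walk variant. A minimal sketch of that load, assuming the diffusers API of the period; the model id and device are illustrative stand-ins for the repo's actual weights_path handling:

    import torch
    from diffusers import StableDiffusionPipeline

    # "runwayml/stable-diffusion-v1-5" stands in for the repo's weights_path.
    # safety_checker=None mirrors the commit's choice below: it avoids
    # false positives while interpolating video frames.
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        revision="fp16",
        torch_dtype=torch.float16,
        safety_checker=None,
    )
    pipe = pipe.to("cuda")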

commit e13132c78c
Alejandro Gil, 2022-10-31 07:15:19 -07:00, committed by GitHub
GPG Key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)
3 changed files with 47 additions and 69 deletions

View File

@@ -654,40 +654,6 @@ def layout():
     message.success('Render Complete: ' + info + '; Stats: ' + stats, icon="")
-    #history_tab,col1,col2,col3,PlaceHolder,col1_cont,col2_cont,col3_cont = st.session_state['historyTab']
-    #if 'latestImages' in st.session_state:
-        #for i in output_images:
-            ##push the new image to the list of latest images and remove the oldest one
-            ##remove the last index from the list\
-            #st.session_state['latestImages'].pop()
-            ##add the new image to the start of the list
-            #st.session_state['latestImages'].insert(0, i)
-        #PlaceHolder.empty()
-        #with PlaceHolder.container():
-            #col1, col2, col3 = st.columns(3)
-            #col1_cont = st.container()
-            #col2_cont = st.container()
-            #col3_cont = st.container()
-            #images = st.session_state['latestImages']
-            #with col1_cont:
-                #with col1:
-                    #[st.image(images[index]) for index in [0, 3, 6] if index < len(images)]
-            #with col2_cont:
-                #with col2:
-                    #[st.image(images[index]) for index in [1, 4, 7] if index < len(images)]
-            #with col3_cont:
-                #with col3:
-                    #[st.image(images[index]) for index in [2, 5, 8] if index < len(images)]
-    #historyGallery = st.empty()
-    ## check if output_images length is the same as seeds length
-    #with gallery_tab:
-        #st.markdown(createHTMLGallery(output_images,seeds), unsafe_allow_html=True)
-    #st.session_state['historyTab'] = [history_tab,col1,col2,col3,PlaceHolder,col1_cont,col2_cont,col3_cont]
     with gallery_tab:
         logger.info(seeds)
         st.session_state["gallery"].text = ""

View File

@@ -377,10 +377,9 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):
         if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
             removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-            print(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
+            print("The following part of your input was truncated because CLIP can only handle sequences up to"
+                  f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                  )
             text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
             text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
         else:
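The print reformat above changes no behavior: prompts past the tokenizer limit are still cut at model_max_length (77 tokens for CLIP) after the warning. A standalone sketch of that truncation using the public CLIP tokenizer, not this repo's pipeline:

    from transformers import CLIPTokenizer

    # Stable Diffusion v1 uses the CLIP ViT-L/14 tokenizer (model_max_length == 77).
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    ids = tokenizer("a very long prompt " * 40, return_tensors="pt").input_ids
    if ids.shape[-1] > tokenizer.model_max_length:
        dropped = tokenizer.batch_decode(ids[:, tokenizer.model_max_length:])
        print(f"truncated: {dropped}")
        ids = ids[:, :tokenizer.model_max_length]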
@@ -613,7 +612,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):
     def walk(
         self,
-        prompts: Optional[List[str]] = None,
+        prompt: Optional[List[str]] = None,
         seeds: Optional[List[int]] = None,
         num_interpolation_steps: Optional[Union[int, List[int]]] = 5, # int or list of int
         output_dir: Optional[str] = "./dreams",
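The rename means call sites must pass prompt= instead of prompts=; the commented-out walk call later in this commit is updated to match. A hypothetical call against a StableDiffusionWalkPipeline instance after the rename (values are illustrative):

    # pipe is assumed to be a loaded StableDiffusionWalkPipeline.
    video_path = pipe.walk(
        prompt=["a forest at dawn", "a beach at sunset"],
        seeds=[42, 1337],
        num_interpolation_steps=5,
        output_dir="./dreams",
    )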
@@ -1108,7 +1107,7 @@ def load_diffusers_model(weights_path,torch_device):
     model_path = os.path.join("models", "diffusers", "stable-diffusion-v1-5")
     if not os.path.exists(model_path + "/model_index.json"):
-        server_state["pipe"] = StableDiffusionWalkPipeline.from_pretrained(
+        server_state["pipe"] = StableDiffusionPipeline.from_pretrained(
             weights_path,
             use_local_file=True,
             use_auth_token=st.session_state["defaults"].general.huggingface_token,
@@ -1116,11 +1115,12 @@ def load_diffusers_model(weights_path,torch_device):
             revision="fp16" if not st.session_state['defaults'].general.no_half else None,
             safety_checker=None, # Very important for videos...lots of false positives while interpolating
             #custom_pipeline="interpolate_stable_diffusion",
         )
-        StableDiffusionWalkPipeline.save_pretrained(server_state["pipe"], model_path)
+        StableDiffusionPipeline.save_pretrained(server_state["pipe"], model_path)
     else:
-        server_state["pipe"] = StableDiffusionWalkPipeline.from_pretrained(
+        server_state["pipe"] = StableDiffusionPipeline.from_pretrained(
             model_path,
             use_local_file=True,
             torch_dtype=torch.float16 if st.session_state['defaults'].general.use_float16 else None,
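Around the class swap, load_diffusers_model keeps its download-once pattern: if the local copy lacks model_index.json, pull from the Hub with the user's token and save it to disk; otherwise load the cached copy. The same logic stripped to the generic diffusers API (use_local_file is project-specific and omitted here):

    import os
    from diffusers import StableDiffusionPipeline

    model_path = os.path.join("models", "diffusers", "stable-diffusion-v1-5")
    if not os.path.exists(os.path.join(model_path, "model_index.json")):
        # First run: download from the Hub, then cache next to the app.
        pipe = StableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", use_auth_token=True)
        pipe.save_pretrained(model_path)
    else:
        # Later runs: load from disk, no network needed.
        pipe = StableDiffusionPipeline.from_pretrained(model_path)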
@@ -1166,9 +1166,11 @@ def load_diffusers_model(weights_path,torch_device):
     if "huggingface_token" not in st.session_state or st.session_state["defaults"].general.huggingface_token == "None":
         if "progress_bar_text" in st.session_state:
             st.session_state["progress_bar_text"].error(
-                "You need a huggingface token in order to use the Text to Video tab. Use the Settings page from the sidebar on the left to add your token."
-            )
-        raise OSError("You need a huggingface token in order to use the Text to Video tab. Use the Settings page from the sidebar on the left to add your token.")
+                "You need a huggingface token in order to use the Text to Video tab. Use the Settings page to add your token under the Huggingface section. "
+                "Make sure you save your settings after adding it."
+            )
+        raise OSError("You need a huggingface token in order to use the Text to Video tab. Use the Settings page to add your token under the Huggingface section. "
+                      "Make sure you save your settings after adding it.")
     else:
         if "progress_bar_text" in st.session_state:
             st.session_state["progress_bar_text"].error(e)
@@ -1434,29 +1436,30 @@ def txt2vid(
     # works correctly generating all frames but do not show the preview image
     # we also do not have control over the generation and cant stop it until the end of it.
     #with torch.autocast("cuda"):
-        #print (prompts)
         #video_path = server_state["pipe"].walk(
-            #prompts=prompts,
-            #seeds=seeds,
-            #num_interpolation_steps=num_steps,
-            #height=height, # use multiples of 64 if > 512. Multiples of 8 if < 512.
-            #width=width, # use multiples of 64 if > 512. Multiples of 8 if < 512.
-            #batch_size=4,
-            #fps=30,
-            #image_file_ext = ".png",
-            #eta = 0.0,
-            #output_dir=full_path, # Where images/videos will be saved
-            ##name='test', # Subdirectory of output_dir where images/videos will be saved
-            #guidance_scale=cfg_scale, # Higher adheres to prompt more, lower lets model take the wheel
-            #num_inference_steps=num_inference_steps, # Number of diffusion steps per image generated. 50 is good default
-            #upsample = False,
-            ##do_loop=do_loop, # Change to True if you want last prompt to loop back to first prompt
-            #resume = False,
-            #audio_filepath = None,
-            #audio_start_sec = None,
-            #margin = 1.0,
-            #smooth = 0.0,
-            #callback=txt2vid_generation_callback, # our callback function will be called with the arguments callback(step, timestep, latents)
-            #callback_steps=1 # our callback function will be called once this many steps are processed in a single frame
+            #prompt=prompts,
+            #seeds=seeds,
+            #num_interpolation_steps=num_steps,
+            #height=height, # use multiples of 64 if > 512. Multiples of 8 if < 512.
+            #width=width, # use multiples of 64 if > 512. Multiples of 8 if < 512.
+            #batch_size=4,
+            #fps=30,
+            #image_file_ext = ".png",
+            #eta = 0.0,
+            #output_dir=full_path, # Where images/videos will be saved
+            ##name='test', # Subdirectory of output_dir where images/videos will be saved
+            #guidance_scale=cfg_scale, # Higher adheres to prompt more, lower lets model take the wheel
+            #num_inference_steps=num_inference_steps, # Number of diffusion steps per image generated. 50 is good default
+            #upsample = False,
+            ##do_loop=do_loop, # Change to True if you want last prompt to loop back to first prompt
+            #resume = False,
+            #audio_filepath = None,
+            #audio_start_sec = None,
+            #margin = 1.0,
+            #smooth = 0.0,
+            #callback=txt2vid_generation_callback, # our callback function will be called with the arguments callback(step, timestep, latents)
+            #callback_steps=1 # our callback function will be called once this many steps are processed in a single frame
         #)
     # old code
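The trailing comments in the dead block describe diffusers' standard progress hooks: the pipeline calls callback(step, timestep, latents) once every callback_steps denoising steps. A sketch of wiring a preview callback into an ordinary pipeline call; the callback body is illustrative, and the repo's txt2vid_generation_callback does more than print:

    def txt2vid_generation_callback(step, timestep, latents):
        # Invoked every `callback_steps` steps; a real implementation would
        # decode `latents` to an image and refresh the Streamlit preview.
        print(f"step {step} (t={timestep}), latents shape {tuple(latents.shape)}")

    image = pipe(
        prompt="a fantasy landscape",
        callback=txt2vid_generation_callback,
        callback_steps=1,
    ).images[0]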

View File

@@ -123,7 +123,10 @@ def layout():
     # specify the primary menu definition
     menu_data = [
         {'id': 'Stable Diffusion', 'label': 'Stable Diffusion', 'icon': 'bi bi-grid-1x2-fill'},
-        {'id': 'Textual Inversion', 'label': 'Textual Inversion', 'icon': 'bi bi-lightbulb-fill'},
+        {'id': 'Train','label':"Train", 'icon': "bi bi-lightbulb-fill", 'submenu':[
+            {'id': 'Textual Inversion', 'label': 'Textual Inversion', 'icon': 'bi bi-lightbulb-fill'},
+            {'id': 'Fine Tunning', 'label': 'Fine Tunning', 'icon': 'bi bi-lightbulb-fill'},
+        ]},
         {'id': 'Model Manager', 'label': 'Model Manager', 'icon': 'bi bi-cloud-arrow-down-fill'},
         {'id': 'Tools','label':"Tools", 'icon': "bi bi-tools", 'submenu':[
             {'id': 'API Server', 'label': 'API Server', 'icon': 'bi bi-server'},
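For context, menu_data is the nested menu definition the app's navigation bar consumes; entries with a 'submenu' key, like the new Train item, render as dropdowns. A sketch assuming the hydralit_components API that this layout resembles (the repo may wrap it differently):

    import hydralit_components as hc

    # Returns the 'id' of the clicked entry, e.g. 'Textual Inversion' or
    # 'Fine Tunning' from the Train submenu; it drives the dispatch below.
    menu_id = hc.nav_bar(menu_definition=menu_data)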
@@ -188,8 +191,9 @@ def layout():
         st.experimental_rerun()
     txt2img_tab, img2img_tab, txt2vid_tab, img2txt_tab, concept_library_tab = st.tabs(["Text-to-Image", "Image-to-Image",
                                                                                        #"Inpainting",
                                                                                        "Text-to-Video", "Image-To-Text",
-                                                                                       "Concept Library"])
+                                                                                       "Concept Library"])
     #with home_tab:
         #from home import layout
         #layout()
@@ -229,6 +233,11 @@ def layout():
         from textual_inversion import layout
         layout()
+    elif menu_id == 'Fine Tunning':
+        #from textual_inversion import layout
+        #layout()
+        st.info("Under Construction. :construction_worker:")
     elif menu_id == 'API Server':
         set_page_title("API Server - Stable Diffusion Playground")
         from APIServer import layout