Mirror of https://github.com/Sygil-Dev/sygil-webui.git (synced 2024-12-15 06:21:34 +03:00)
- Bumped the version of diffusers used on the txt2vid tab to v0.3.0.
- Added the initial file for the textual inversion tab.
This commit is contained in:
parent 5f3d7facde
commit ef2da42489
@@ -22,7 +22,7 @@ dependencies:
     - accelerate==0.12.0
     - albumentations==0.4.3
     - basicsr>=1.3.4.0
-    - diffusers==0.2.4
+    - diffusers==0.3.0
     - einops==0.3.0
     - facexlib>=0.2.3
     - gradio==3.1.6
@@ -43,6 +43,7 @@ dependencies:
     - streamlit-option-menu==0.3.2
+    - streamlit_nested_layout
     - test-tube>=0.7.5
     - tensorboard
     - torch-fidelity==0.3.0
     - torchmetrics==0.6.0
     - transformers==4.19.2
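The hunks above pin diffusers to 0.3.0 and add streamlit_nested_layout to the environment. A quick sanity check, using only the packages named in the diff, that a rebuilt environment picked up the new pins:

```python
# Verify the rebuilt environment resolved the versions the diff pins.
import diffusers
import transformers

assert diffusers.__version__ == "0.3.0", f"expected diffusers 0.3.0, got {diffusers.__version__}"
assert transformers.__version__ == "4.19.2", f"expected transformers 4.19.2, got {transformers.__version__}"
print("diffusers", diffusers.__version__, "/ transformers", transformers.__version__)
```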
@@ -1,17 +0,0 @@
-# base webui import and utils.
-from webui_streamlit import st
-from sd_utils import *
-
-# streamlit imports
-
-
-#other imports
-
-# Temp imports
-
-
-# end of imports
-#---------------------------------------------------------------------------------------------------------------
-
-def layout():
-    st.write("Textual Inversion")
scripts/textual_inversion.py (new file, 53 lines)
@@ -0,0 +1,53 @@
+# base webui import and utils.
+from webui_streamlit import st
+from sd_utils import *
+
+# streamlit imports
+
+
+#other imports
+#from transformers import CLIPTextModel, CLIPTokenizer
+
+# Temp imports
+
+
+# end of imports
+#---------------------------------------------------------------------------------------------------------------
+
+def load_learned_embed_in_clip(learned_embeds_path, text_encoder, tokenizer, token=None):
+    loaded_learned_embeds = torch.load(learned_embeds_path, map_location="cpu")
+
+    # separate token and the embeds
+    trained_token = list(loaded_learned_embeds.keys())[0]
+    embeds = loaded_learned_embeds[trained_token]
+
+    # cast to dtype of text_encoder
+    dtype = text_encoder.get_input_embeddings().weight.dtype
+    embeds.to(dtype)
+
+    # add the token in tokenizer
+    token = token if token is not None else trained_token
+    num_added_tokens = tokenizer.add_tokens(token)
+    i = 1
+    while(num_added_tokens == 0):
+        print(f"The tokenizer already contains the token {token}.")
+        token = f"{token[:-1]}-{i}>"
+        print(f"Attempting to add the token {token}.")
+        num_added_tokens = tokenizer.add_tokens(token)
+        i+=1
+
+    # resize the token embeddings
+    text_encoder.resize_token_embeddings(len(tokenizer))
+
+    # get the id for the token and assign the embeds
+    token_id = tokenizer.convert_tokens_to_ids(token)
+    text_encoder.get_input_embeddings().weight.data[token_id] = embeds
+    return token
+
+#def token_loader()
+learned_token = load_learned_embed_in_clip(f"models/custom/embeddings/Custom Ami.pt", pipe.text_encoder, pipe.tokenizer, "*")
+#model_content["token"] = learned_token
+#models.append(model_content)
+
+def layout():
+    st.write("Textual Inversion")
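A rough sketch of how the helper above would typically be driven. It assumes load_learned_embed_in_clip is in scope (e.g. copied from the new file), that a standard diffusers StableDiffusionPipeline is available, and that the model id, device, embedding path, and token name are placeholders rather than anything this commit ships:

```python
import torch
from diffusers import StableDiffusionPipeline

# Placeholders: model id, device and embedding path are illustrative, not part of the commit.
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")

# Register the learned concept; the helper returns the token actually added,
# which may pick up a "-1", "-2", ... suffix if the requested one already exists.
token = load_learned_embed_in_clip(
    "models/custom/embeddings/my-concept.pt",  # hypothetical .pt produced by textual inversion training
    pipe.text_encoder,
    pipe.tokenizer,
    token="<my-concept>",
)

result = pipe(f"a photo of {token} riding a bicycle")
result.images[0].save("textual_inversion_sample.png")
```

Because the helper renames the token when it collides with an existing one, the value it returns is the name to use in prompts, not necessarily the name that was requested.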
@@ -147,14 +147,13 @@ def diffuse(
 
 #scale and decode the image latents with vae
 cond_latents_2 = 1 / 0.18215 * cond_latents
-image_2 = pipe.vae.decode(cond_latents_2)
+image = pipe.vae.decode(cond_latents_2)
 
 # generate output numpy image as uint8
-image_2 = (image_2 / 2 + 0.5).clamp(0, 1)
-image_2 = image_2.cpu().permute(0, 2, 3, 1).numpy()
-image_2 = (image_2[0] * 255).astype(np.uint8)
+image = torch.clamp((image["sample"] + 1.0) / 2.0, min=0.0, max=1.0)
+image = transforms.ToPILImage()(image.squeeze_(0))
 
-st.session_state["preview_image"].image(image_2)
+st.session_state["preview_image"].image(image)
 
 step_counter = 0
 
@@ -186,15 +185,6 @@ def diffuse(
 )
 st.session_state["progress_bar"].progress(percent if percent < 100 else 100)
 
-# scale and decode the image latents with vae
-cond_latents = 1 / 0.18215 * cond_latents
-image = pipe.vae.decode(cond_latents)
-
-# generate output numpy image as uint8
-image = (image / 2 + 0.5).clamp(0, 1)
-image = image.cpu().permute(0, 2, 3, 1).numpy()
-image = (image[0] * 255).astype(np.uint8)
-
 return image
 
 #
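The new preview path in diffuse() goes straight from the VAE output to a PIL image instead of building a uint8 numpy array. A minimal standalone sketch of that conversion, assuming a diffusers-style vae whose decode() result exposes a "sample" tensor of shape (1, 3, H, W); the function name here is purely illustrative:

```python
import torch
from torchvision import transforms

def latents_to_pil(vae, cond_latents):
    """Decode SD latents to a PIL image the way the updated diffuse() preview does."""
    cond_latents = 1 / 0.18215 * cond_latents        # undo the latent scaling applied at encode time
    decoded = vae.decode(cond_latents)["sample"]     # (1, 3, H, W), values roughly in [-1, 1]
    decoded = torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)
    return transforms.ToPILImage()(decoded.squeeze(0).cpu())
```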
@@ -223,7 +213,8 @@ def txt2vid(
 #-----------------------------------------------
 beta_start = 0.0001,
 beta_end = 0.00012,
-beta_schedule = "scaled_linear"
+beta_schedule = "scaled_linear",
+starting_image=None
 ):
 """
 prompt = ["blueberry spaghetti", "strawberry spaghetti"], # prompt to dream about
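beta_start, beta_end and beta_schedule above are scheduler parameters that txt2vid() forwards when it builds its noise scheduler. As an illustration only (the commit does not show which scheduler class these defaults end up in), this is how such values map onto a diffusers 0.3.0 scheduler constructor:

```python
from diffusers import LMSDiscreteScheduler

# Illustration: plugging the txt2vid defaults into one of the schedulers diffusers 0.3.0 provides.
scheduler = LMSDiscreteScheduler(
    beta_start=0.0001,
    beta_end=0.00012,
    beta_schedule="scaled_linear",
)
```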
@@ -385,7 +376,8 @@ def txt2vid(
 # get the conditional text embeddings based on the prompt
 text_input = st.session_state["pipe"].tokenizer(prompts, padding="max_length", max_length=st.session_state["pipe"].tokenizer.model_max_length, truncation=True, return_tensors="pt")
 cond_embeddings = st.session_state["pipe"].text_encoder(text_input.input_ids.to(torch_device))[0] # shape [1, 77, 768]
-
+
+
 # sample a source
 init1 = torch.randn((1, st.session_state["pipe"].unet.in_channels, height // 8, width // 8), device=torch_device)
 
@@ -427,15 +419,15 @@ def txt2vid(
 with autocast("cuda"):
     image = diffuse(st.session_state["pipe"], cond_embeddings, init, num_inference_steps, cfg_scale, eta)
 
-im = Image.fromarray(image)
+#im = Image.fromarray(image)
 outpath = os.path.join(full_path, 'frame%06d.png' % frame_index)
-im.save(outpath, quality=quality)
+image.save(outpath, quality=quality)
 
 # send the image to the UI to update it
 #st.session_state["preview_image"].image(im)
 
 #append the frames to the frames list so we can use them later.
-frames.append(np.asarray(im))
+frames.append(np.asarray(image))
 
 #increase frame_index counter.
 frame_index += 1
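Each frame is now saved as a PIL image and also appended to frames as a numpy array for later use; the commit itself does not show how that list is consumed. A minimal sketch of one common way to assemble it into a clip, assuming imageio (with imageio-ffmpeg for mp4 output) is installed; the function and file names are placeholders:

```python
import imageio

def frames_to_video(frames, outpath="txt2vid_output.mp4", fps=24):
    # frames: list of (H, W, 3) uint8 arrays, as produced by np.asarray(image) above.
    imageio.mimsave(outpath, frames, fps=fps)
```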
@@ -555,6 +547,9 @@ def layout():
 width = st.slider("Width:", min_value=64, max_value=2048, value=st.session_state['defaults'].txt2vid.width, step=64)
 height = st.slider("Height:", min_value=64, max_value=2048, value=st.session_state['defaults'].txt2vid.height, step=64)
 cfg_scale = st.slider("CFG (Classifier Free Guidance Scale):", min_value=1.0, max_value=30.0, value=st.session_state['defaults'].txt2vid.cfg_scale, step=0.5, help="How strongly the image should follow the prompt.")
+
+#uploaded_images = st.file_uploader("Upload Image", accept_multiple_files=False, type=["png", "jpg", "jpeg"],
+                                    #help="Upload an image which will be used for the image to image generation.")
 seed = st.text_input("Seed:", value=st.session_state['defaults'].txt2vid.seed, help=" The seed to use, if left blank a random seed will be generated.")
 #batch_count = st.slider("Batch count.", min_value=1, max_value=100, value=st.session_state['defaults'].txt2vid.batch_count, step=1, help="How many iterations or batches of images to generate in total.")
 #batch_size = st.slider("Batch size", min_value=1, max_value=250, value=st.session_state['defaults'].txt2vid.batch_size, step=1,
@@ -690,7 +685,7 @@ def layout():
 seeds=seed, quality=100, eta=0.0, width=width,
 height=height, weights_path=custom_model, scheduler=scheduler_name,
 disable_tqdm=False, fp=st.session_state.defaults.general.fp, beta_start=st.session_state["beta_start"], beta_end=st.session_state["beta_end"],
-beta_schedule=beta_scheduler_type)
+beta_schedule=beta_scheduler_type, starting_image=None)
 
 #message.success('Done!', icon="✅")
 message.success('Render Complete: ' + info + '; Stats: ' + stats, icon="✅")