- Bumped the version of diffusers used on the txt2vid tab to v0.3.0.

- Added initial file for the textual inversion tab.
ZeroCool940711 2022-09-16 11:50:22 -07:00
parent 5f3d7facde
commit ef2da42489
4 changed files with 70 additions and 38 deletions

View File

@@ -22,7 +22,7 @@ dependencies:
- accelerate==0.12.0
- albumentations==0.4.3
- basicsr>=1.3.4.0
- diffusers==0.2.4
- diffusers==0.3.0
- einops==0.3.0
- facexlib>=0.2.3
- gradio==3.1.6
@@ -43,6 +43,7 @@ dependencies:
- streamlit-option-menu==0.3.2
- streamlit_nested_layout
- test-tube>=0.7.5
- tensorboard
- torch-fidelity==0.3.0
- torchmetrics==0.6.0
- transformers==4.19.2
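
The txt2vid code now relies on diffusers 0.3.0 behaviour, so a small runtime guard can surface an out-of-date install early. A minimal sketch (the guard is not part of this commit, and it assumes the packaging module is importable):

# Optional guard (not part of this commit): fail fast if an older diffusers is installed.
import diffusers
from packaging import version

if version.parse(diffusers.__version__) < version.parse("0.3.0"):
    raise RuntimeError(f"diffusers {diffusers.__version__} found, but >= 0.3.0 is required")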

View File

@@ -1,17 +0,0 @@
# base webui import and utils.
from webui_streamlit import st
from sd_utils import *
# streamlit imports
#other imports
# Temp imports
# end of imports
#---------------------------------------------------------------------------------------------------------------
def layout():
    st.write("Textual Inversion")

View File

@@ -0,0 +1,53 @@
# base webui import and utils.
from webui_streamlit import st
from sd_utils import *
# streamlit imports
#other imports
#from transformers import CLIPTextModel, CLIPTokenizer
# Temp imports
# end of imports
#---------------------------------------------------------------------------------------------------------------
def load_learned_embed_in_clip(learned_embeds_path, text_encoder, tokenizer, token=None):
    loaded_learned_embeds = torch.load(learned_embeds_path, map_location="cpu")

    # separate token and the embeds
    trained_token = list(loaded_learned_embeds.keys())[0]
    embeds = loaded_learned_embeds[trained_token]

    # cast to dtype of text_encoder
    dtype = text_encoder.get_input_embeddings().weight.dtype
    embeds = embeds.to(dtype)

    # add the token in tokenizer
    token = token if token is not None else trained_token
    num_added_tokens = tokenizer.add_tokens(token)
    i = 1
    while num_added_tokens == 0:
        print(f"The tokenizer already contains the token {token}.")
        token = f"{token[:-1]}-{i}>"
        print(f"Attempting to add the token {token}.")
        num_added_tokens = tokenizer.add_tokens(token)
        i += 1

    # resize the token embeddings
    text_encoder.resize_token_embeddings(len(tokenizer))

    # get the id for the token and assign the embeds
    token_id = tokenizer.convert_tokens_to_ids(token)
    text_encoder.get_input_embeddings().weight.data[token_id] = embeds

    return token

#def token_loader()
learned_token = load_learned_embed_in_clip("models/custom/embeddings/Custom Ami.pt", pipe.text_encoder, pipe.tokenizer, "*")
#model_content["token"] = learned_token
#models.append(model_content)
def layout():
    st.write("Textual Inversion")

View File

@@ -147,14 +147,13 @@ def diffuse(
#scale and decode the image latents with vae
cond_latents_2 = 1 / 0.18215 * cond_latents
image_2 = pipe.vae.decode(cond_latents_2)
image = pipe.vae.decode(cond_latents_2)
# generate output numpy image as uint8
image_2 = (image_2 / 2 + 0.5).clamp(0, 1)
image_2 = image_2.cpu().permute(0, 2, 3, 1).numpy()
image_2 = (image_2[0] * 255).astype(np.uint8)
image = torch.clamp((image["sample"] + 1.0) / 2.0, min=0.0, max=1.0)
image = transforms.ToPILImage()(image.squeeze_(0))
st.session_state["preview_image"].image(image_2)
st.session_state["preview_image"].image(image)
step_counter = 0
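
Read as one piece, the new preview path decodes the latents with the VAE, rescales them from [-1, 1] to [0, 1], and hands a PIL image to Streamlit. A consolidated sketch of the same steps (assumes pipe, cond_latents and torchvision's transforms are in scope, as elsewhere in this file):

# Sketch of the new preview path: VAE decode, rescale, convert to PIL.
latents = 1 / 0.18215 * cond_latents               # undo the SD latent scaling factor
decoded = pipe.vae.decode(latents)["sample"]       # diffusers 0.3.0 returns an output object
decoded = torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)
preview = transforms.ToPILImage()(decoded.squeeze(0))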
@@ -186,15 +185,6 @@ def diffuse(
)
st.session_state["progress_bar"].progress(percent if percent < 100 else 100)
# scale and decode the image latents with vae
cond_latents = 1 / 0.18215 * cond_latents
image = pipe.vae.decode(cond_latents)
# generate output numpy image as uint8
image = (image / 2 + 0.5).clamp(0, 1)
image = image.cpu().permute(0, 2, 3, 1).numpy()
image = (image[0] * 255).astype(np.uint8)
return image
#
@@ -223,7 +213,8 @@ def txt2vid(
#-----------------------------------------------
beta_start = 0.0001,
beta_end = 0.00012,
beta_schedule = "scaled_linear"
beta_schedule = "scaled_linear",
starting_image=None
):
"""
prompt = ["blueberry spaghetti", "strawberry spaghetti"], # prompt to dream about
@@ -385,7 +376,8 @@ def txt2vid(
# get the conditional text embeddings based on the prompt
text_input = st.session_state["pipe"].tokenizer(prompts, padding="max_length", max_length=st.session_state["pipe"].tokenizer.model_max_length, truncation=True, return_tensors="pt")
cond_embeddings = st.session_state["pipe"].text_encoder(text_input.input_ids.to(torch_device))[0] # shape [1, 77, 768]
# sample a source
init1 = torch.randn((1, st.session_state["pipe"].unet.in_channels, height // 8, width // 8), device=torch_device)
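
Scripts of this kind usually draw a second latent and walk between the two to produce in-between frames. A hypothetical spherical-interpolation helper along those lines (not part of this diff):

# Hypothetical helper: spherical interpolation between two latent tensors such as init1/init2.
def slerp(t, v0, v1, dot_threshold=0.9995):
    dot = torch.sum(v0 * v1) / (torch.norm(v0) * torch.norm(v1))
    if torch.abs(dot) > dot_threshold:   # nearly parallel, plain lerp is good enough
        return (1 - t) * v0 + t * v1
    theta = torch.acos(dot)
    return (torch.sin((1 - t) * theta) * v0 + torch.sin(t * theta) * v1) / torch.sin(theta)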
@@ -427,15 +419,15 @@ def txt2vid(
with autocast("cuda"):
image = diffuse(st.session_state["pipe"], cond_embeddings, init, num_inference_steps, cfg_scale, eta)
im = Image.fromarray(image)
#im = Image.fromarray(image)
outpath = os.path.join(full_path, 'frame%06d.png' % frame_index)
im.save(outpath, quality=quality)
image.save(outpath, quality=quality)
# send the image to the UI to update it
#st.session_state["preview_image"].image(im)
#append the frames to the frames list so we can use them later.
frames.append(np.asarray(im))
frames.append(np.asarray(image))
#increase frame_index counter.
frame_index += 1
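
The collected frames are later written out as a video. One plausible assembly step, sketched with imageio (the output path and fps are placeholders, mp4 writing assumes imageio-ffmpeg is installed, and this code is not part of this diff):

# Hypothetical assembly of the collected frames into an mp4.
import imageio

writer = imageio.get_writer("outputs/txt2vid/out.mp4", fps=24)
for frame in frames:
    writer.append_data(frame)
writer.close()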
@@ -555,6 +547,9 @@ def layout():
width = st.slider("Width:", min_value=64, max_value=2048, value=st.session_state['defaults'].txt2vid.width, step=64)
height = st.slider("Height:", min_value=64, max_value=2048, value=st.session_state['defaults'].txt2vid.height, step=64)
cfg_scale = st.slider("CFG (Classifier Free Guidance Scale):", min_value=1.0, max_value=30.0, value=st.session_state['defaults'].txt2vid.cfg_scale, step=0.5, help="How strongly the image should follow the prompt.")
#uploaded_images = st.file_uploader("Upload Image", accept_multiple_files=False, type=["png", "jpg", "jpeg"],
#help="Upload an image which will be used for the image to image generation.")
seed = st.text_input("Seed:", value=st.session_state['defaults'].txt2vid.seed, help=" The seed to use, if left blank a random seed will be generated.")
#batch_count = st.slider("Batch count.", min_value=1, max_value=100, value=st.session_state['defaults'].txt2vid.batch_count, step=1, help="How many iterations or batches of images to generate in total.")
#batch_size = st.slider("Batch size", min_value=1, max_value=250, value=st.session_state['defaults'].txt2vid.batch_size, step=1,
@@ -690,7 +685,7 @@ def layout():
seeds=seed, quality=100, eta=0.0, width=width,
height=height, weights_path=custom_model, scheduler=scheduler_name,
disable_tqdm=False, fp=st.session_state.defaults.general.fp, beta_start=st.session_state["beta_start"], beta_end=st.session_state["beta_end"],
beta_schedule=beta_scheduler_type)
beta_schedule=beta_scheduler_type, starting_image=None)
#message.success('Done!', icon="✅")
message.success('Render Complete: ' + info + '; Stats: ' + stats, icon="✅")