Removed the "find_noise_for_image.py" and "matched_noise.py" scripts as their content is now part of "sd_utils.py"
This commit is contained in:
parent 71be8f09f2
commit 2e684f948f
find_noise_for_image.py
@@ -1,57 +0,0 @@
# Credit to trygvebw for this implementation
# Original here: https://gist.github.com/trygvebw/c71334dd127d537a15e9d59790f7f5e1

import numpy as np
import torch
import k_diffusion as K
from tqdm.auto import trange, tqdm


def find_noise_for_image(model, device, init_image, prompt, steps=200, cond_scale=2.0, verbose=False, normalize=False, generation_callback=None):
    image = np.array(init_image).astype(np.float32) / 255.0
    image = image[None].transpose(0, 3, 1, 2)
    image = torch.from_numpy(image)
    image = 2. * image - 1.
    image = image.to(device)
    x = model.get_first_stage_encoding(model.encode_first_stage(image))

    uncond = model.get_learned_conditioning([''])
    cond = model.get_learned_conditioning([prompt])

    s_in = x.new_ones([x.shape[0]])
    dnw = K.external.CompVisDenoiser(model)
    sigmas = dnw.get_sigmas(steps).flip(0)

    if verbose:
        print(sigmas)

    for i in trange(1, len(sigmas)):
        x_in = torch.cat([x] * 2)
        sigma_in = torch.cat([sigmas[i - 1] * s_in] * 2)
        cond_in = torch.cat([uncond, cond])

        c_out, c_in = [K.utils.append_dims(k, x_in.ndim) for k in dnw.get_scalings(sigma_in)]

        if i == 1:
            t = dnw.sigma_to_t(torch.cat([sigmas[i] * s_in] * 2))
        else:
            t = dnw.sigma_to_t(sigma_in)

        eps = model.apply_model(x_in * c_in, t, cond=cond_in)
        denoised_uncond, denoised_cond = (x_in + eps * c_out).chunk(2)

        denoised = denoised_uncond + (denoised_cond - denoised_uncond) * cond_scale

        if i == 1:
            d = (x - denoised) / (2 * sigmas[i])
        else:
            d = (x - denoised) / sigmas[i - 1]

        if generation_callback is not None:
            generation_callback(x, i)

        dt = sigmas[i] - sigmas[i - 1]
        x = x + d * dt

    return x / sigmas[-1]
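
For reference, a minimal usage sketch of the removed function (not part of the original file): it assumes a CompVis LatentDiffusion checkpoint is already loaded as "model", and "input.png" and the prompt are placeholder inputs chosen for illustration.

import torch
from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Dimensions should be multiples of 64 so the first-stage encoder accepts them.
init_image = Image.open("input.png").convert("RGB").resize((512, 512))

with torch.no_grad():
    # Recovers the latent noise that, run back through a k-diffusion sampler with
    # the same prompt and settings, approximately reproduces init_image
    # (useful for prompt-guided image variations).
    noise = find_noise_for_image(model, device, init_image,
                                 "a watercolor landscape",
                                 steps=200, cond_scale=2.0)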
matched_noise.py
@@ -1,152 +0,0 @@
# CREDIT TO parlance-zz for discovering and implementing this method
# This code is from https://github.com/parlance-zz/g-diffuser-bot

import numpy as np
import skimage
import skimage.exposure  # ensure the exposure submodule is loaded on older scikit-image versions

# helper fft routines that keep ortho normalization and auto-shift before and after fft
def _fft2(data):
    if data.ndim > 2:  # has channels
        out_fft = np.zeros((data.shape[0], data.shape[1], data.shape[2]), dtype=np.complex128)
        for c in range(data.shape[2]):
            c_data = data[:, :, c]
            out_fft[:, :, c] = np.fft.fft2(np.fft.fftshift(c_data), norm="ortho")
            out_fft[:, :, c] = np.fft.ifftshift(out_fft[:, :, c])
    else:  # one channel
        out_fft = np.zeros((data.shape[0], data.shape[1]), dtype=np.complex128)
        out_fft[:, :] = np.fft.fft2(np.fft.fftshift(data), norm="ortho")
        out_fft[:, :] = np.fft.ifftshift(out_fft[:, :])

    return out_fft

def _ifft2(data):
    if data.ndim > 2:  # has channels
        out_ifft = np.zeros((data.shape[0], data.shape[1], data.shape[2]), dtype=np.complex128)
        for c in range(data.shape[2]):
            c_data = data[:, :, c]
            out_ifft[:, :, c] = np.fft.ifft2(np.fft.fftshift(c_data), norm="ortho")
            out_ifft[:, :, c] = np.fft.ifftshift(out_ifft[:, :, c])
    else:  # one channel
        out_ifft = np.zeros((data.shape[0], data.shape[1]), dtype=np.complex128)
        out_ifft[:, :] = np.fft.ifft2(np.fft.fftshift(data), norm="ortho")
        out_ifft[:, :] = np.fft.ifftshift(out_ifft[:, :])

    return out_ifft

def _get_gaussian_window(width, height, std=3.14, mode=0):

    window_scale_x = float(width / min(width, height))
    window_scale_y = float(height / min(width, height))

    window = np.zeros((width, height))
    x = (np.arange(width) / width * 2. - 1.) * window_scale_x
    for y in range(height):
        fy = (y / height * 2. - 1.) * window_scale_y
        if mode == 0:
            window[:, y] = np.exp(-(x**2 + fy**2) * std)
        else:
            window[:, y] = (1 / ((x**2 + 1.) * (fy**2 + 1.))) ** (std / 3.14)  # hey wait a minute, that's not gaussian

    return window

def _get_masked_window_rgb(np_mask_grey, hardness=1.):
    np_mask_rgb = np.zeros((np_mask_grey.shape[0], np_mask_grey.shape[1], 3))
    if hardness != 1.:
        hardened = np_mask_grey[:] ** hardness
    else:
        hardened = np_mask_grey[:]
    for c in range(3):
        np_mask_rgb[:, :, c] = hardened[:]
    return np_mask_rgb

"""
|
|
||||||
Explanation:
|
|
||||||
Getting good results in/out-painting with stable diffusion can be challenging.
|
|
||||||
Although there are simpler effective solutions for in-painting, out-painting can be especially challenging because there is no color data
|
|
||||||
in the masked area to help prompt the generator. Ideally, even for in-painting we'd like work effectively without that data as well.
|
|
||||||
Provided here is my take on a potential solution to this problem.
|
|
||||||
|
|
||||||
By taking a fourier transform of the masked src img we get a function that tells us the presence and orientation of each feature scale in the unmasked src.
|
|
||||||
Shaping the init/seed noise for in/outpainting to the same distribution of feature scales, orientations, and positions increases output coherence
|
|
||||||
by helping keep features aligned. This technique is applicable to any continuous generation task such as audio or video, each of which can
|
|
||||||
be conceptualized as a series of out-painting steps where the last half of the input "frame" is erased. For multi-channel data such as color
|
|
||||||
or stereo sound the "color tone" or histogram of the seed noise can be matched to improve quality (using scikit-image currently)
|
|
||||||
This method is quite robust and has the added benefit of being fast independently of the size of the out-painted area.
|
|
||||||
The effects of this method include things like helping the generator integrate the pre-existing view distance and camera angle.
|
|
||||||
|
|
||||||
Carefully managing color and brightness with histogram matching is also essential to achieving good coherence.
|
|
||||||
|
|
||||||
noise_q controls the exponent in the fall-off of the distribution can be any positive number, lower values means higher detail (range > 0, default 1.)
|
|
||||||
color_variation controls how much freedom is allowed for the colors/palette of the out-painted area (range 0..1, default 0.01)
|
|
||||||
This code is provided as is under the Unlicense (https://unlicense.org/)
|
|
||||||
Although you have no obligation to do so, if you found this code helpful please find it in your heart to credit me [parlance-zz].
|
|
||||||
|
|
||||||
Questions or comments can be sent to parlance@fifth-harmonic.com (https://github.com/parlance-zz/)
|
|
||||||
This code is part of a new branch of a discord bot I am working on integrating with diffusers (https://github.com/parlance-zz/g-diffuser-bot)
|
|
||||||
|
|
||||||
"""
|
|
||||||
def get_matched_noise(_np_src_image, np_mask_rgb, noise_q, color_variation):

    global DEBUG_MODE
    global TMP_ROOT_PATH

    width = _np_src_image.shape[0]
    height = _np_src_image.shape[1]
    num_channels = _np_src_image.shape[2]

    np_src_image = _np_src_image[:] * (1. - np_mask_rgb)
    np_mask_grey = (np.sum(np_mask_rgb, axis=2) / 3.)
    np_src_grey = (np.sum(np_src_image, axis=2) / 3.)
    all_mask = np.ones((width, height), dtype=bool)
    img_mask = np_mask_grey > 1e-6
    ref_mask = np_mask_grey < 1e-3

    windowed_image = _np_src_image * (1. - _get_masked_window_rgb(np_mask_grey))
    windowed_image /= np.max(windowed_image)
    windowed_image += np.average(_np_src_image) * np_mask_rgb  # / (1.-np.average(np_mask_rgb)) # rather than leave the masked area black, we get better results from fft by filling the average unmasked color
    #windowed_image += np.average(_np_src_image) * (np_mask_rgb * (1.- np_mask_rgb)) / (1.-np.average(np_mask_rgb)) # compensate for darkening across the mask transition area
    #_save_debug_img(windowed_image, "windowed_src_img")

    src_fft = _fft2(windowed_image)  # get feature statistics from masked src img
    src_dist = np.absolute(src_fft)
    src_phase = src_fft / src_dist
    #_save_debug_img(src_dist, "windowed_src_dist")

    noise_window = _get_gaussian_window(width, height, mode=1)  # start with simple gaussian noise
    noise_rgb = np.random.random_sample((width, height, num_channels))
    noise_grey = (np.sum(noise_rgb, axis=2) / 3.)
    noise_rgb *= color_variation  # the colorfulness of the starting noise is blended to greyscale with a parameter
    for c in range(num_channels):
        noise_rgb[:, :, c] += (1. - color_variation) * noise_grey

    noise_fft = _fft2(noise_rgb)
    for c in range(num_channels):
        noise_fft[:, :, c] *= noise_window
    noise_rgb = np.real(_ifft2(noise_fft))
    shaped_noise_fft = _fft2(noise_rgb)
    shaped_noise_fft[:, :, :] = np.absolute(shaped_noise_fft[:, :, :])**2 * (src_dist ** noise_q) * src_phase  # perform the actual shaping

    brightness_variation = 0.  # color_variation # todo: temporarily tying brightness variation to color variation for now
    contrast_adjusted_np_src = _np_src_image[:] * (brightness_variation + 1.) - brightness_variation * 2.

    # scikit-image is used for histogram matching, very convenient!
    shaped_noise = np.real(_ifft2(shaped_noise_fft))
    shaped_noise -= np.min(shaped_noise)
    shaped_noise /= np.max(shaped_noise)
    shaped_noise[img_mask, :] = skimage.exposure.match_histograms(shaped_noise[img_mask, :]**1., contrast_adjusted_np_src[ref_mask, :], channel_axis=1)
    shaped_noise = _np_src_image[:] * (1. - np_mask_rgb) + shaped_noise * np_mask_rgb
    #_save_debug_img(shaped_noise, "shaped_noise")

    matched_noise = np.zeros((width, height, num_channels))
    matched_noise = shaped_noise[:]
    #matched_noise[all_mask,:] = skimage.exposure.match_histograms(shaped_noise[all_mask,:], _np_src_image[ref_mask,:], channel_axis=1)
    #matched_noise = _np_src_image[:] * (1. - np_mask_rgb) + matched_noise * np_mask_rgb

    #_save_debug_img(matched_noise, "matched_noise")

    """
    todo:
    color_variation doesn't have to be a single number; the overall color tone of the out-painted area could be param controlled
    """

    return np.clip(matched_noise, 0., 1.)
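
For reference, a minimal usage sketch of the removed function (not part of the original file): "src.png" and "mask.png" are placeholder inputs, with the mask assumed white (1.0) where new content should be generated and black (0.0) where the source image should be kept.

import numpy as np
from PIL import Image

np_src = np.asarray(Image.open("src.png").convert("RGB")).astype(np.float64) / 255.
np_mask = np.asarray(Image.open("mask.png").convert("RGB")).astype(np.float64) / 255.

# Shaped seed noise in [0, 1], already blended with the unmasked source pixels;
# feed it to img2img as the init image when out-painting the masked region.
seed_noise = get_matched_noise(np_src, np_mask, noise_q=1.0, color_variation=0.01)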