commit 81b41ce0d3
hlky, 2022-10-02 19:48:30 +01:00 (committed by GitHub)
GPG Key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)
133 changed files with 173368 additions and 6002 deletions


@ -1,4 +1,4 @@
models/
models/custom/
outputs/
src/
gfpgan/


@ -1,3 +1,19 @@
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Validate the model files on every container restart
# (useful to set to false after you're sure the model files are already in place)
VALIDATE_MODELS=true
@ -12,3 +28,5 @@ WEBUI_SCRIPT=webui.py
# Pass cli arguments to webui.py e.g:
# WEBUI_ARGS=--optimized --extra-models-cpu --gpu=1 --esrgan-gpu=1 --gfpgan-gpu=1
WEBUI_ARGS=
STREAMLIT_SERVER_HEADLESS=true


@ -1,11 +0,0 @@
blank_issues_enabled: false
contact_links:
- name: WebUI developer repository
url: https://github.com/hlky/stable-diffusion-webui/issues/new/choose
about: MOST BUGS SHOULD GO HERE. Have a bug related to the features? Please open a bug on the developer repository.
- name: Feature Request, Question or Suggestion
url: https://github.com/hlky/stable-diffusion-webui/discussions
about: Please create a discussion and see if folks have already solved it
- name: Colab version specific bug?
url: https://github.com/altryne/sd-webui-colab/issues/new/choose
about: Please open colab related bugs here

.github/dependabot.yml vendored Normal file (13 lines changed)

@ -0,0 +1,13 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
target-branch: "dev"
schedule:
interval: "daily"

.github/sync.yml vendored (11 lines changed)

@ -1,11 +0,0 @@
sd-webui/daisi:
- source: configs/
- source: data/
- source: frontend/
- source: ldm/
- source: models/
- source: outputs/
- source: optimizedSD/
- source: frontend/
- source: scripts/
dest: .


@ -1,16 +0,0 @@
name: Sync Files
on:
push:
branches:
- dev
workflow_dispatch:
jobs:
sync:
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@master
- name: Run GitHub File Sync
uses: BetaHuhn/repo-file-sync-action@v1
with:
GH_PAT: ${{ secrets.GH_PAT2 }}

.gitignore vendored (2 lines changed)

@ -64,3 +64,5 @@ condaenv.*.requirements.txt
/gfpgan/*
/models/*
z_version_env.tmp
scripts/bridgeData.py
/user_data/*

.streamlit/config.toml Normal file (49 lines changed)

@ -0,0 +1,49 @@
[global]
disableWatchdogWarning = false
showWarningOnDirectExecution = true
dataFrameSerialization = "arrow"
[logger]
level = "info"
messageFormat = "%(asctime)s %(message)s"
[client]
caching = true
displayEnabled = true
showErrorDetails = true
[runner]
magicEnabled = true
installTracer = false
fixMatplotlib = true
postScriptGC = true
fastReruns = false
[server]
folderWatchBlacklist = []
fileWatcherType = "auto"
cookieSecret = ""
headless = false
runOnSave = false
port = 8501
baseUrlPath = ""
enableCORS = true
enableXsrfProtection = true
maxUploadSize = 200
maxMessageSize = 200
enableWebsocketCompression = false
[browser]
serverAddress = "localhost"
gatherUsageStats = false
serverPort = 8501
[mapbox]
token = ""
[deprecation]
showfileUploaderEncoding = true
showPyplotGlobalUse = true
[theme]
base = "dark"


@ -1,37 +1,59 @@
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Assumes host environment is AMD64 architecture
# ARG TARGETPLATFORM
# We should use the Pytorch CUDA/GPU-enabled base image. See: https://hub.docker.com/r/pytorch/pytorch/tags
# FROM nvidia/cuda:11.3.1-runtime-ubuntu20.04
# This is used to allow building against AMD GPUs
# Annoyingly, you can't IF branch off of, say, TARGETGPU and set
# the Dockerfile's FROM based on that, so we have to have the user
# pass in the entire image path for us.
ARG PYTORCH_IMAGE=pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime
# To build against AMD, use
# --build-arg PYTORCH_IMAGE=rocm/pytorch:rocm5.2.3_ubuntu20.04_py3.7_pytorch_1.12.1
# Assumes AMD64 host architecture
FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime
FROM ${PYTORCH_IMAGE}
WORKDIR /install
SHELL ["/bin/bash", "-c"]
RUN apt-get update && \
apt-get install -y wget git && \
apt-get install -y wget git build-essential && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
COPY ./sd_requirements.txt /install/
RUN pip install -r /install/sd_requirements.txt
COPY ./requirements.txt /install/
RUN pip install -r /install/requirements.txt
COPY ./ext_requirements.txt /install
RUN pip install -r /install/ext_requirements.txt
COPY ./ui_requirements.txt /install/
RUN pip install -r /install/ui_requirements.txt
# From base image. We need opencv-python-headless so we uninstall here
RUN pip uninstall -y opencv-python && pip install opencv-python-headless==4.6.0.66
# Install font for prompt matrix
COPY /data/DejaVuSans.ttf /usr/share/fonts/truetype/
ENV PYTHONPATH=/sd
COPY ./models /sd/models
COPY ./configs /sd/configs
COPY ./frontend /sd/frontend
COPY ./ldm /sd/ldm
# COPY ./gfpgan/ /sd/
COPY ./optimizedSD /sd/optimizedSD
COPY ./scripts /sd/scripts
EXPOSE 7860 8501
COPY ./entrypoint.sh /sd/


@ -8,7 +8,6 @@
- **[Windows](https://sd-webui.github.io/stable-diffusion-webui/docs/1.windows-installation.html)**
- **[Linux](https://sd-webui.github.io/stable-diffusion-webui/docs/2.linux-installation.html)**
### Want to ask a question or request a feature?
Come to our [Discord Server](https://discord.gg/gyXNe4NySY) or use [Discussions](https://github.com/sd-webui/stable-diffusion-webui/discussions).


@ -1,3 +1,18 @@
#!/bin/sh
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Functionally equivalent to docker compose build
docker build . -t stable-diffusion-webui:dev


@ -0,0 +1,21 @@
{
"architectures": [
"BertModel"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 30522,
"encoder_width": 768,
"add_cross_attention": true
}


@ -0,0 +1,33 @@
image_root: '/export/share/datasets/vision/coco/images/'
ann_root: 'annotation'
coco_gt_root: 'annotation/coco_gt'
# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
# size of vit model; base or large
vit: 'base'
vit_grad_ckpt: False
vit_ckpt_layer: 0
batch_size: 32
init_lr: 1e-5
# vit: 'large'
# vit_grad_ckpt: True
# vit_ckpt_layer: 5
# batch_size: 16
# init_lr: 2e-6
image_size: 384
# generation configs
max_length: 20
min_length: 5
num_beams: 3
prompt: 'a picture of '
# optimizer
weight_decay: 0.05
min_lr: 0
max_epoch: 5


@ -0,0 +1,21 @@
{
"architectures": [
"BertModel"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 30524,
"encoder_width": 768,
"add_cross_attention": true
}
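
The two BERT-style JSON configs above are plain Hugging Face BERT configurations (the 30524-token variant leaves room for the extra special tokens BLIP's text encoder uses) with cross-attention enabled so the text model can attend to image features. A minimal sketch of inspecting one with the transformers package; the path configs/blip/med_config.json is an assumption about where this repo keeps it:

```python
from transformers import BertConfig

# Load the BLIP text-encoder config shown above (path assumed; BLIP ships it
# as med_config.json). add_cross_attention=True is what allows the BertModel
# to attend to ViT image features during captioning.
config = BertConfig.from_json_file("configs/blip/med_config.json")
print(config.hidden_size, config.num_hidden_layers, config.vocab_size)
```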

configs/blip/nlvr.yaml Normal file (21 lines changed)

@ -0,0 +1,21 @@
image_root: '/export/share/datasets/vision/NLVR2/'
ann_root: 'annotation'
# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth'
#size of vit model; base or large
vit: 'base'
batch_size_train: 16
batch_size_test: 64
vit_grad_ckpt: False
vit_ckpt_layer: 0
max_epoch: 15
image_size: 384
# optimizer
weight_decay: 0.05
init_lr: 3e-5
min_lr: 0

configs/blip/nocaps.yaml Normal file (15 lines changed)

@ -0,0 +1,15 @@
image_root: '/export/share/datasets/vision/nocaps/'
ann_root: 'annotation'
# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
vit: 'base'
batch_size: 32
image_size: 384
max_length: 20
min_length: 5
num_beams: 3
prompt: 'a picture of '


@ -0,0 +1,27 @@
train_file: ['/export/share/junnan-li/VL_pretrain/annotation/coco_karpathy_train.json',
'/export/share/junnan-li/VL_pretrain/annotation/vg_caption.json',
]
laion_path: ''
# size of vit model; base or large
vit: 'base'
vit_grad_ckpt: False
vit_ckpt_layer: 0
image_size: 224
batch_size: 75
queue_size: 57600
alpha: 0.4
# optimizer
weight_decay: 0.05
init_lr: 3e-4
min_lr: 1e-6
warmup_lr: 1e-6
lr_decay_rate: 0.9
max_epoch: 20
warmup_steps: 3000


@ -0,0 +1,34 @@
image_root: '/export/share/datasets/vision/coco/images/'
ann_root: 'annotation'
dataset: 'coco'
# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth'
# size of vit model; base or large
vit: 'base'
batch_size_train: 32
batch_size_test: 64
vit_grad_ckpt: True
vit_ckpt_layer: 4
init_lr: 1e-5
# vit: 'large'
# batch_size_train: 16
# batch_size_test: 32
# vit_grad_ckpt: True
# vit_ckpt_layer: 12
# init_lr: 5e-6
image_size: 384
queue_size: 57600
alpha: 0.4
k_test: 256
negative_all_rank: True
# optimizer
weight_decay: 0.05
min_lr: 0
max_epoch: 6


@ -0,0 +1,34 @@
image_root: '/export/share/datasets/vision/flickr30k/'
ann_root: 'annotation'
dataset: 'flickr'
# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_flickr.pth'
# size of vit model; base or large
vit: 'base'
batch_size_train: 32
batch_size_test: 64
vit_grad_ckpt: True
vit_ckpt_layer: 4
init_lr: 1e-5
# vit: 'large'
# batch_size_train: 16
# batch_size_test: 32
# vit_grad_ckpt: True
# vit_ckpt_layer: 10
# init_lr: 5e-6
image_size: 384
queue_size: 57600
alpha: 0.4
k_test: 128
negative_all_rank: False
# optimizer
weight_decay: 0.05
min_lr: 0
max_epoch: 6


@ -0,0 +1,12 @@
video_root: '/export/share/dongxuli/data/msrvtt_retrieval/videos'
ann_root: 'annotation'
# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth'
# size of vit model; base or large
vit: 'base'
batch_size: 64
k_test: 128
image_size: 384
num_frm_test: 8

configs/blip/vqa.yaml Normal file (25 lines changed)

@ -0,0 +1,25 @@
vqa_root: '/export/share/datasets/vision/VQA/Images/mscoco/' #followed by train2014/
vg_root: '/export/share/datasets/vision/visual-genome/' #followed by image/
train_files: ['vqa_train','vqa_val','vg_qa']
ann_root: 'annotation'
# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth'
# size of vit model; base or large
vit: 'base'
batch_size_train: 16
batch_size_test: 32
vit_grad_ckpt: False
vit_ckpt_layer: 0
init_lr: 2e-5
image_size: 480
k_test: 128
inference: 'rank'
# optimizer
weight_decay: 0.05
min_lr: 0
max_epoch: 10


@ -1,3 +1,19 @@
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# UI defaults configuration file. Is read automatically if located at configs/webui/webui.yaml, or specify path via --defaults.
txt2img:


@ -1,8 +1,27 @@
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# UI defaults configuration file. It is automatically loaded if located at configs/webui/webui_streamlit.yaml.
# Any changes made here will be available automatically on the web app without having to stop it.
# You may add overrides in a file named "userconfig_streamlit.yaml" in this folder, which can contain any subset
# of the properties below.
general:
streamlit_telemetry: False
default_theme: dark
huggingface_token: ""
gpu: 0
outdir: outputs
default_model: "Stable Diffusion v1.4"
@ -11,15 +30,20 @@ general:
use_sd_concepts_library: True
sd_concepts_library_folder: "models/custom/sd-concepts-library"
GFPGAN_dir: "./src/gfpgan"
GFPGAN_model: "GFPGANv1.4"
LDSR_dir: "./models/ldsr"
LDSR_model: "model"
RealESRGAN_dir: "./src/realesrgan"
RealESRGAN_model: "RealESRGAN_x4plus"
LDSR_dir: "./src/latent-diffusion"
outdir_txt2img: outputs/txt2img-samples
outdir_img2img: outputs/img2img-samples
upscaling_method: "RealESRGAN"
outdir_txt2img: outputs/txt2img
outdir_img2img: outputs/img2img
gfpgan_cpu: False
esrgan_cpu: False
extra_models_cpu: False
extra_models_gpu: False
gfpgan_gpu: 0
esrgan_gpu: 0
save_metadata: True
save_format: "png"
skip_grid: False
@ -34,7 +58,7 @@ general:
optimized_turbo: False
optimized_config: "optimizedSD/v1-inference.yaml"
enable_attention_slicing: False
enable_minimal_memory_usage : False
enable_minimal_memory_usage: False
update_preview: True
update_preview_frequency: 10
@ -88,6 +112,7 @@ txt2img:
save_as_jpg: False
use_GFPGAN: False
use_RealESRGAN: False
use_LDSR: False
RealESRGAN_model: "RealESRGAN_x4plus"
variant_amount:
@ -101,7 +126,7 @@ txt2img:
txt2vid:
default_model: "CompVis/stable-diffusion-v1-4"
custom_models_list: ["CompVis/stable-diffusion-v1-4", "naclbit/trinart_stable_diffusion_v2", "hakurei/waifu-diffusion", "osanseviero/BigGAN-deep-128"]
custom_models_list: ["CompVis/stable-diffusion-v1-4", "hakurei/waifu-diffusion"]
prompt:
width:
value: 512
@ -271,5 +296,21 @@ img2img:
variant_seed: ""
write_info_files: True
img2txt:
batch_size: 420
blip_image_eval_size: 512
concepts_library:
concepts_per_page: 12
gfpgan:
strength: 100
textual_inversion:
pretrained_model_name_or_path: "models/ldm/stable-diffusion-v1-4"
tokenizer_name: ""
daisi_app:
running_on_daisi_io: False
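
A rough sketch of the override mechanism described at the top of webui_streamlit.yaml, using OmegaConf (already pinned in environment.yaml); the file locations come from the comments above, while the app's exact merge logic is an assumption:

```python
import os

from omegaconf import OmegaConf

# Load the shipped defaults, then overlay the user's partial overrides if present.
defaults = OmegaConf.load("configs/webui/webui_streamlit.yaml")
user_path = "configs/webui/userconfig_streamlit.yaml"
if os.path.exists(user_path):
    # Later configs win, so any subset of keys in the user file replaces the defaults.
    defaults = OmegaConf.merge(defaults, OmegaConf.load(user_path))

print(defaults.general.default_model)   # "Stable Diffusion v1.4" unless overridden
print(defaults.txt2img.use_GFPGAN)
```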

daisi_app.py Normal file (26 lines changed)

@ -0,0 +1,26 @@
import os
import subprocess

import yaml

print(os.getcwd())

# Load the conda environment spec, whether we are started from the repo root
# or from one level below it (as Daisi does).
environment_data = None
try:
    with open("environment.yaml") as file_handle:
        environment_data = yaml.load(file_handle, Loader=yaml.FullLoader)
except FileNotFoundError:
    try:
        with open(os.path.join("..", "environment.yaml")) as file_handle:
            environment_data = yaml.load(file_handle, Loader=yaml.FullLoader)
    except FileNotFoundError:
        pass

# Install the pinned conda dependencies through pip; entries without a pin
# (e.g. "git") and the nested pip: list are skipped.
if environment_data is not None:
    for dependency in environment_data["dependencies"]:
        if not isinstance(dependency, str) or "=" not in dependency:
            continue
        package_name, package_version = dependency.split("=", 1)
        os.system("pip install {}=={}".format(package_name, package_version))

# Launch the Streamlit UI, preferring the parent-relative path and falling
# back to the in-repo path.
script_path = os.path.join("..", "scripts/webui_streamlit.py")
if not os.path.exists(script_path):
    script_path = os.path.join("scripts", "webui_streamlit.py")
subprocess.run(["python", "-m", "streamlit", "run", script_path, "--theme.base", "dark"],
               stdout=subprocess.DEVNULL)

data/img2txt/artists.txt Normal file (10268 lines changed)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

data/img2txt/flavors.txt Normal file (397 lines changed)

@ -0,0 +1,397 @@
#film
#myportfolio
#pixelart
#screenshotsaturday
#vfxfriday
1920s
1970s
1990s
20 megapixels
2d
2d game art
32k uhd
35mm lens
3840x2160
3d
4k
8k
8k 3d
8k resolution
I can't believe how beautiful this is
academic art
acrylic art
adafruit
aesthetic
aftereffects
airbrush art
ambient occlusion
ambrotype
american propaganda
anaglyph effect
anaglyph filter
anamorphic lens flare
androgynous
angelic photograph
angular
anime
anime aesthetic
antichrist
apocalypse art
apocalypse landscape
art
art deco
art on instagram
artstation hd
artstation hq
artwork
associated press photo
atmospheric
award winning
award-winning
backlight
beautiful
behance hd
bioluminescence
biomorphic
black and white
black background
blueprint
bob ross
bokeh
booru
bryce 3d
calotype
chalk art
character
charcoal drawing
chiaroscuro
childs drawing
chillwave
chromatic
cinematic
cinematic lighting
cinematic view
circuitry
cityscape
clean
close up
cluttered
colorful
colorized
commission for
complementary colors
concept art
concert poster
congruent
constructivism
contest winner
contrasting
cosmic horror
creative commons attribution
creepypasta
criterion collection
cryengine
cubism
cyanotype
d&d
da vinci
dark
dark and mysterious
darksynth
datamosh
daz3d
dc comics
demonic photograph
depth of field
destructive
detailed
detailed painting
deviantart
deviantart hd
digital illustration
digital painting
digitally enhanced
diorama
dramatic
dramatic lighting
dslr
dslr camera
dutch golden age
dye-transfer
dynamic composition
dynamic pose
dystopian art
egyptian art
elegant
elite
enchanting
epic
ethereal
extremely gendered
fantasy
fauvism
feminine
film grain
filmic
fine art
fisheye lens
flat colors
flat shading
flemish baroque
flickering light
flickr
fractalism
freakshow
fresco
full body
full of details
furaffinity
future tech
futuristic
genderless
geometric
glitch art
glitchy
glitter
global illumination
glorious
glowing lights
glowing neon
god rays
golden ratio
goth
gothic
greeble
groovy
grotesque
hall of mirrors
handsome
hard surface modeling
hd
hd mod
hdr
hellish
hellish background
henry moore
high contrast
high definition
high detail
high detailed
high dynamic range
high quality
high quality photo
high resolution
holographic
horror film
hyper realism
hyper-realistic
hypnotic
ilford hp5
ilya kuvshinov
imax
impressionism
infrared
ink drawing
inspirational
instax
intricate
intricate patterns
iridescent
irridescent
iso 200
isometric
kinetic
kodak ektar
kodak gold 200
kodak portra
lighthearted
logo
lomo
long exposure
long lens
lovecraftian
lovely
low contrast
low poly
lowbrow
luminescence
macabre
macro lens
macro photography
made of all of the above
made of beads and yarn
made of cardboard
made of cheese
made of crystals
made of feathers
made of flowers
made of glass
made of insects
made of liquid metal
made of mist
made of paperclips
made of plastic
made of rubber
made of trash
made of vines
made of wire
made of wrought iron
majestic
marble sculpture
marvel comics
masculine
masterpiece
matte background
matte drawing
matte painting
matte photo
maximalist
messy
minimalist
minimalistic
mist
mixed media
movie poster
movie still
multiple exposure
muted
mystical
national geographic photo
neon
nightmare
nightscape
octane render
official art
oil on canvas
ominous
ominous vibe
ornate
orthogonal
outlined art
outrun
painterly
panorama
parallax
pencil sketch
phallic
photo
photo taken with ektachrome
photo taken with fujifilm superia
photo taken with nikon d750
photo taken with provia
photocollage
photocopy
photoillustration
photorealistic
physically based rendering
picasso
pixel perfect
pixiv
playstation 5 screenshot
polished
polycount
pop art
post processing
poster art
pre-raphaelite
prerendered graphics
pretty
provia
ps1 graphics
psychedelic
quantum wavetracing
ray tracing
realism
redshift
reimagined by industrial light and magic
renaissance painting
rendered in cinema4d
rendered in maya
rendered in unreal engine
repeating pattern
retrowave
rich color palette
rim light
rococo
rough
rtx
rtx on
sabattier effect
sabattier filter
sanctuary
sci-fi
seapunk
sense of awe
sensual
shallow depth of field
sharp focus
shiny
shiny eyes
shot on 70mm
sketchfab
skeuomorphic
smokey background
smooth
soft light
soft mist
soviet propaganda
speedpainting
stained glass
steampunk
stipple
stock photo
stockphoto
storybook illustration
strange
streetscape
studio light
studio lighting
studio photography
studio portrait
stylish
sunrays shine upon it
surrealist
symmetrical
synthwave
tarot card
tattoo
telephoto lens
terragen
tesseract
thx sound
tilt shift
tintype photograph
toonami
trance compilation cd
trypophobia
ue5
uhd image
ukiyo-e
ultra detailed
ultra hd
ultra realistic
ultrafine detail
unreal engine
unreal engine 5
vaporwave
velvia
vibrant colors
vivid colors
volumetric lighting
voxel art
vray
vray tracing
wallpaper
watercolor
wavy
whimsical
white background
wiccan
wide lens
wimmelbilder
windows vista
windows xp
woodcut
xbox 360 graphics
y2k aesthetic
zbrush

data/img2txt/mediums.txt Normal file (95 lines changed)

@ -0,0 +1,95 @@
a 3D render
a black and white photo
a bronze sculpture
a cartoon
a cave painting
a character portrait
a charcoal drawing
a child's drawing
a color pencil sketch
a colorized photo
a comic book panel
a computer rendering
a cross stitch
a cubist painting
a detailed drawing
a detailed matte painting
a detailed painting
a diagram
a digital painting
a digital rendering
a drawing
a fine art painting
a flemish Baroque
a gouache
a hologram
a hyperrealistic painting
a jigsaw puzzle
a low poly render
a macro photograph
a manga drawing
a marble sculpture
a matte painting
a microscopic photo
a mid-nineteenth century engraving
a minimalist painting
a mosaic
a painting
a pastel
a pencil sketch
a photo
a photocopy
a photorealistic painting
a picture
a pointillism painting
a polaroid photo
a pop art painting
a portrait
a poster
a raytraced image
a renaissance painting
a screenprint
a screenshot
a silk screen
a sketch
a statue
a still life
a stipple
a stock photo
a storybook illustration
a surrealist painting
a surrealist sculpture
a tattoo
a tilt shift photo
a watercolor painting
a wireframe diagram
a woodcut
an abstract drawing
an abstract painting
an abstract sculpture
an acrylic painting
an airbrush painting
an album cover
an ambient occlusion render
an anime drawing
an art deco painting
an art deco sculpture
an engraving
an etching
an illustration of
an impressionist painting
an ink drawing
an oil on canvas painting
an oil painting
an ultrafine detailed painting
chalk art
computer graphics
concept art
cyberpunk art
digital art
egyptian art
graffiti art
lineart
pixel art
poster art
vector art

data/img2txt/movements.txt Normal file (200 lines changed)

@ -0,0 +1,200 @@
abstract art
abstract expressionism
abstract illusionism
academic art
action painting
aestheticism
afrofuturism
altermodern
american barbizon school
american impressionism
american realism
american romanticism
american scene painting
analytical art
antipodeans
arabesque
arbeitsrat für kunst
art & language
art brut
art deco
art informel
art nouveau
art photography
arte povera
arts and crafts movement
ascii art
ashcan school
assemblage
australian tonalism
auto-destructive art
barbizon school
baroque
bauhaus
bengal school of art
berlin secession
black arts movement
brutalism
classical realism
cloisonnism
cobra
color field
computer art
conceptual art
concrete art
constructivism
context art
crayon art
crystal cubism
cubism
cubo-futurism
cynical realism
dada
danube school
dau-al-set
de stijl
deconstructivism
digital art
ecological art
environmental art
excessivism
expressionism
fantastic realism
fantasy art
fauvism
feminist art
figuration libre
figurative art
figurativism
fine art
fluxus
folk art
funk art
furry art
futurism
generative art
geometric abstract art
german romanticism
gothic art
graffiti
gutai group
happening
harlem renaissance
heidelberg school
holography
hudson river school
hurufiyya
hypermodernism
hyperrealism
impressionism
incoherents
institutional critique
interactive art
international gothic
international typographic style
kinetic art
kinetic pointillism
kitsch movement
land art
les automatistes
les nabis
letterism
light and space
lowbrow
lyco art
lyrical abstraction
magic realism
magical realism
mail art
mannerism
massurrealism
maximalism
metaphysical painting
mingei
minimalism
modern european ink painting
modernism
modular constructivism
naive art
naturalism
neo-dada
neo-expressionism
neo-fauvism
neo-figurative
neo-primitivism
neo-romanticism
neoclassicism
neogeo
neoism
neoplasticism
net art
new objectivity
new sculpture
northwest school
nuclear art
objective abstraction
op art
optical illusion
orphism
panfuturism
paris school
photorealism
pixel art
plasticien
plein air
pointillism
pop art
pop surrealism
post-impressionism
postminimalism
pre-raphaelitism
precisionism
primitivism
private press
process art
psychedelic art
purism
qajar art
quito school
rasquache
rayonism
realism
regionalism
remodernism
renaissance
retrofuturism
rococo
romanesque
romanticism
samikshavad
serial art
shin hanga
shock art
socialist realism
sots art
space art
street art
stuckism
sumatraism
superflat
suprematism
surrealism
symbolism
synchromism
synthetism
sōsaku hanga
tachisme
temporary art
tonalism
toyism
transgressive art
ukiyo-e
underground comix
unilalianism
vancouver school
vanitas
verdadism
video art
viennese actionism
visual art
vorticism

data/img2txt/sites.txt Normal file (18 lines changed)

@ -0,0 +1,18 @@
Artstation
behance
cg society
cgsociety
deviantart
dribble
flickr
instagram
pexels
pinterest
pixabay
pixiv
polycount
reddit
shutterstock
tumblr
unsplash
zbrush central
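
The mediums, artists, movements, flavors, and sites lists above are the candidate phrases the img2txt interrogator scores against a CLIP image embedding. A simplified sketch of that ranking step, assuming a CLIP-style `model` with `encode_text` and a matching `tokenize` function (as provided by the openai CLIP or open-clip-torch packages); how the real scripts batch and combine the categories is not shown here:

```python
import torch

@torch.no_grad()
def rank_terms(model, tokenize, image_features, terms, top_k=5):
    # Embed every candidate phrase and compare it with the (already normalized)
    # image embedding by cosine similarity; return the best-scoring terms.
    tokens = tokenize(terms).to(image_features.device)
    text_features = model.encode_text(tokens).float()
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = image_features @ text_features.T   # shape: (1, len(terms))
    scores, indices = similarity[0].topk(top_k)
    return [(terms[i], s.item()) for s, i in zip(scores, indices.tolist())]

# Usage sketch: load one of the lists above and rank it against an image embedding
# obtained from model.encode_image(preprocess(image).unsqueeze(0)), normalized the same way.
with open("data/img2txt/mediums.txt") as f:
    mediums = [line.strip() for line in f if line.strip()]
```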


@ -0,0 +1,40 @@
// blend it together and finish it with details
prompt: cute happy orange cat sitting at beach, beach in background, trending on artstation:1 cute happy cat:1
sampler_name:k_euler_a
ddim_steps: 35
denoising_strength: 0.55
variation: 3
initial_seed: 1
# put foreground onto background
size: 512, 512
color: 0,0,0
## create foreground
size:512,512
color:0,0,0,0
resize: 300, 300
pos: 256, 350
// select mask by probing some pixels from the image
mask_by_color_at: 15, 15, 15, 256, 85, 465, 100, 480
mask_by_color_threshold:80
mask_by_color_space: HLS
// some pixels inside the cat may be selected, remove them with mask_open
mask_open: 15
// there is still some background pixels left at the edge between cat and background
// grow the mask to get them as well
mask_grow: 15
// we want to remove whatever is masked:
mask_invert: True
####
prompt: cute happy orange cat, white background
ddim_steps: 25
variation: 1
## create background
prompt:beach landscape, beach with ocean in background, photographic, beautiful:1 red:-0.4
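
The mask_by_color_at / mask_open / mask_grow / mask_invert steps in the scene file above map naturally onto OpenCV operations. A rough sketch of that selection logic; parameter names mirror the scene keys, but the real parser's exact semantics (including the HLS conversion requested by mask_by_color_space) are assumptions:

```python
import cv2
import numpy as np

def mask_by_color(image_bgr, probe_points, threshold=80, open_px=15, grow=15, invert=True):
    # mask_by_color_space: HLS would convert first, e.g. cv2.cvtColor(image_bgr, cv2.COLOR_BGR2HLS).
    # mask_by_color_at / mask_by_color_threshold: select pixels close to any probed color.
    mask = np.zeros(image_bgr.shape[:2], dtype=np.uint8)
    for x, y in probe_points:
        color = image_bgr[y, x].astype(np.int16)
        dist = np.abs(image_bgr.astype(np.int16) - color).sum(axis=2)
        mask |= (dist <= threshold).astype(np.uint8) * 255
    # mask_open: drop small speckles selected inside the subject.
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, np.ones((open_px, open_px), np.uint8))
    # mask_grow: dilate to catch leftover edge pixels between subject and background.
    mask = cv2.dilate(mask, np.ones((grow, grow), np.uint8))
    # mask_invert: keep the subject rather than the probed background.
    return 255 - mask if invert else mask
```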


@ -0,0 +1,50 @@
initial_seed: 2
// select background and img2img over it
mask_by_color_at: 64, 64
mask_invert: True
prompt: corgi
ddim_steps: 50
seed: 242886303
mask_mode: 0
denoising_strength: 0.8
//cfg_scale: 15
mask_restore: True
image_editor_mode:Mask
# estimate depth and transform the corgi in 3d
transform3d: True
transform3d_depth_near: 0.5
transform3d_depth_scale: 10
transform3d_from_hfov: 45
transform3d_to_hfov: 45
transform3d_from_pose: 0,0,0, 0,0,0
transform3d_to_pose: 0.5,0,0, 0,-5,0
transform3d_min_mask: 0
transform3d_max_mask: 255
transform3d_inpaint_radius: 1
transform3d_inpaint_method: 0
## put foreground onto background
size: 512, 512
### create foreground
size: 512, 512
mask_depth: True
mask_depth_model: 1
mask_depth_min: -0.05
mask_depth_max: 0.5
mask_depth_invert:False
####
prompt: corgi
ddim_steps: 25
seed: 242886303
### background
size: 512,512
color: #9F978D


@ -0,0 +1,25 @@
// blend it together and finish it with some details
prompt: cute corgi at beach, trending on artstation
ddim_steps: 50
denoising_strength: 0.5
initial_seed: 2
# put foreground onto background
size: 512, 512
## create foreground
size: 512, 512
// estimate depth from image and select mask by depth
// https://huggingface.co/spaces/atsantiago/Monocular_Depth_Filter
mask_depth: True
mask_depth_min: -0.05
mask_depth_max: 0.4
mask_depth_invert:False
###
prompt: corgi
ddim_steps: 25
## create background
prompt:beach landscape, beach with ocean in background, photographic, beautiful:1 red:-0.4
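
The mask_depth keys used above select the foreground by thresholding a monocular depth estimate (see the Monocular_Depth_Filter link in the comments). A minimal sketch, assuming a depth map already scaled to the range the mask_depth_min / mask_depth_max values imply; how the real implementation obtains and normalizes depth is not shown:

```python
import numpy as np

def mask_by_depth(depth, depth_min=-0.05, depth_max=0.4, invert=False):
    # depth: HxW float array from a monocular depth model, scaled so the
    # foreground/background split falls inside [depth_min, depth_max].
    mask = ((depth >= depth_min) & (depth <= depth_max)).astype(np.uint8) * 255
    return 255 - mask if invert else mask
```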


@ -0,0 +1,34 @@
// give it some polish and details
size: 512, 512
prompt: cute corgi at beach, intricate details, photorealistic, trending on artstation
variation: 0
seed: 1360051694
initial_seed: 5
# blend it together
prompt: beautiful corgi:1.5 cute corgi at beach, trending on artstation:1 photorealistic:1.5
ddim_steps: 50
denoising_strength: 0.5
variation: 0
## put foreground in front of background
size: 512, 512
### select foreground
size: 512, 512
// estimate depth from image and select mask by depth
// https://huggingface.co/spaces/atsantiago/Monocular_Depth_Filter
mask_depth: True
mask_depth_min: -0.05
mask_depth_max: 0.4
mask_depth_invert:False
#### create foreground
prompt: corgi
ddim_steps: 25
seed: 242886303
### create background
prompt:beach landscape, beach with ocean in background, photographic, beautiful:1 red:-0.4
variation: 3


@ -0,0 +1,37 @@
size: 512,512
mask_blur: 6
prompt: fantasy landscape with castle and forest and waterfalls, trending on artstation
denoising_strength: 0.6
seed: 1
image_editor_mode: Mask
mask_mode: 0
mask_restore: True
# mask the left which contains artifacts
color: 255,255,255,0
blend:multiply
size: 100,512
pos: 50,256
# mask the top-left which contains lots of artifacts
color: 255,255,255,0
blend:multiply
size: 280,128
pos: 128,64
# go forward and turn head left to look at the left waterfalls
transform3d: True
transform3d_depth_scale: 10000
transform3d_from_hfov: 60
transform3d_to_hfov: 60
transform3d_from_pose: 0,0,0, 0,0,0
transform3d_to_pose: 4000,0,2000, 0,-50,0
transform3d_min_mask: 0
transform3d_max_mask: 255
transform3d_inpaint_radius: 5
transform3d_inpaint_method: 1
##
prompt: fantasy landscape with castle and forest and waterfalls, trending on artstation
seed: 1

docker-compose.amd.yml Normal file (8 lines changed)

@ -0,0 +1,8 @@
services:
  stable-diffusion:
    build:
      args:
        PYTORCH_IMAGE: rocm/pytorch:rocm5.2.3_ubuntu20.04_py3.7_pytorch_1.12.1
    devices:
      - /dev/dri
      - /dev/kfd


@ -0,0 +1,10 @@
# Nvidia specific config
version: '3.3'
services:
  stable-diffusion:
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [ gpu ]


@ -1,3 +1,19 @@
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
version: '3.3'
services:
@ -12,6 +28,7 @@ services:
- .:/sd
- ./outputs:/sd/outputs
- ./model_cache:/sd/model_cache
- ~/.huggingface/token:/root/.huggingface/token
- root_profile:/root
ports:
- '7860:7860'
@ -21,11 +38,6 @@ services:
interval: 30s
timeout: 1s
retries: 10
deploy:
resources:
reservations:
devices:
- capabilities: [ gpu ]
volumes:
root_profile:


@ -1,4 +1,19 @@
#!/bin/bash
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Use this script to reset your Docker-based Stable Diffusion environment
# This script will remove all cached files/models that are downloaded during your first startup


@ -19,7 +19,6 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
-->
# Initial Setup
> This is a Windows guide. [To install on Linux, see this page.](2.linux-installation.md)
@ -126,5 +125,4 @@ into the `/stable-diffusion-webui/src/gfpgan/experiments/pretrained_models` dire
# Credits
> Big thanks to Arkitecc#0339 from the Stable Diffusion discord for the original guide (support them [here](https://ko-fi.com/arkitecc)).
> Modified by [Hafiidz](https://github.com/Hafiidz) with help from the sd-webui Discord and team.


@ -43,6 +43,19 @@ Other Notes:
* "Optional" packages commonly used with Stable Diffusion WebUI workflows, such as RealESRGAN and GFPGAN, will be installed by default.
* An older version of running Stable Diffusion WebUI using Docker exists here: https://github.com/sd-webui/stable-diffusion-webui/discussions/922
### But what about AMD?
There is tentative support for AMD GPUs through Docker, which can be enabled via `docker-compose.amd.yml`,
although this is still in the early stages. Right now, this _only_ works on native Linux (not WSL2) due
to issues with AMD's support of GPU passthrough. You also _must_ have ROCm drivers installed on the host.
```
docker compose -f docker-compose.yml -f docker-compose.amd.yml ...
```
or by setting
```
export COMPOSE_FILE=docker-compose.yml:docker-compose.amd.yml
```
in your `.profile` or through a tool like `direnv`.
---


@ -0,0 +1,20 @@
<!--
This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
Copyright 2022 sd-webui team.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
-->
TBD


@ -1,4 +1,19 @@
#!/bin/bash
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Starts the webserver inside the docker container
#
@ -9,7 +24,21 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR
export PYTHONPATH=$SCRIPT_DIR
MODEL_DIR="${SCRIPT_DIR}/model_cache"
if [[ $PUBLIC_KEY ]]
then
mkdir -p ~/.ssh
chmod 700 ~/.ssh
cd ~/.ssh
echo $PUBLIC_KEY >> authorized_keys
chmod 700 -R ~/.ssh
cd /
service ssh start
echo "SSH Service Started"
fi
MODEL_DIR="${SCRIPT_DIR}/user_data/model_cache"
mkdir -p $MODEL_DIR
# Array of model files to pre-download
# local filename
# local path in container (no trailing slash)
@ -22,6 +51,17 @@ MODEL_FILES=(
'RealESRGAN_x4plus_anime_6B.pth src/realesrgan/experiments/pretrained_models https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth f872d837d3c90ed2e05227bed711af5671a6fd1c9f7d7e91c911a61f155e99da'
'project.yaml src/latent-diffusion/experiments/pretrained_models https://heibox.uni-heidelberg.de/f/31a76b13ea27482981b4/?dl=1 9d6ad53c5dafeb07200fb712db14b813b527edd262bc80ea136777bdb41be2ba'
'model.ckpt src/latent-diffusion/experiments/pretrained_models https://heibox.uni-heidelberg.de/f/578df07c8fc04ffbadf3/?dl=1 c209caecac2f97b4bb8f4d726b70ac2ac9b35904b7fc99801e1f5e61f9210c13'
'waifu-diffusion.ckpt models/custom https://huggingface.co/crumb/pruned-waifu-diffusion/resolve/main/model-pruned.ckpt 9b31355f90fea9933847175d4731a033f49f861395addc7e153f480551a24c25'
'trinart.ckpt models/custom https://huggingface.co/naclbit/trinart_stable_diffusion_v2/resolve/main/trinart2_step95000.ckpt c1799d22a355ba25c9ceeb6e3c91fc61788c8e274b73508ae8a15877c5dbcf63'
'model__base_caption.pth models/blip https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth 96ac8749bd0a568c274ebe302b3a3748ab9be614c737f3d8c529697139174086'
'pytorch_model.bin models/clip-vit-large-patch14 https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/pytorch_model.bin f1a17cdbe0f36fec524f5cafb1c261ea3bbbc13e346e0f74fc9eb0460dedd0d3'
'config.json models/clip-vit-large-patch14 https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/config.json 8a09b467700c58138c29d53c605b34ebc69beaadd13274a8a2af8ad2c2f4032a'
'merges.txt models/clip-vit-large-patch14 https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/merges.txt 9fd691f7c8039210e0fced15865466c65820d09b63988b0174bfe25de299051a'
'preprocessor_config.json models/clip-vit-large-patch14 https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/preprocessor_config.json 910e70b3956ac9879ebc90b22fb3bc8a75b6a0677814500101a4c072bd7857bd'
'special_tokens_map.json models/clip-vit-large-patch14 https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/special_tokens_map.json f8c0d6c39aee3f8431078ef6646567b0aba7f2246e9c54b8b99d55c22b707cbf'
'tokenizer.json models/clip-vit-large-patch14 https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/tokenizer.json a83e0809aa4c3af7208b2df632a7a69668c6d48775b3c3fe4e1b1199d1f8b8f4'
'tokenizer_config.json models/clip-vit-large-patch14 https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/tokenizer_config.json deef455e52fa5e8151e339add0582e4235f066009601360999d3a9cda83b1129'
'vocab.json models/clip-vit-large-patch14 https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/vocab.json 3f0c4f7d2086b61b38487075278ea9ed04edb53a03cbb045b86c27190fa8fb69'
)
@ -68,33 +108,30 @@ else
validateDownloadModel ${model[0]} ${model[1]} ${model[2]} ${model[3]}
fi
done
mkdir -p ${MODEL_DIR}/stable-diffusion-v1-4
mkdir -p ${MODEL_DIR}/waifu-diffusion
ln -fs ${SCRIPT_DIR}/models/clip-vit-large-patch14/ ${MODEL_DIR}/stable-diffusion-v1-4/tokenizer
ln -fs ${SCRIPT_DIR}/models/clip-vit-large-patch14/ ${MODEL_DIR}/waifu-diffusion/tokenizer
fi
# Determine which webserver interface to launch (Streamlit vs Default: Gradio)
if [[ ! -z $WEBUI_SCRIPT && $WEBUI_SCRIPT == "webui_streamlit.py" ]]; then
launch_command="streamlit run scripts/${WEBUI_SCRIPT:-webui.py} $WEBUI_ARGS"
if [[ -e "${MODEL_DIR}/sd-concepts-library" ]]; then
cd ${MODEL_DIR}/sd-concepts-library
git pull
else
launch_command="python scripts/${WEBUI_SCRIPT:-webui.py} $WEBUI_ARGS"
cd ${MODEL_DIR}
git clone https://github.com/sd-webui/sd-concepts-library
fi
mkdir -p ${SCRIPT_DIR}/models/custom
ln -fs ${MODEL_DIR}/sd-concepts-library/sd-concepts-library ${SCRIPT_DIR}/models/custom
# Start webserver interface
launch_message="Starting Stable Diffusion WebUI... ${launch_command}..."
if [[ -z $WEBUI_RELAUNCH || $WEBUI_RELAUNCH == "true" ]]; then
n=0
while true; do
echo $launch_message
echo "export HF_HOME=${MODEL_DIR}" >> ~/.bashrc
echo "export XDG_CACHE_HOME=${MODEL_DIR}" >> ~/.bashrc
echo "export TRANSFORMERS_CACHE=${MODEL_DIR}" >> ~/.bashrc
source ~/.bashrc
cd $SCRIPT_DIR
launch_command="streamlit run ${SCRIPT_DIR}/scripts/webui_streamlit.py"
if (( $n > 0 )); then
echo "Relaunch count: ${n}"
fi
$launch_command
$launch_command
echo "entrypoint.sh: Process is ending. Relaunching in 0.5s..."
((n++))
sleep 0.5
done
else
echo $launch_message
$launch_command
fi
sleep infinity
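
Each MODEL_FILES entry above is "filename  target-dir  url  sha256", and the validateDownloadModel call shown in the loop boils down to hash-checking the cached copy and downloading it again if needed. A Python sketch of that idea; the shell function's exact behavior (and the linking strategy) is an assumption, only the cache location user_data/model_cache comes from the script:

```python
import hashlib
import os
import urllib.request

def validate_download_model(filename, target_dir, url, sha256sum, model_dir="user_data/model_cache"):
    # Download into the shared cache when the file is missing or its hash does not
    # match, then expose it at the path the webui expects.
    os.makedirs(model_dir, exist_ok=True)
    cached = os.path.join(model_dir, filename)

    def digest(path):
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                h.update(chunk)
        return h.hexdigest()

    if not os.path.exists(cached) or digest(cached) != sha256sum:
        print(f"Downloading {filename} ...")
        urllib.request.urlretrieve(url, cached)
        assert digest(cached) == sha256sum, f"Checksum mismatch for {filename}"

    os.makedirs(target_dir, exist_ok=True)
    link = os.path.join(target_dir, filename)
    if not os.path.exists(link):
        os.symlink(os.path.abspath(cached), link)
```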


@ -1,7 +1,24 @@
name: ldm
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
channels:
- pytorch
- defaults
# Psst. If you change a dependency, make sure it's mirrored in the docker requirement
# files as well.
dependencies:
- cudatoolkit=11.3
- git
@ -11,39 +28,54 @@ dependencies:
- pytorch=1.11.0
- scikit-image=0.19.2
- torchvision=0.12.0
- loguru
- pip:
- -e .
- -e git+https://github.com/CompVis/taming-transformers#egg=taming-transformers
- -e git+https://github.com/openai/CLIP#egg=clip
- -e git+https://github.com/TencentARC/GFPGAN#egg=GFPGAN
- -e git+https://github.com/xinntao/Real-ESRGAN#egg=realesrgan
- -e git+https://github.com/hlky/k-diffusion-sd#egg=k_diffusion
- -e git+https://github.com/devilismyfriend/latent-diffusion#egg=latent-diffusion
- accelerate==0.12.0
- albumentations==0.4.3
- basicsr>=1.3.4.0
- diffusers==0.3.0
- einops==0.3.0
- einops==0.3.1
- facexlib>=0.2.3
- ftfy==6.1.1
- fairscale==0.4.4
- gradio==3.1.6
- hydralit==1.0.14
- hydralit_components==1.0.10
- imageio-ffmpeg==0.4.2
- imageio==2.9.0
- kornia==0.6
- omegaconf==2.1.1
- opencv-python-headless==4.6.0.66
- open-clip-torch==2.0.2
- pandas==1.4.3
- piexif==1.1.3
- pycocotools==2.0.5
- pycocoevalcap==1.2
- pudb==2019.2
- pynvml==11.4.1
- python-slugify>=6.1.2
- pytorch-lightning==1.4.2
- retry>=0.9.2
- streamlit==1.12.2
- regex
- realesrgan==0.3.0
- streamlit==1.13.0
- streamlit-on-Hover-tabs==1.0.1
- streamlit-option-menu==0.3.2
- streamlit_nested_layout
- streamlit-server-state==0.14.2
- streamlit-tensorboard==0.0.2
- test-tube>=0.7.5
- tensorboard
- tensorboard==2.10.1
- timm==0.6.7
- torch-fidelity==0.3.0
- torchmetrics==0.6.0
- transformers==4.19.2
- tensorflow==2.10.0
- tqdm==4.64.0


@ -1,15 +0,0 @@
# Optional packages commonly used with Stable Diffusion workflow
# Upscalers
basicsr==1.4.2 # required by RealESRGAN
gfpgan==1.3.8 # GFPGAN
realesrgan==0.2.8 # RealESRGAN brings in GFPGAN as a requirement
-e git+https://github.com/devilismyfriend/latent-diffusion#egg=latent-diffusion #ldsr
# Orphaned Packages: No usage found
#albumentations
#imageio-ffmpeg
#pudb
#test-tube
#torch-fidelity


@ -1,2 +0,0 @@
# fix for #386
* @altryne


@ -163,7 +163,7 @@ div[id*="111"]{
align-self: center !important;
}
/* Selected tabs color */
button, input, optgroup, select, textarea {color: #9c85fb;!important}
button, input, optgroup, select, textarea {color: #9c85fb!important}
/* -or- text color wtf */
.text-gray-300{


@ -1,3 +1,20 @@
/*
This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
Copyright 2022 sd-webui team.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
.wrap .m-12 svg { display:none!important; }
.wrap .m-12::before { content:"Loading..." }
.progress-bar { display:none!important; }


@ -1,15 +1,33 @@
/*
This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
Copyright 2022 sd-webui team.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/***********************************************************
* Additional CSS for streamlit builtin components *
************************************************************/
/* Tab name (e.g. Text-to-Image) */
/* Tab name (e.g. Text-to-Image) //improve legibility*/
button[data-baseweb="tab"] {
font-size: 25px; //improve legibility
font-size: 25px;
}
/* Image Container (only appear after run finished) */
/* Image Container (only appear after run finished)//center the image, especially better looks in wide screen */
.css-du1fp8 {
justify-content: center; //center the image, especially better looks in wide screen
justify-content: center;
}
/* Streamlit header */
@ -17,18 +35,12 @@ button[data-baseweb="tab"] {
background-color: transparent;
}
/* Main streamlit container (below header) */
/* Main streamlit container (below header) //reduce the empty spaces*/
.css-18e3th9 {
padding-top: 2rem; //reduce the empty spaces
padding-top: 1rem;
}
/* @media only for widescreen, to ensure enough space to see all */
@media (min-width: 1024px) {
/* Main streamlit container (below header) */
.css-18e3th9 {
padding-top: 0px; //reduce the empty spaces, can go fully to the top on widescreen devices
}
}
/***********************************************************
* Additional CSS for streamlit custom/3rd party components *
@ -55,7 +67,7 @@ button[kind="header"] {
/* display: none;*/ /*suggested behavior by streamlit hover components*/
pointer-events: auto; /* enable interaction with the button even if the parent's interaction is disabled */
}
/* added to keep the main sections (all elements to the right of the sidebar) from moving */
section[data-testid="stSidebar"] {
width: 3.5% !important;
@ -91,7 +103,7 @@ button[kind="header"] {
}
/***********************************************************
* Additional CSS for other elements
* Additional CSS for other elements
************************************************************/
button[data-baseweb="tab"] {
font-size: 20px;
@ -114,4 +126,21 @@ div.gallery:hover {
text-align: center;
position: relative;
top: 6px;
}
}
.row-widget.stButton {
text-align: center;
}
/********************************************************************
Hide anchor links on titles
*********************************************************************/
.css-15zrgzn {
display: none
}
.css-eczf16 {
display: none
}
.css-jn99sy {
display: none
}


@ -1,3 +1,20 @@
/*
This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
Copyright 2022 sd-webui team.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
[data-testid="image"] {min-height: 512px !important}
* #body>.col:nth-child(2){
width:250%;
@ -36,14 +53,14 @@ input[type=number]:disabled { -moz-appearance: textfield; }
height: auto;
}
#highlight{
[id$="highlight"]{
font-size: 1.2rem
}
#highlight .uppercase{
[id$="highlight"] .uppercase{
text-transform: initial;
}
#highlight .textfield .textspan:nth-child(1){
[id$="highlight"] .textfield .textspan:nth-child(1){
font-size: 1.2rem
}
@ -70,4 +87,4 @@ input[type=number]:disabled { -moz-appearance: textfield; }
}
/* fix buttons layouts */
}
}


@ -1,3 +1,18 @@
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from os import path
import json
@ -12,7 +27,6 @@ def readTextFile(*args):
def css(opt):
styling = readTextFile("css", "styles.css")
# TODO: @altryne restore this before merge
if not opt.no_progressbar_hiding:
styling += readTextFile("css", "no_progress_bar.css")
return styling
@ -24,62 +38,6 @@ def js(opt):
return data
# TODO : @altryne fix this to the new JS format
js_copy_txt2img_output = "(x) => {navigator.clipboard.writeText(document.querySelector('gradio-app').shadowRoot.querySelector('#highlight .textfield').textContent.replace(/\s+/g,' ').replace(/: /g,':'))}"
js_parse_prompt ="""
(txt2img_prompt, txt2img_width, txt2img_height, txt2img_steps, txt2img_seed, txt2img_batch_count, txt2img_cfg) => {
const prompt_input = document.querySelector('gradio-app').shadowRoot.querySelector('#prompt_input [data-testid="textbox"]');
const multiline = document.querySelector('gradio-app').shadowRoot.querySelector('#submit_on_enter label:nth-child(2)')
if (prompt_input.scrollWidth > prompt_input.clientWidth + 10 ) {
multiline.click();
}
let height_match = /(?:-h|-H|--height|height)[ :]?(?<height>\d+) /.exec(txt2img_prompt);
if (height_match) {
txt2img_height = Math.round(height_match.groups.height / 64) * 64;
txt2img_prompt = txt2img_prompt.replace(height_match[0], '');
}
let width_match = /(?:-w|-W|--width|width)[ :]?(?<width>\d+) /.exec(txt2img_prompt);
if (width_match) {
txt2img_width = Math.round(width_match.groups.width / 64) * 64;
txt2img_prompt = txt2img_prompt.replace(width_match[0], '');
}
let steps_match = /(?:-s|--steps|steps)[ :]?(?<steps>\d+) /.exec(txt2img_prompt);
if (steps_match) {
txt2img_steps = steps_match.groups.steps.trim();
txt2img_prompt = txt2img_prompt.replace(steps_match[0], '');
}
let seed_match = /(?:-S|--seed|seed)[ :]?(?<seed>\d+) /.exec(txt2img_prompt);
if (seed_match) {
txt2img_seed = seed_match.groups.seed;
txt2img_prompt = txt2img_prompt.replace(seed_match[0], '');
}
let batch_count_match = /(?:-n|-N|--number|number)[ :]?(?<batch_count>\d+) /.exec(txt2img_prompt);
if (batch_count_match) {
txt2img_batch_count = batch_count_match.groups.batch_count;
txt2img_prompt = txt2img_prompt.replace(batch_count_match[0], '');
}
let cfg_scale_match = /(?:-c|-C|--cfg-scale|cfg_scale|cfg)[ :]?(?<cfgscale>\d\.?\d+?) /.exec(txt2img_prompt);
if (cfg_scale_match) {
txt2img_cfg = parseFloat(cfg_scale_match.groups.cfgscale).toFixed(1);
txt2img_prompt = txt2img_prompt.replace(cfg_scale_match[0], '');
}
let sampler_match = /(?:-A|--sampler|sampler)[ :]?(?<sampler>\w+) /.exec(txt2img_prompt);
if (sampler_match) {
txt2img_prompt = txt2img_prompt.replace(sampler_match[0], '');
}
return [txt2img_prompt, parseInt(txt2img_width), parseInt(txt2img_height), parseInt(txt2img_steps), txt2img_seed, parseInt(txt2img_batch_count), parseFloat(txt2img_cfg)];
}
"""
# Wrap the typical SD method call into async closure for ease of use
# Supplies the js function with a params object
# That includes all the passed arguments and input from Gradio: x

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.


@ -0,0 +1,54 @@
<?xml version="1.0" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" >
<svg xmlns="http://www.w3.org/2000/svg">
<metadata>
<json>
<![CDATA[
{
"fontFamily": "lg",
"majorVersion": 2,
"minorVersion": 0,
"fontURL": "",
"copyright": "",
"license": "",
"licenseURL": "",
"description": "Font generated by IcoMoon.",
"version": "Version 2.0",
"fontId": "lg",
"psName": "lg",
"subFamily": "Regular",
"fullName": "lg"
}
]]>
</json>
</metadata>
<defs>
<font id="lg" horiz-adv-x="1024">
<font-face units-per-em="1024" ascent="960" descent="-64" />
<missing-glyph horiz-adv-x="1024" />
<glyph unicode="&#x20;" horiz-adv-x="512" d="" />
<glyph unicode="&#xe01a;" glyph-name="pause_circle_outline" data-tags="pause_circle_outline" d="M554 256.667v340h86v-340h-86zM512 84.667q140 0 241 101t101 241-101 241-241 101-241-101-101-241 101-241 241-101zM512 852.667q176 0 301-125t125-301-125-301-301-125-301 125-125 301 125 301 301 125zM384 256.667v340h86v-340h-86z" />
<glyph unicode="&#xe01d;" glyph-name="play_circle_outline" data-tags="play_circle_outline" d="M512 84.667q140 0 241 101t101 241-101 241-241 101-241-101-101-241 101-241 241-101zM512 852.667q176 0 301-125t125-301-125-301-301-125-301 125-125 301 125 301 301 125zM426 234.667v384l256-192z" />
<glyph unicode="&#xe033;" glyph-name="stack-2" data-tags="stack-2" d="M384 853.334h426.667q53 0 90.5-37.5t37.5-90.5v-426.667q0-53-37.5-90.5t-90.5-37.5h-426.667q-53 0-90.5 37.5t-37.5 90.5v426.667q0 53 37.5 90.5t90.5 37.5zM170.667 675.334v-547.333q0-17.667 12.5-30.167t30.167-12.5h547.333q-13.333-37.667-46.333-61.5t-74.333-23.833h-426.667q-53 0-90.5 37.5t-37.5 90.5v426.667q0 41.333 23.833 74.333t61.5 46.333zM810.667 768h-426.667q-17.667 0-30.167-12.5t-12.5-30.167v-426.667q0-17.667 12.5-30.167t30.167-12.5h426.667q17.667 0 30.167 12.5t12.5 30.167v426.667q0 17.667-12.5 30.167t-30.167 12.5z" />
<glyph unicode="&#xe070;" glyph-name="clear" data-tags="clear" d="M810 664.667l-238-238 238-238-60-60-238 238-238-238-60 60 238 238-238 238 60 60 238-238 238 238z" />
<glyph unicode="&#xe094;" glyph-name="arrow-left" data-tags="arrow-left" d="M426.667 768q17.667 0 30.167-12.5t12.5-30.167q0-18-12.667-30.333l-225.667-225.667h665q17.667 0 30.167-12.5t12.5-30.167-12.5-30.167-30.167-12.5h-665l225.667-225.667q12.667-12.333 12.667-30.333 0-17.667-12.5-30.167t-30.167-12.5q-18 0-30.333 12.333l-298.667 298.667q-12.333 13-12.333 30.333t12.333 30.333l298.667 298.667q12.667 12.333 30.333 12.333z" />
<glyph unicode="&#xe095;" glyph-name="arrow-right" data-tags="arrow-right" d="M597.333 768q18 0 30.333-12.333l298.667-298.667q12.333-12.333 12.333-30.333t-12.333-30.333l-298.667-298.667q-12.333-12.333-30.333-12.333-18.333 0-30.5 12.167t-12.167 30.5q0 18 12.333 30.333l226 225.667h-665q-17.667 0-30.167 12.5t-12.5 30.167 12.5 30.167 30.167 12.5h665l-226 225.667q-12.333 12.333-12.333 30.333 0 18.333 12.167 30.5t30.5 12.167z" />
<glyph unicode="&#xe0f2;" glyph-name="vertical_align_bottom" data-tags="vertical_align_bottom" d="M170 128.667h684v-86h-684v86zM682 384.667l-170-172-170 172h128v426h84v-426h128z" />
<glyph unicode="&#xe1ff;" glyph-name="apps" data-tags="apps" d="M682 84.667v172h172v-172h-172zM682 340.667v172h172v-172h-172zM426 596.667v172h172v-172h-172zM682 768.667h172v-172h-172v172zM426 340.667v172h172v-172h-172zM170 340.667v172h172v-172h-172zM170 84.667v172h172v-172h-172zM426 84.667v172h172v-172h-172zM170 596.667v172h172v-172h-172z" />
<glyph unicode="&#xe20c;" glyph-name="fullscreen" data-tags="fullscreen" d="M598 724.667h212v-212h-84v128h-128v84zM726 212.667v128h84v-212h-212v84h128zM214 512.667v212h212v-84h-128v-128h-84zM298 340.667v-128h128v-84h-212v212h84z" />
<glyph unicode="&#xe20d;" glyph-name="fullscreen_exit" data-tags="fullscreen_exit" d="M682 596.667h128v-84h-212v212h84v-128zM598 128.667v212h212v-84h-128v-128h-84zM342 596.667v128h84v-212h-212v84h128zM214 256.667v84h212v-212h-84v128h-128z" />
<glyph unicode="&#xe311;" glyph-name="zoom_in" data-tags="zoom_in" d="M512 512.667h-86v-86h-42v86h-86v42h86v86h42v-86h86v-42zM406 340.667q80 0 136 56t56 136-56 136-136 56-136-56-56-136 56-136 136-56zM662 340.667l212-212-64-64-212 212v34l-12 12q-76-66-180-66-116 0-197 80t-81 196 81 197 197 81 196-81 80-197q0-104-66-180l12-12h34z" />
<glyph unicode="&#xe312;" glyph-name="zoom_out" data-tags="zoom_out" d="M298 554.667h214v-42h-214v42zM406 340.667q80 0 136 56t56 136-56 136-136 56-136-56-56-136 56-136 136-56zM662 340.667l212-212-64-64-212 212v34l-12 12q-76-66-180-66-116 0-197 80t-81 196 81 197 197 81 196-81 80-197q0-104-66-180l12-12h34z" />
<glyph unicode="&#xe80d;" glyph-name="share" data-tags="share" d="M768 252.667c68 0 124-56 124-124s-56-126-124-126-124 58-124 126c0 10 0 20 2 28l-302 176c-24-22-54-34-88-34-70 0-128 58-128 128s58 128 128 128c34 0 64-12 88-34l300 174c-2 10-4 20-4 30 0 70 58 128 128 128s128-58 128-128-58-128-128-128c-34 0-64 14-88 36l-300-176c2-10 4-20 4-30s-2-20-4-30l304-176c22 20 52 32 84 32z" />
<glyph unicode="&#xe900;" glyph-name="rotate_left" data-tags="rotate_left" d="M554 764.667q126-16 213-112t87-226-87-226-213-112v86q92 16 153 87t61 165-61 165-153 87v-166l-194 190 194 194v-132zM302 156.667l62 62q46-34 106-44v-86q-96 12-168 68zM260 384.667q10-58 42-106l-60-60q-56 74-68 166h86zM304 574.667q-36-52-44-106h-86q12 90 70 166z" />
<glyph unicode="&#xe901;" glyph-name="rotate_right" data-tags="rotate_right" d="M720 278.667q34 46 44 106h86q-12-92-68-166zM554 174.667q60 10 106 44l62-62q-72-56-168-68v86zM850 468.667h-86q-10 60-44 106l62 60q58-72 68-166zM664 702.667l-194-190v166q-92-16-153-87t-61-165 61-165 153-87v-86q-126 16-213 112t-87 226 87 226 213 112v132z" />
<glyph unicode="&#xe902;" glyph-name="swap_horiz" data-tags="swap_horiz" d="M896 554.667l-170-170v128h-300v84h300v128zM298 468.667v-128h300v-84h-300v-128l-170 170z" />
<glyph unicode="&#xe903;" glyph-name="swap_vert" data-tags="swap_vert" d="M384 810.667l170-170h-128v-300h-84v300h-128zM682 212.667h128l-170-170-170 170h128v300h84v-300z" />
<glyph unicode="&#xe904;" glyph-name="facebook-with-circle" data-tags="facebook-with-circle" d="M512 952.32c-271.462 0-491.52-220.058-491.52-491.52s220.058-491.52 491.52-491.52 491.52 220.058 491.52 491.52-220.058 491.52-491.52 491.52zM628.429 612.659h-73.882c-8.755 0-18.483-11.52-18.483-26.829v-53.35h92.416l-13.978-76.083h-78.438v-228.403h-87.194v228.403h-79.104v76.083h79.104v44.749c0 64.205 44.544 116.378 105.677 116.378h73.882v-80.947z" />
<glyph unicode="&#xe905;" glyph-name="google-with-circle" data-tags="google+-with-circle" d="M512 952.32c-271.462 0-491.52-220.058-491.52-491.52s220.058-491.52 491.52-491.52 491.52 220.058 491.52 491.52-220.058 491.52-491.52 491.52zM483.686 249.805c-30.874-15.002-64.102-16.589-76.954-16.589-2.458 0-3.84 0-3.84 0s-1.178 0-2.765 0c-20.070 0-119.962 4.608-119.962 95.59 0 89.395 108.8 96.41 142.131 96.41h0.87c-19.251 25.702-15.258 51.61-15.258 51.61-1.69-0.102-4.147-0.205-7.168-0.205-12.544 0-36.762 1.997-57.549 15.411-25.498 16.384-38.4 44.288-38.4 82.893 0 109.107 119.142 113.51 120.32 113.613h118.989v-2.611c0-13.312-23.91-15.923-40.192-18.125-5.53-0.819-16.64-1.894-19.763-3.482 30.157-16.128 35.021-41.421 35.021-79.104 0-42.906-16.794-65.587-34.611-81.51-11.059-9.882-19.712-17.613-19.712-28.006 0-10.189 11.878-20.582 25.702-32.717 22.579-19.917 53.555-47.002 53.555-92.723 0-47.258-20.326-81.050-60.416-100.454zM742.4 460.8h-76.8v-76.8h-51.2v76.8h-76.8v51.2h76.8v76.8h51.2v-76.8h76.8v-51.2zM421.018 401.92c-2.662 0-5.325-0.102-8.038-0.307-22.733-1.69-43.725-10.189-58.88-24.013-15.053-13.619-22.733-30.822-21.658-48.179 2.304-36.403 41.37-57.702 88.832-54.323 46.694 3.379 77.824 30.31 75.571 66.714-2.15 34.202-31.898 60.109-75.827 60.109zM465.766 599.808c-12.39 43.52-32.358 56.422-63.386 56.422-3.328 0-6.707-0.512-9.933-1.382-13.466-3.84-24.166-15.053-30.106-31.744-6.093-16.896-6.451-34.509-1.229-54.579 9.472-35.891 34.97-61.901 60.672-61.901 3.379 0 6.758 0.41 9.933 1.382 28.109 7.885 45.722 50.79 34.048 91.802z" />
<glyph unicode="&#xe906;" glyph-name="pinterest-with-circle" data-tags="pinterest-with-circle" d="M512 952.32c-271.462 0-491.52-220.058-491.52-491.52s220.058-491.52 491.52-491.52 491.52 220.058 491.52 491.52-220.058 491.52-491.52 491.52zM545.638 344.32c-31.539 2.406-44.749 18.022-69.427 32.973-13.568-71.219-30.157-139.52-79.309-175.206-15.206 107.725 22.221 188.518 39.629 274.381-29.645 49.92 3.533 150.323 66.099 125.645 76.954-30.515-66.662-185.6 29.747-205.005 100.659-20.173 141.773 174.694 79.36 237.978-90.214 91.494-262.502 2.099-241.306-128.87 5.12-32 38.246-41.728 13.21-85.914-57.702 12.8-74.957 58.317-72.704 118.989 3.533 99.328 89.242 168.909 175.155 178.483 108.698 12.083 210.688-39.885 224.819-142.182 15.821-115.405-49.101-240.282-165.274-231.27z" />
<glyph unicode="&#xe907;" glyph-name="twitter-with-circle" data-tags="twitter-with-circle" d="M512 952.32c-271.462 0-491.52-220.058-491.52-491.52s220.058-491.52 491.52-491.52 491.52 220.058 491.52 491.52-220.058 491.52-491.52 491.52zM711.936 549.683c0.205-4.198 0.256-8.397 0.256-12.493 0-128-97.331-275.507-275.405-275.507-54.682 0-105.574 15.974-148.378 43.52 7.526-0.922 15.258-1.28 23.091-1.28 45.363 0 87.091 15.411 120.218 41.421-42.342 0.819-78.080 28.774-90.419 67.174 5.888-1.075 11.93-1.69 18.176-1.69 8.806 0 17.408 1.178 25.498 3.379-44.288 8.909-77.67 48.026-77.67 94.925v1.178c13.056-7.219 28.006-11.622 43.878-12.134-26.010 17.408-43.059 47.002-43.059 80.64 0 17.715 4.762 34.406 13.107 48.691 47.77-58.573 119.040-97.075 199.526-101.222-1.69 7.117-2.509 14.49-2.509 22.118 0 53.402 43.315 96.819 96.819 96.819 27.802 0 52.992-11.776 70.656-30.618 22.067 4.403 42.752 12.39 61.44 23.501-7.219-22.579-22.528-41.574-42.547-53.606 19.61 2.406 38.246 7.578 55.603 15.309-12.954-19.405-29.389-36.506-48.282-50.125z" />
<glyph unicode="&#xe908;" glyph-name="message-circle" data-tags="message-circle" d="M938.667 448.128v21.205c0 0.725-0.043 1.621-0.085 2.475-5.803 99.755-47.488 190.336-112.725 258.176-68.352 71.125-162.731 117.419-268.843 123.264-0.683 0.043-1.536 0.085-2.347 0.085h-20.864c-59.947 0.683-122.965-13.227-181.931-43.008-52.181-26.496-97.749-63.488-133.931-108.16-56.405-69.717-89.899-158.080-89.941-253.696-0.597-54.4 10.795-111.36 35.157-165.419l-75.605-226.859c-2.816-8.363-3.072-17.835 0-26.965 7.467-22.357 31.616-34.432 53.973-26.965l226.731 75.563c49.493-22.485 105.984-35.243 165.376-35.115 58.539 0.384 115.797 13.141 168.149 36.949 81.579 37.163 151.040 101.248 193.749 186.667 27.477 53.291 43.307 115.84 43.136 181.803zM853.333 447.872c0.128-52.267-12.459-101.333-33.664-142.464-34.176-68.352-88.832-118.827-153.259-148.139-41.387-18.859-86.827-28.971-133.376-29.269-52.096-0.128-101.163 12.459-142.293 33.664-10.624 5.504-22.528 6.059-33.067 2.56l-162.261-54.101 54.101 162.261c3.755 11.221 2.56 22.912-2.389 32.725-23.552 46.677-34.304 96.171-33.792 142.421 0.043 76.331 26.411 145.92 70.955 200.917 28.629 35.371 64.768 64.725 106.24 85.76 46.592 23.552 96.085 34.304 142.336 33.792h19.456c83.712-4.565 158.037-41.003 212.011-97.152 51.285-53.376 84.139-124.416 89.003-202.795z" />
<glyph unicode="&#xe909;" glyph-name="maximize-2" data-tags="maximize-2" d="M793.003 768l-225.835-225.835c-16.683-16.683-16.683-43.691 0-60.331s43.691-16.683 60.331 0l225.835 225.835v-153.003c0-23.552 19.115-42.667 42.667-42.667s42.667 19.115 42.667 42.667v256c0 5.803-1.152 11.307-3.243 16.341s-5.163 9.728-9.216 13.781c-0.043 0.043-0.043 0.043-0.085 0.085-3.925 3.925-8.619 7.083-13.781 9.216-5.035 2.091-10.539 3.243-16.341 3.243h-256c-23.552 0-42.667-19.115-42.667-42.667s19.115-42.667 42.667-42.667zM230.997 85.334l225.835 225.835c16.683 16.683 16.683 43.691 0 60.331s-43.691 16.683-60.331 0l-225.835-225.835v153.003c0 23.552-19.115 42.667-42.667 42.667s-42.667-19.115-42.667-42.667v-256c0-23.552 19.115-42.667 42.667-42.667h256c23.552 0 42.667 19.115 42.667 42.667s-19.115 42.667-42.667 42.667z" />
<glyph unicode="&#xe90a;" glyph-name="minimize-2" data-tags="minimize-2" d="M700.331 554.667l225.835 225.835c16.683 16.683 16.683 43.691 0 60.331s-43.691 16.683-60.331 0l-225.835-225.835v153.003c0 23.552-19.115 42.667-42.667 42.667s-42.667-19.115-42.667-42.667v-256c0-5.803 1.152-11.307 3.243-16.341s5.163-9.728 9.216-13.781c0.043-0.043 0.043-0.043 0.085-0.085 3.925-3.925 8.619-7.083 13.781-9.216 5.035-2.091 10.539-3.243 16.341-3.243h256c23.552 0 42.667 19.115 42.667 42.667s-19.115 42.667-42.667 42.667zM158.165 12.502l225.835 225.835v-153.003c0-23.552 19.115-42.667 42.667-42.667s42.667 19.115 42.667 42.667v256c0 5.803-1.152 11.307-3.243 16.341s-5.163 9.728-9.216 13.781c-0.043 0.043-0.043 0.043-0.085 0.085-4.096 4.053-8.789 7.125-13.781 9.216-5.035 2.091-10.539 3.243-16.341 3.243h-256c-23.552 0-42.667-19.115-42.667-42.667s19.115-42.667 42.667-42.667h153.003l-225.835-225.835c-16.683-16.683-16.683-43.691 0-60.331s43.691-16.683 60.331 0z" />
</font></defs></svg>

After  |  Size: 12 KiB

Further binary files not shown (one of 4.1 KiB).

View File

@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Component Template</title>
<script type="module" crossorigin src="assets/index.aeaed602.js"></script>
<link rel="stylesheet" href="assets/index.4194368f.css">
</head>
<body>
<div id="app"></div>
</body>
</html>

View File

@ -1,18 +1,34 @@
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import gradio as gr
from frontend.css_and_js import css, js, call_JS, js_parse_prompt, js_copy_txt2img_output
from frontend.css_and_js import css, js, call_JS
from frontend.job_manager import JobManager
import frontend.ui_functions as uifn
import uuid
import torch
import os
def draw_gradio_ui(opt, img2img=lambda x: x, txt2img=lambda x: x, imgproc=lambda x: x, txt2img_defaults={},
RealESRGAN=True, GFPGAN=True, LDSR=True,
def draw_gradio_ui(opt, img2img=lambda x: x, txt2img=lambda x: x, imgproc=lambda x: x, scn2img=lambda x: x,
txt2img_defaults={}, RealESRGAN=True, GFPGAN=True, LDSR=True,
txt2img_toggles={}, txt2img_toggle_defaults='k_euler', show_embeddings=False, img2img_defaults={},
img2img_toggles={}, img2img_toggle_defaults={}, sample_img2img=None, img2img_mask_modes=None,
img2img_resize_modes=None, imgproc_defaults={}, imgproc_mode_toggles={}, user_defaults={},
run_GFPGAN=lambda x: x, run_RealESRGAN=lambda x: x,
img2img_resize_modes=None, imgproc_defaults={}, imgproc_mode_toggles={},
scn2img_defaults={}, scn2img_toggles={}, scn2img_toggle_defaults={}, scn2img_define_args=lambda: ({},{},{}),
user_defaults={}, run_GFPGAN=lambda x: x, run_RealESRGAN=lambda x: x,
job_manager: JobManager = None) -> gr.Blocks:
with gr.Blocks(css=css(opt), analytics_enabled=False, title="Stable Diffusion WebUI") as demo:
with gr.Tabs(elem_id='tabss') as tabs:
@ -69,17 +85,26 @@ def draw_gradio_ui(opt, img2img=lambda x: x, txt2img=lambda x: x, imgproc=lambda
output_txt2img_to_imglab = gr.Button("Send to Lab", visible=True)
output_txt2img_params = gr.Highlightedtext(label="Generation parameters", interactive=False,
elem_id='highlight')
elem_id='txt2img_highlight')
with gr.Group():
with gr.Row(elem_id='txt2img_output_row'):
output_txt2img_copy_params = gr.Button("Copy full parameters").click(
inputs=[output_txt2img_params], outputs=[],
_js=js_copy_txt2img_output,
fn=None, show_progress=False)
inputs=[output_txt2img_params],
outputs=[],
_js=call_JS(
'copyFullOutput',
fromId='txt2img_highlight'
),
fn=None, show_progress=False
)
output_txt2img_seed = gr.Number(label='Seed', interactive=False, visible=False)
output_txt2img_copy_seed = gr.Button("Copy only seed").click(
inputs=[output_txt2img_seed], outputs=[],
_js='(x) => navigator.clipboard.writeText(x)', fn=None, show_progress=False)
inputs=[output_txt2img_seed],
outputs=[],
_js=call_JS('gradioInputToClipboard'),
fn=None,
show_progress=False
)
output_txt2img_stats = gr.HTML(label='Stats')
with gr.Column():
txt2img_steps = gr.Slider(minimum=1, maximum=250, step=1, label="Sampling Steps",
@ -103,7 +128,7 @@ def draw_gradio_ui(opt, img2img=lambda x: x, txt2img=lambda x: x, imgproc=lambda
txt2img_realesrgan_model_name = gr.Dropdown(label='RealESRGAN model',
choices=['RealESRGAN_x4plus',
'RealESRGAN_x4plus_anime_6B'],
value='RealESRGAN_x4plus',
value=txt2img_defaults['realesrgan_model_name'],
visible=False) # RealESRGAN is not None # invisible until removed) # TODO: Feels like I shouldn't slot it in here.
txt2img_ddim_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="DDIM ETA",
@ -219,7 +244,7 @@ def draw_gradio_ui(opt, img2img=lambda x: x, txt2img=lambda x: x, imgproc=lambda
img2img_mask_blur_strength = gr.Slider(minimum=1, maximum=100, step=1,
label="How much blurry should the mask be? (to avoid hard edges)",
value=3, visible=True)
value=img2img_defaults['mask_blur_strength'], visible=True)
img2img_resize = gr.Radio(label="Resize mode",
choices=["Just resize", "Crop and resize",
@ -247,16 +272,27 @@ def draw_gradio_ui(opt, img2img=lambda x: x, txt2img=lambda x: x, imgproc=lambda
gr.Markdown("Warning: This will clear your current image and mask settings!")
with gr.TabItem("Output info", id="img2img_output_info_tab"):
output_img2img_params = gr.Textbox(label="Generation parameters")
output_img2img_params = gr.Highlightedtext(
label="Generation parameters", interactive=False,
elem_id='img2img_highlight')
with gr.Row():
output_img2img_copy_params = gr.Button("Copy full parameters").click(
inputs=output_img2img_params, outputs=[],
_js='(x) => {navigator.clipboard.writeText(x.replace(": ",":"))}', fn=None,
inputs=output_img2img_params,
outputs=[],
_js=call_JS(
'copyFullOutput',
fromId='img2img_highlight'
),
fn=None,
show_progress=False)
output_img2img_seed = gr.Number(label='Seed', interactive=False, visible=False)
output_img2img_copy_seed = gr.Button("Copy only seed").click(
inputs=output_img2img_seed, outputs=[],
_js=call_JS("gradioInputToClipboard"), fn=None, show_progress=False)
inputs=output_img2img_seed,
outputs=[],
_js=call_JS("gradioInputToClipboard"),
fn=None,
show_progress=False
)
output_img2img_stats = gr.HTML(label='Stats')
gr.Markdown('# img2img settings')
@ -295,7 +331,7 @@ def draw_gradio_ui(opt, img2img=lambda x: x, txt2img=lambda x: x, imgproc=lambda
img2img_realesrgan_model_name = gr.Dropdown(label='RealESRGAN model',
choices=['RealESRGAN_x4plus',
'RealESRGAN_x4plus_anime_6B'],
value='RealESRGAN_x4plus',
value=img2img_defaults['realesrgan_model_name'],
visible=RealESRGAN is not None) # TODO: Feels like I shouldn't slot it in here.
img2img_embeddings = gr.File(label="Embeddings file for textual inversion",
@ -605,6 +641,176 @@ def draw_gradio_ui(opt, img2img=lambda x: x, txt2img=lambda x: x, imgproc=lambda
imgproc_upscale_toggles.change(fn=uifn.toggle_options_gobig, inputs=[imgproc_upscale_toggles],
outputs=[gobig_group])
with gr.TabItem("Scene-to-Image", id='scn2img_tab'):
example_path = os.path.join("data","scn2img_examples")
files = os.listdir(example_path)
examples = {}
for fn in files:
filepath = os.path.join(example_path, str(fn))
with open(filepath, "r") as file:
examples[fn] = file.read()
with gr.Row(elem_id="tools_row"):
scn2img_btn = gr.Button("Generate", elem_id="generate", variant="primary")
with gr.Row().style(equal_height=False):
with gr.Column():
scn2img_seed = gr.Textbox(
label="Seed (blank to randomize, specify to use cache)", lines=1, max_lines=1,
value=scn2img_defaults["seed"]
)
scn2img_prompt = gr.Textbox(
label="Prompt Scene",
elem_id='scn2_img_input',
placeholder=examples[list(examples.keys())[0]],
lines=50,
max_lines=50,
value=scn2img_defaults['prompt'],
show_label=False
)
with gr.Column():
with gr.Tabs():
with gr.TabItem("Results", id="scn2img_results_tab"):
# gr.Markdown('#### Scn2Img Results')
output_scn2img_gallery = gr.Gallery(
label="Images",
elem_id="scn2img_gallery_output"
).style(grid=[3, 3, 3], height=80)
scn2img_job_ui = job_manager.draw_gradio_ui() if job_manager else None
with gr.Tabs():
with gr.TabItem("Generated image actions", id="scn2img_actions_tab"):
gr.Markdown("Select an image, then press one of the buttons below")
with gr.Row():
output_scn2img_copy_to_clipboard_btn = gr.Button("Copy to clipboard")
output_scn2img_copy_to_img2img_input_btn = gr.Button("Push to img2img input")
output_scn2img_copy_to_img2img_mask_btn = gr.Button("Push to img2img input mask")
gr.Markdown("Warning: This will clear your current img2img image and mask settings!")
with gr.TabItem("Output info", id="scn2img_output_info_tab"):
output_scn2img_params = gr.Highlightedtext(label="Generation parameters", interactive=False,
elem_id='scn2img_highlight')
with gr.Row():
output_scn2img_copy_params = gr.Button("Copy full parameters").click(
inputs=[output_scn2img_params],
outputs=[],
_js=call_JS(
'copyFullOutput',
fromId='scn2img_highlight'
),
fn=None, show_progress=False
)
output_scn2img_seed = gr.Number(label='Seed', interactive=False, visible=False)
output_scn2img_copy_seed = gr.Button("Copy only initial seed").click(
inputs=output_scn2img_seed, outputs=[],
_js=call_JS("gradioInputToClipboard"), fn=None, show_progress=False)
output_scn2img_stats = gr.HTML(label='Stats')
with gr.TabItem("SceneCode", id="scn2img_scncode_tab"):
output_scn2img_scncode = gr.HTML(label="SceneCode")
scn2img_toggles = gr.CheckboxGroup(label='', choices=scn2img_toggles,
value=scn2img_toggle_defaults, type="index")
scn2img_embeddings = gr.File(label="Embeddings file for textual inversion",
visible=show_embeddings)
with gr.TabItem("Docs", id="scn2img_docs_tab"):
parse_arg, function_args, function_args_ext = scn2img_define_args()
with gr.Tabs():
with gr.TabItem("syntax", id=f"scn2img_docs_syntax_tab"):
lines = [
"Scene-to-Image defines layers of images in markdown-like syntax.",
"",
"Markdown headings, e.g. '# layer0', define layers.",
"Layers are hierarchical, i.e. each layer can contain more layers.",
"Child layers are blended together by their image masks, like layers in image editors.",
"",
"The content of sections define the arguments for image generation.",
"Arguments are defined by lines of the form 'arg:value' or 'arg=value'.",
"",
"To invoke txt2img or img2img, they layer must contain the 'prompt' argument.",
"For img2img the layer must have child layers, the result of blending them will be the input image for img2img.",
"When no prompt is specified the layer can still be used for image composition and mask selection.",
]
gr.Markdown("\n".join(lines))
for func, ext in function_args_ext.items():
with gr.TabItem(func, id=f"scn2img_docs_{func}_tab"):
lines = []
for e in ext:
lines.append(f"#### Arguments for {e}")
if e not in function_args: continue
for argname,argtype in function_args[e].items():
lines.append(f" - {argname}: {argtype}")
gr.Markdown("\n".join(lines))
with gr.TabItem("Examples", id="scn2img_examples_tab"):
scn2img_examples = {}
with gr.Tabs():
for k, (example, content) in enumerate(examples.items()):
with gr.TabItem(example, id=f"scn2img_example_{k}_tab"):
scn2img_examples[example] = gr.Textbox(
label="Prompt Scene",
elem_id=f"scn2img_example_{k}",
value=content,
lines=50,
max_lines=50,
show_label=False,
interactive=True
)
output_scn2img_copy_to_img2img_input_btn.click(
uifn.copy_img_to_edit,
[output_scn2img_gallery],
[img2img_image_editor, tabs, img2img_image_editor_mode],
_js=call_JS("moveImageFromGallery",
fromId="scn2img_gallery_output",
toId="img2img_editor")
)
output_scn2img_copy_to_img2img_mask_btn.click(
uifn.copy_img_to_mask,
[output_scn2img_gallery],
[img2img_image_mask, tabs, img2img_image_editor_mode],
_js=call_JS("moveImageFromGallery",
fromId="scn2img_gallery_output",
toId="img2img_editor")
)
output_scn2img_copy_to_clipboard_btn.click(
fn=None,
inputs=output_scn2img_gallery,
outputs=[],
_js=call_JS("copyImageFromGalleryToClipboard",
fromId="scn2img_gallery_output")
)
scn2img_func = scn2img
scn2img_inputs = [
scn2img_prompt,
scn2img_toggles,
scn2img_seed,
scn2img_embeddings
]
scn2img_outputs = [
output_scn2img_gallery,
output_scn2img_seed,
output_scn2img_params,
output_scn2img_stats,
output_scn2img_scncode
]
# If a JobManager was passed in then wrap the Generate functions
if scn2img_job_ui:
scn2img_func, scn2img_inputs, scn2img_outputs = scn2img_job_ui.wrap_func(
func=scn2img_func,
inputs=scn2img_inputs,
outputs=scn2img_outputs,
)
scn2img_btn.click(
scn2img_func,
scn2img_inputs,
scn2img_outputs
)
"""
if GFPGAN is not None:
gfpgan_defaults = {

View File

@ -1,3 +1,18 @@
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
''' Class to store image generation parameters to be stored as metadata in the image'''
from __future__ import annotations
from dataclasses import dataclass, asdict

View File

@ -1,3 +1,18 @@
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
''' Provides simple job management for gradio, allowing viewing and stopping in-progress multi-batch generations '''
from __future__ import annotations
import gradio as gr

View File

@ -134,6 +134,16 @@ window.SD = (() => {
await this.copyToClipboard([item]);
}
async copyFullOutput ({ fromId }) {
const textField = this.el.get(`#${fromId} .textfield`);
if (!textField) {
SDclass.error(new Error(`Can't find textfield with the output!`));
return;
}
const value = textField.textContent.replace(/\s+/g, ' ').replace(/: /g, ':');
await this.copyToClipboard(value);
}
clickFirstVisibleButton({ rowId }) {
const generateButtons = this.el.get(`#${rowId}`).querySelectorAll('.gr-button-primary');

View File

@ -1,3 +1,18 @@
# This file is part of stable-diffusion-webui (https://github.com/sd-webui/stable-diffusion-webui/).
# Copyright 2022 sd-webui team.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re
import gradio as gr
from PIL import Image, ImageFont, ImageDraw, ImageFilter, ImageOps

Binary image files added (not shown): 1.3 MiB, 840 KiB, 2.4 MiB, images/sd-wui_logo.png (454 KiB), 141 KiB, 2.4 MiB, 1.8 MiB, 962 KiB and 83 KiB.

View File

@ -0,0 +1,101 @@
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from data.coco_karpathy_dataset import coco_karpathy_train, coco_karpathy_caption_eval, coco_karpathy_retrieval_eval
from data.nocaps_dataset import nocaps_eval
from data.flickr30k_dataset import flickr30k_train, flickr30k_retrieval_eval
from data.vqa_dataset import vqa_dataset
from data.nlvr_dataset import nlvr_dataset
from data.pretrain_dataset import pretrain_dataset
from transform.randaugment import RandomAugment
def create_dataset(dataset, config, min_scale=0.5):
normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
transform_train = transforms.Compose([
transforms.RandomResizedCrop(config['image_size'],scale=(min_scale, 1.0),interpolation=InterpolationMode.BICUBIC),
transforms.RandomHorizontalFlip(),
RandomAugment(2,5,isPIL=True,augs=['Identity','AutoContrast','Brightness','Sharpness','Equalize',
'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate']),
transforms.ToTensor(),
normalize,
])
transform_test = transforms.Compose([
transforms.Resize((config['image_size'],config['image_size']),interpolation=InterpolationMode.BICUBIC),
transforms.ToTensor(),
normalize,
])
if dataset=='pretrain':
dataset = pretrain_dataset(config['train_file'], config['laion_path'], transform_train)
return dataset
elif dataset=='caption_coco':
train_dataset = coco_karpathy_train(transform_train, config['image_root'], config['ann_root'], prompt=config['prompt'])
val_dataset = coco_karpathy_caption_eval(transform_test, config['image_root'], config['ann_root'], 'val')
test_dataset = coco_karpathy_caption_eval(transform_test, config['image_root'], config['ann_root'], 'test')
return train_dataset, val_dataset, test_dataset
elif dataset=='nocaps':
val_dataset = nocaps_eval(transform_test, config['image_root'], config['ann_root'], 'val')
test_dataset = nocaps_eval(transform_test, config['image_root'], config['ann_root'], 'test')
return val_dataset, test_dataset
elif dataset=='retrieval_coco':
train_dataset = coco_karpathy_train(transform_train, config['image_root'], config['ann_root'])
val_dataset = coco_karpathy_retrieval_eval(transform_test, config['image_root'], config['ann_root'], 'val')
test_dataset = coco_karpathy_retrieval_eval(transform_test, config['image_root'], config['ann_root'], 'test')
return train_dataset, val_dataset, test_dataset
elif dataset=='retrieval_flickr':
train_dataset = flickr30k_train(transform_train, config['image_root'], config['ann_root'])
val_dataset = flickr30k_retrieval_eval(transform_test, config['image_root'], config['ann_root'], 'val')
test_dataset = flickr30k_retrieval_eval(transform_test, config['image_root'], config['ann_root'], 'test')
return train_dataset, val_dataset, test_dataset
elif dataset=='vqa':
train_dataset = vqa_dataset(transform_train, config['ann_root'], config['vqa_root'], config['vg_root'],
train_files = config['train_files'], split='train')
test_dataset = vqa_dataset(transform_test, config['ann_root'], config['vqa_root'], config['vg_root'], split='test')
return train_dataset, test_dataset
elif dataset=='nlvr':
train_dataset = nlvr_dataset(transform_train, config['image_root'], config['ann_root'],'train')
val_dataset = nlvr_dataset(transform_test, config['image_root'], config['ann_root'],'val')
test_dataset = nlvr_dataset(transform_test, config['image_root'], config['ann_root'],'test')
return train_dataset, val_dataset, test_dataset
def create_sampler(datasets, shuffles, num_tasks, global_rank):
samplers = []
for dataset,shuffle in zip(datasets,shuffles):
sampler = torch.utils.data.DistributedSampler(dataset, num_replicas=num_tasks, rank=global_rank, shuffle=shuffle)
samplers.append(sampler)
return samplers
def create_loader(datasets, samplers, batch_size, num_workers, is_trains, collate_fns):
loaders = []
for dataset,sampler,bs,n_worker,is_train,collate_fn in zip(datasets,samplers,batch_size,num_workers,is_trains,collate_fns):
if is_train:
shuffle = (sampler is None)
drop_last = True
else:
shuffle = False
drop_last = False
loader = DataLoader(
dataset,
batch_size=bs,
num_workers=n_worker,
pin_memory=True,
sampler=sampler,
shuffle=shuffle,
collate_fn=collate_fn,
drop_last=drop_last,
)
loaders.append(loader)
return loaders
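
A rough usage sketch of the helpers above (assumptions: single-process training, so no distributed samplers, and a config dict supplying the keys the dataset classes read; all paths and values here are placeholders):

# Hypothetical single-GPU wiring for COCO captioning.
config = {
    'image_size': 384,
    'image_root': 'coco/images/',
    'ann_root': 'annotation/',
    'prompt': 'a picture of ',
}
train_ds, val_ds, test_ds = create_dataset('caption_coco', config)

train_loader, val_loader, test_loader = create_loader(
    [train_ds, val_ds, test_ds],
    samplers=[None, None, None],      # use create_sampler(...) when distributed
    batch_size=[32, 64, 64],
    num_workers=[4, 4, 4],
    is_trains=[True, False, False],
    collate_fns=[None, None, None],
)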

View File

@ -0,0 +1,126 @@
import os
import json
from torch.utils.data import Dataset
from torchvision.datasets.utils import download_url
from PIL import Image
from data.utils import pre_caption
class coco_karpathy_train(Dataset):
def __init__(self, transform, image_root, ann_root, max_words=30, prompt=''):
'''
image_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
'''
url = 'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json'
filename = 'coco_karpathy_train.json'
download_url(url,ann_root)
self.annotation = json.load(open(os.path.join(ann_root,filename),'r'))
self.transform = transform
self.image_root = image_root
self.max_words = max_words
self.prompt = prompt
self.img_ids = {}
n = 0
for ann in self.annotation:
img_id = ann['image_id']
if img_id not in self.img_ids.keys():
self.img_ids[img_id] = n
n += 1
def __len__(self):
return len(self.annotation)
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.image_root,ann['image'])
image = Image.open(image_path).convert('RGB')
image = self.transform(image)
caption = self.prompt+pre_caption(ann['caption'], self.max_words)
return image, caption, self.img_ids[ann['image_id']]
class coco_karpathy_caption_eval(Dataset):
def __init__(self, transform, image_root, ann_root, split):
'''
image_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
split (string): val or test
'''
urls = {'val':'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json',
'test':'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json'}
filenames = {'val':'coco_karpathy_val.json','test':'coco_karpathy_test.json'}
download_url(urls[split],ann_root)
self.annotation = json.load(open(os.path.join(ann_root,filenames[split]),'r'))
self.transform = transform
self.image_root = image_root
def __len__(self):
return len(self.annotation)
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.image_root,ann['image'])
image = Image.open(image_path).convert('RGB')
image = self.transform(image)
img_id = ann['image'].split('/')[-1].strip('.jpg').split('_')[-1]
return image, int(img_id)
class coco_karpathy_retrieval_eval(Dataset):
def __init__(self, transform, image_root, ann_root, split, max_words=30):
'''
image_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
split (string): val or test
'''
urls = {'val':'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json',
'test':'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json'}
filenames = {'val':'coco_karpathy_val.json','test':'coco_karpathy_test.json'}
download_url(urls[split],ann_root)
self.annotation = json.load(open(os.path.join(ann_root,filenames[split]),'r'))
self.transform = transform
self.image_root = image_root
self.text = []
self.image = []
self.txt2img = {}
self.img2txt = {}
txt_id = 0
for img_id, ann in enumerate(self.annotation):
self.image.append(ann['image'])
self.img2txt[img_id] = []
for i, caption in enumerate(ann['caption']):
self.text.append(pre_caption(caption,max_words))
self.img2txt[img_id].append(txt_id)
self.txt2img[txt_id] = img_id
txt_id += 1
def __len__(self):
return len(self.annotation)
def __getitem__(self, index):
image_path = os.path.join(self.image_root, self.annotation[index]['image'])
image = Image.open(image_path).convert('RGB')
image = self.transform(image)
return image, index

View File

@ -0,0 +1,93 @@
import os
import json
from torch.utils.data import Dataset
from torchvision.datasets.utils import download_url
from PIL import Image
from data.utils import pre_caption
class flickr30k_train(Dataset):
def __init__(self, transform, image_root, ann_root, max_words=30, prompt=''):
'''
image_root (string): Root directory of images (e.g. flickr30k/)
ann_root (string): directory to store the annotation file
'''
url = 'https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json'
filename = 'flickr30k_train.json'
download_url(url,ann_root)
self.annotation = json.load(open(os.path.join(ann_root,filename),'r'))
self.transform = transform
self.image_root = image_root
self.max_words = max_words
self.prompt = prompt
self.img_ids = {}
n = 0
for ann in self.annotation:
img_id = ann['image_id']
if img_id not in self.img_ids.keys():
self.img_ids[img_id] = n
n += 1
def __len__(self):
return len(self.annotation)
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.image_root,ann['image'])
image = Image.open(image_path).convert('RGB')
image = self.transform(image)
caption = self.prompt+pre_caption(ann['caption'], self.max_words)
return image, caption, self.img_ids[ann['image_id']]
class flickr30k_retrieval_eval(Dataset):
def __init__(self, transform, image_root, ann_root, split, max_words=30):
'''
image_root (string): Root directory of images (e.g. flickr30k/)
ann_root (string): directory to store the annotation file
split (string): val or test
'''
urls = {'val':'https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json',
'test':'https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json'}
filenames = {'val':'flickr30k_val.json','test':'flickr30k_test.json'}
download_url(urls[split],ann_root)
self.annotation = json.load(open(os.path.join(ann_root,filenames[split]),'r'))
self.transform = transform
self.image_root = image_root
self.text = []
self.image = []
self.txt2img = {}
self.img2txt = {}
txt_id = 0
for img_id, ann in enumerate(self.annotation):
self.image.append(ann['image'])
self.img2txt[img_id] = []
for i, caption in enumerate(ann['caption']):
self.text.append(pre_caption(caption,max_words))
self.img2txt[img_id].append(txt_id)
self.txt2img[txt_id] = img_id
txt_id += 1
def __len__(self):
return len(self.annotation)
def __getitem__(self, index):
image_path = os.path.join(self.image_root, self.annotation[index]['image'])
image = Image.open(image_path).convert('RGB')
image = self.transform(image)
return image, index

78
ldm/data/nlvr_dataset.py Normal file
View File

@ -0,0 +1,78 @@
import os
import json
import random
from torch.utils.data import Dataset
from torchvision.datasets.utils import download_url
from PIL import Image
from data.utils import pre_caption
class nlvr_dataset(Dataset):
def __init__(self, transform, image_root, ann_root, split):
'''
image_root (string): Root directory of images
ann_root (string): directory to store the annotation file
split (string): train, val or test
'''
urls = {'train':'https://storage.googleapis.com/sfr-vision-language-research/datasets/nlvr_train.json',
'val':'https://storage.googleapis.com/sfr-vision-language-research/datasets/nlvr_dev.json',
'test':'https://storage.googleapis.com/sfr-vision-language-research/datasets/nlvr_test.json'}
filenames = {'train':'nlvr_train.json','val':'nlvr_dev.json','test':'nlvr_test.json'}
download_url(urls[split],ann_root)
self.annotation = json.load(open(os.path.join(ann_root,filenames[split]),'r'))
self.transform = transform
self.image_root = image_root
def __len__(self):
return len(self.annotation)
def __getitem__(self, index):
ann = self.annotation[index]
image0_path = os.path.join(self.image_root,ann['images'][0])
image0 = Image.open(image0_path).convert('RGB')
image0 = self.transform(image0)
image1_path = os.path.join(self.image_root,ann['images'][1])
image1 = Image.open(image1_path).convert('RGB')
image1 = self.transform(image1)
sentence = pre_caption(ann['sentence'], 40)
if ann['label']=='True':
label = 1
else:
label = 0
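# Augmentation: the pair of images is randomly swapped. Because NLVR2 sentences
# can reference spatial order, 'left' and 'right' are also swapped in the
# sentence whenever the image order changes, keeping the label consistent.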
words = sentence.split(' ')
if 'left' not in words and 'right' not in words:
if random.random()<0.5:
return image0, image1, sentence, label
else:
return image1, image0, sentence, label
else:
if random.random()<0.5:
return image0, image1, sentence, label
else:
new_words = []
for word in words:
if word=='left':
new_words.append('right')
elif word=='right':
new_words.append('left')
else:
new_words.append(word)
sentence = ' '.join(new_words)
return image1, image0, sentence, label

View File

@ -0,0 +1,32 @@
import os
import json
from torch.utils.data import Dataset
from torchvision.datasets.utils import download_url
from PIL import Image
class nocaps_eval(Dataset):
def __init__(self, transform, image_root, ann_root, split):
urls = {'val':'https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json',
'test':'https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json'}
filenames = {'val':'nocaps_val.json','test':'nocaps_test.json'}
download_url(urls[split],ann_root)
self.annotation = json.load(open(os.path.join(ann_root,filenames[split]),'r'))
self.transform = transform
self.image_root = image_root
def __len__(self):
return len(self.annotation)
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.image_root,ann['image'])
image = Image.open(image_path).convert('RGB')
image = self.transform(image)
return image, int(ann['img_id'])

View File

@ -0,0 +1,59 @@
import json
import os
import random
from torch.utils.data import Dataset
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
Image.MAX_IMAGE_PIXELS = None
from data.utils import pre_caption
import os,glob
class pretrain_dataset(Dataset):
def __init__(self, ann_file, laion_path, transform):
self.ann_pretrain = []
for f in ann_file:
print('loading '+f)
ann = json.load(open(f,'r'))
self.ann_pretrain += ann
self.laion_path = laion_path
if self.laion_path:
self.laion_files = glob.glob(os.path.join(laion_path,'*.json'))
print('loading '+self.laion_files[0])
with open(self.laion_files[0],'r') as f:
self.ann_laion = json.load(f)
self.annotation = self.ann_pretrain + self.ann_laion
else:
self.annotation = self.ann_pretrain
self.transform = transform
def reload_laion(self, epoch):
n = epoch%len(self.laion_files)
print('loading '+self.laion_files[n])
with open(self.laion_files[n],'r') as f:
self.ann_laion = json.load(f)
self.annotation = self.ann_pretrain + self.ann_laion
def __len__(self):
return len(self.annotation)
def __getitem__(self, index):
ann = self.annotation[index]
image = Image.open(ann['image']).convert('RGB')
image = self.transform(image)
caption = pre_caption(ann['caption'],30)
return image, caption

112
ldm/data/utils.py Normal file
View File

@ -0,0 +1,112 @@
import re
import json
import os
import torch
import torch.distributed as dist
import utils
def pre_caption(caption,max_words=50):
caption = re.sub(
r"([.!\"()*#:;~])",
' ',
caption.lower(),
)
caption = re.sub(
r"\s{2,}",
' ',
caption,
)
caption = caption.rstrip('\n')
caption = caption.strip(' ')
#truncate caption
caption_words = caption.split(' ')
if len(caption_words)>max_words:
caption = ' '.join(caption_words[:max_words])
return caption
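# e.g. pre_caption("A dog!  Running; fast.") -> "a dog running fast"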
def pre_question(question,max_ques_words=50):
question = re.sub(
r"([.!\"()*#:;~])",
'',
question.lower(),
)
question = question.rstrip(' ')
#truncate question
question_words = question.split(' ')
if len(question_words)>max_ques_words:
question = ' '.join(question_words[:max_ques_words])
return question
def save_result(result, result_dir, filename, remove_duplicate=''):
result_file = os.path.join(result_dir, '%s_rank%d.json'%(filename,utils.get_rank()))
final_result_file = os.path.join(result_dir, '%s.json'%filename)
json.dump(result,open(result_file,'w'))
dist.barrier()
if utils.is_main_process():
# combine results from all processes
result = []
for rank in range(utils.get_world_size()):
result_file = os.path.join(result_dir, '%s_rank%d.json'%(filename,rank))
res = json.load(open(result_file,'r'))
result += res
if remove_duplicate:
result_new = []
id_list = []
for res in result:
if res[remove_duplicate] not in id_list:
id_list.append(res[remove_duplicate])
result_new.append(res)
result = result_new
json.dump(result,open(final_result_file,'w'))
print('result file saved to %s'%final_result_file)
return final_result_file
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from torchvision.datasets.utils import download_url
def coco_caption_eval(coco_gt_root, results_file, split):
urls = {'val':'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json',
'test':'https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json'}
filenames = {'val':'coco_karpathy_val_gt.json','test':'coco_karpathy_test_gt.json'}
download_url(urls[split],coco_gt_root)
annotation_file = os.path.join(coco_gt_root,filenames[split])
# create coco object and coco_result object
coco = COCO(annotation_file)
coco_result = coco.loadRes(results_file)
# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)
# evaluate on a subset of images by setting
# coco_eval.params['image_id'] = coco_result.getImgIds()
# please remove this line when evaluating the full validation set
# coco_eval.params['image_id'] = coco_result.getImgIds()
# evaluate results
# SPICE will take a few minutes the first time, but speeds up due to caching
coco_eval.evaluate()
# print output evaluation scores
for metric, score in coco_eval.eval.items():
print(f'{metric}: {score:.3f}')
return coco_eval

110
ldm/data/video_dataset.py Normal file
View File

@ -0,0 +1,110 @@
from torch.utils.data import Dataset
from torchvision.datasets.utils import download_url
from PIL import Image
import torch
import numpy as np
import random
import decord
from decord import VideoReader
import json
import os
from data.utils import pre_caption
decord.bridge.set_bridge("torch")
class ImageNorm(object):
"""Apply Normalization to Image Pixels on GPU
"""
def __init__(self, mean, std):
self.mean = torch.tensor(mean).view(1, 3, 1, 1)
self.std = torch.tensor(std).view(1, 3, 1, 1)
def __call__(self, img):
if torch.max(img) > 1 and self.mean.max() <= 1:
img.div_(255.)
return img.sub_(self.mean).div_(self.std)
def load_jsonl(filename):
with open(filename, "r") as f:
return [json.loads(l.strip("\n")) for l in f.readlines()]
class VideoDataset(Dataset):
def __init__(self, video_root, ann_root, num_frm=4, frm_sampling_strategy="rand", max_img_size=384, video_fmt='.mp4'):
'''
image_root (string): Root directory of video
ann_root (string): directory to store the annotation file
'''
url = 'https://storage.googleapis.com/sfr-vision-language-research/datasets/msrvtt_test.jsonl'
filename = 'msrvtt_test.jsonl'
download_url(url,ann_root)
self.annotation = load_jsonl(os.path.join(ann_root,filename))
self.num_frm = num_frm
self.frm_sampling_strategy = frm_sampling_strategy
self.max_img_size = max_img_size
self.video_root = video_root
self.video_fmt = video_fmt
self.img_norm = ImageNorm(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
self.text = [pre_caption(ann['caption'],40) for ann in self.annotation]
self.txt2video = [i for i in range(len(self.annotation))]
self.video2txt = self.txt2video
def __len__(self):
return len(self.annotation)
def __getitem__(self, index):
ann = self.annotation[index]
video_path = os.path.join(self.video_root, ann['clip_name'] + self.video_fmt)
vid_frm_array = self._load_video_from_path_decord(video_path, height=self.max_img_size, width=self.max_img_size)
video = self.img_norm(vid_frm_array.float())
return video, ann['clip_name']
def _load_video_from_path_decord(self, video_path, height=None, width=None, start_time=None, end_time=None, fps=-1):
try:
if not height or not width:
vr = VideoReader(video_path)
else:
vr = VideoReader(video_path, width=width, height=height)
vlen = len(vr)
if start_time or end_time:
assert fps > 0, 'must provide video fps if specifying start and end time.'
start_idx = min(int(start_time * fps), vlen)
end_idx = min(int(end_time * fps), vlen)
else:
start_idx, end_idx = 0, vlen
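# Frame selection: 'uniform' strides evenly across the clip, 'rand' draws
# num_frm random frames, and 'headtail' takes half from the first half of the
# clip and half from the second.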
if self.frm_sampling_strategy == 'uniform':
frame_indices = np.arange(start_idx, end_idx, vlen / self.num_frm, dtype=int)
elif self.frm_sampling_strategy == 'rand':
frame_indices = sorted(random.sample(range(vlen), self.num_frm))
elif self.frm_sampling_strategy == 'headtail':
frame_indices_head = sorted(random.sample(range(vlen // 2), self.num_frm // 2))
frame_indices_tail = sorted(random.sample(range(vlen // 2, vlen), self.num_frm // 2))
frame_indices = frame_indices_head + frame_indices_tail
else:
raise NotImplementedError('Invalid sampling strategy {} '.format(self.frm_sampling_strategy))
raw_sample_frms = vr.get_batch(frame_indices)
except Exception as e:
return None
raw_sample_frms = raw_sample_frms.permute(0, 3, 1, 2)
return raw_sample_frms

88
ldm/data/vqa_dataset.py Normal file
View File

@ -0,0 +1,88 @@
import os
import json
import random
from PIL import Image
import torch
from torch.utils.data import Dataset
from data.utils import pre_question
from torchvision.datasets.utils import download_url
class vqa_dataset(Dataset):
def __init__(self, transform, ann_root, vqa_root, vg_root, train_files=[], split="train"):
self.split = split
self.transform = transform
self.vqa_root = vqa_root
self.vg_root = vg_root
if split=='train':
urls = {'vqa_train':'https://storage.googleapis.com/sfr-vision-language-research/datasets/vqa_train.json',
'vqa_val':'https://storage.googleapis.com/sfr-vision-language-research/datasets/vqa_val.json',
'vg_qa':'https://storage.googleapis.com/sfr-vision-language-research/datasets/vg_qa.json'}
self.annotation = []
for f in train_files:
download_url(urls[f],ann_root)
self.annotation += json.load(open(os.path.join(ann_root,'%s.json'%f),'r'))
else:
download_url('https://storage.googleapis.com/sfr-vision-language-research/datasets/vqa_test.json',ann_root)
self.annotation = json.load(open(os.path.join(ann_root,'vqa_test.json'),'r'))
download_url('https://storage.googleapis.com/sfr-vision-language-research/datasets/answer_list.json',ann_root)
self.answer_list = json.load(open(os.path.join(ann_root,'answer_list.json'),'r'))
def __len__(self):
return len(self.annotation)
def __getitem__(self, index):
ann = self.annotation[index]
if ann['dataset']=='vqa':
image_path = os.path.join(self.vqa_root,ann['image'])
elif ann['dataset']=='vg':
image_path = os.path.join(self.vg_root,ann['image'])
image = Image.open(image_path).convert('RGB')
image = self.transform(image)
if self.split == 'test':
question = pre_question(ann['question'])
question_id = ann['question_id']
return image, question, question_id
elif self.split=='train':
question = pre_question(ann['question'])
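# VQA ground truth lists multiple annotator answers; each occurrence adds
# 1/len(answers) to that answer's weight. Visual Genome has a single answer,
# which gets a fixed weight of 0.2.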
if ann['dataset']=='vqa':
answer_weight = {}
for answer in ann['answer']:
if answer in answer_weight.keys():
answer_weight[answer] += 1/len(ann['answer'])
else:
answer_weight[answer] = 1/len(ann['answer'])
answers = list(answer_weight.keys())
weights = list(answer_weight.values())
elif ann['dataset']=='vg':
answers = [ann['answer']]
weights = [0.2]
return image, question, answers, weights
def vqa_collate_fn(batch):
image_list, question_list, answer_list, weight_list, n = [], [], [], [], []
for image, question, answer, weights in batch:
image_list.append(image)
question_list.append(question)
weight_list += weights
answer_list += answer
n.append(len(answer))
return torch.stack(image_list,dim=0), question_list, answer_list, torch.Tensor(weight_list), n

0
ldm/models/__init__.py Normal file
View File

238
ldm/models/blip.py Normal file
View File

@ -0,0 +1,238 @@
'''
* Copyright (c) 2022, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
* By Junnan Li
'''
import warnings
warnings.filterwarnings("ignore")
from .vit import VisionTransformer, interpolate_pos_embed
from .med import BertConfig, BertModel, BertLMHeadModel
from transformers import BertTokenizer
import torch
from torch import nn
#import torch.nn.functional as F
import os
from urllib.parse import urlparse
from timm.models.hub import download_cached_file
class BLIP_Base(nn.Module):
def __init__(self,
med_config = 'configs/blip/med_config.json',
image_size = 224,
vit = 'base',
vit_grad_ckpt = False,
vit_ckpt_layer = 0,
):
"""
Args:
med_config (str): path for the mixture of encoder-decoder model's configuration file
image_size (int): input image size
vit (str): model size of vision transformer
"""
super().__init__()
self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
self.tokenizer = init_tokenizer()
med_config = BertConfig.from_json_file(med_config)
med_config.encoder_width = vision_width
self.text_encoder = BertModel(config=med_config, add_pooling_layer=False)
def forward(self, image, caption, mode):
assert mode in ['image', 'text', 'multimodal'], "mode parameter must be image, text, or multimodal"
text = self.tokenizer(caption, return_tensors="pt").to(image.device)
if mode=='image':
# return image features
image_embeds = self.visual_encoder(image)
return image_embeds
elif mode=='text':
# return text features
text_output = self.text_encoder(text.input_ids, attention_mask = text.attention_mask,
return_dict = True, mode = 'text')
return text_output.last_hidden_state
elif mode=='multimodal':
# return multimodal features
image_embeds = self.visual_encoder(image)
image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
text.input_ids[:,0] = self.tokenizer.enc_token_id
output = self.text_encoder(text.input_ids,
attention_mask = text.attention_mask,
encoder_hidden_states = image_embeds,
encoder_attention_mask = image_atts,
return_dict = True,
)
return output.last_hidden_state
class BLIP_Decoder(nn.Module):
def __init__(self,
med_config = 'configs/blip/med_config.json',
image_size = 384,
vit = 'base',
vit_grad_ckpt = False,
vit_ckpt_layer = 0,
prompt = 'a picture of ',
):
"""
Args:
med_config (str): path for the mixture of encoder-decoder model's configuration file
image_size (int): input image size
vit (str): model size of vision transformer
"""
super().__init__()
self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
self.tokenizer = init_tokenizer()
med_config = BertConfig.from_json_file(med_config)
med_config.encoder_width = vision_width
self.text_decoder = BertLMHeadModel(config=med_config)
self.prompt = prompt
self.prompt_length = len(self.tokenizer(self.prompt).input_ids)-1
def forward(self, image, caption):
image_embeds = self.visual_encoder(image)
image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
text = self.tokenizer(caption, padding='longest', truncation=True, max_length=40, return_tensors="pt").to(image.device)
text.input_ids[:,0] = self.tokenizer.bos_token_id
decoder_targets = text.input_ids.masked_fill(text.input_ids == self.tokenizer.pad_token_id, -100)
decoder_targets[:,:self.prompt_length] = -100
decoder_output = self.text_decoder(text.input_ids,
attention_mask = text.attention_mask,
encoder_hidden_states = image_embeds,
encoder_attention_mask = image_atts,
labels = decoder_targets,
return_dict = True,
)
loss_lm = decoder_output.loss
return loss_lm
def generate(self, image, sample=False, num_beams=3, max_length=30, min_length=10, top_p=0.9, repetition_penalty=1.0):
image_embeds = self.visual_encoder(image)
if not sample:
image_embeds = image_embeds.repeat_interleave(num_beams,dim=0)
image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
model_kwargs = {"encoder_hidden_states": image_embeds, "encoder_attention_mask":image_atts}
prompt = [self.prompt] * image.size(0)
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(image.device)
input_ids[:,0] = self.tokenizer.bos_token_id
input_ids = input_ids[:, :-1]
if sample:
#nucleus sampling
outputs = self.text_decoder.generate(input_ids=input_ids,
max_length=max_length,
min_length=min_length,
do_sample=True,
top_p=top_p,
num_return_sequences=1,
eos_token_id=self.tokenizer.sep_token_id,
pad_token_id=self.tokenizer.pad_token_id,
repetition_penalty=1.1,
**model_kwargs)
else:
#beam search
outputs = self.text_decoder.generate(input_ids=input_ids,
max_length=max_length,
min_length=min_length,
num_beams=num_beams,
eos_token_id=self.tokenizer.sep_token_id,
pad_token_id=self.tokenizer.pad_token_id,
repetition_penalty=repetition_penalty,
**model_kwargs)
captions = []
for output in outputs:
caption = self.tokenizer.decode(output, skip_special_tokens=True)
captions.append(caption[len(self.prompt):])
return captions
def blip_decoder(pretrained='',**kwargs):
model = BLIP_Decoder(**kwargs)
if pretrained:
model,msg = load_checkpoint(model,pretrained)
assert(len(msg.missing_keys)==0)
return model
def blip_feature_extractor(pretrained='',**kwargs):
model = BLIP_Base(**kwargs)
if pretrained:
model,msg = load_checkpoint(model,pretrained)
assert(len(msg.missing_keys)==0)
return model
def init_tokenizer():
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_special_tokens({'bos_token':'[DEC]'})
tokenizer.add_special_tokens({'additional_special_tokens':['[ENC]']})
tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0]
return tokenizer
def create_vit(vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop_path_rate=0):
assert vit in ['base', 'large'], "vit parameter must be base or large"
if vit=='base':
vision_width = 768
visual_encoder = VisionTransformer(img_size=image_size, patch_size=16, embed_dim=vision_width, depth=12,
num_heads=12, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
drop_path_rate=0 or drop_path_rate  # `0 or x` evaluates to x, so the caller's drop_path_rate is used
)
elif vit=='large':
vision_width = 1024
visual_encoder = VisionTransformer(img_size=image_size, patch_size=16, embed_dim=vision_width, depth=24,
num_heads=16, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
drop_path_rate=0.1 or drop_path_rate  # `0.1 or x` always evaluates to 0.1, so the large model ignores the argument
)
return visual_encoder, vision_width
def is_url(url_or_filename):
parsed = urlparse(url_or_filename)
return parsed.scheme in ("http", "https")
def load_checkpoint(model,url_or_filename):
if is_url(url_or_filename):
cached_file = download_cached_file(url_or_filename, check_hash=False, progress=True)
checkpoint = torch.load(cached_file, map_location='cpu')
elif os.path.isfile(url_or_filename):
checkpoint = torch.load(url_or_filename, map_location='cpu')
else:
raise RuntimeError('checkpoint url or path is invalid')
state_dict = checkpoint['model']
state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],model.visual_encoder)
if 'visual_encoder_m.pos_embed' in model.state_dict().keys():
state_dict['visual_encoder_m.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder_m.pos_embed'],
model.visual_encoder_m)
for key in model.state_dict().keys():
if key in state_dict.keys():
if state_dict[key].shape!=model.state_dict()[key].shape:
del state_dict[key]
msg = model.load_state_dict(state_dict,strict=False)
print('load checkpoint from %s'%url_or_filename)
return model,msg
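# --- Usage sketch (editorial addition, not part of this file/diff) ---------------------------
# A minimal way to drive the captioning model defined above. The checkpoint location, the
# 384x384 resize and the normalization constants are assumptions taken from BLIP's published
# demo settings; adjust them to whatever this repository actually downloads.
from PIL import Image
from torchvision import transforms

def caption_image(model, pil_image, image_size=384, device='cpu'):
    preprocess = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
                             (0.26862954, 0.26130258, 0.27577711)),
    ])
    image = preprocess(pil_image.convert('RGB')).unsqueeze(0).to(device)
    model.eval()
    with torch.no_grad():
        # generate() prepends self.prompt ('a picture of ') and strips it from the returned text
        captions = model.generate(image, sample=False, num_beams=3, max_length=30, min_length=10)
    return captions[0]

# model = blip_decoder(pretrained='<checkpoint url or path>', image_size=384, vit='base')
# print(caption_image(model, Image.open('example.jpg')))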

76
ldm/models/blip_itm.py Normal file
View File

@ -0,0 +1,76 @@
from models.med import BertConfig, BertModel
from transformers import BertTokenizer
import torch
from torch import nn
import torch.nn.functional as F
from models.blip import create_vit, init_tokenizer, load_checkpoint
class BLIP_ITM(nn.Module):
def __init__(self,
med_config = 'configs/med_config.json',
image_size = 384,
vit = 'base',
vit_grad_ckpt = False,
vit_ckpt_layer = 0,
embed_dim = 256,
):
"""
Args:
med_config (str): path for the mixture of encoder-decoder model's configuration file
image_size (int): input image size
vit (str): model size of vision transformer
"""
super().__init__()
self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
self.tokenizer = init_tokenizer()
med_config = BertConfig.from_json_file(med_config)
med_config.encoder_width = vision_width
self.text_encoder = BertModel(config=med_config, add_pooling_layer=False)
text_width = self.text_encoder.config.hidden_size
self.vision_proj = nn.Linear(vision_width, embed_dim)
self.text_proj = nn.Linear(text_width, embed_dim)
self.itm_head = nn.Linear(text_width, 2)
def forward(self, image, caption, match_head='itm'):
image_embeds = self.visual_encoder(image)
image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
text = self.tokenizer(caption, padding='max_length', truncation=True, max_length=35,
return_tensors="pt").to(image.device)
if match_head=='itm':
output = self.text_encoder(text.input_ids,
attention_mask = text.attention_mask,
encoder_hidden_states = image_embeds,
encoder_attention_mask = image_atts,
return_dict = True,
)
itm_output = self.itm_head(output.last_hidden_state[:,0,:])
return itm_output
elif match_head=='itc':
text_output = self.text_encoder(text.input_ids, attention_mask = text.attention_mask,
return_dict = True, mode = 'text')
image_feat = F.normalize(self.vision_proj(image_embeds[:,0,:]),dim=-1)
text_feat = F.normalize(self.text_proj(text_output.last_hidden_state[:,0,:]),dim=-1)
sim = image_feat @ text_feat.t()
return sim
def blip_itm(pretrained='',**kwargs):
model = BLIP_ITM(**kwargs)
if pretrained:
model,msg = load_checkpoint(model,pretrained)
assert(len(msg.missing_keys)==0)
return model
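# --- Usage sketch (editorial addition, not part of this file/diff) ---------------------------
# Scoring one preprocessed image tensor of shape (1, 3, 384, 384) against a caption with both
# heads of BLIP_ITM. The checkpoint path is an assumption; treating logit index 1 as the
# "matching pair" class follows the pretraining labels later in this diff.
def score_pair(model, image, caption):
    model.eval()
    with torch.no_grad():
        itm_logits = model(image, caption, match_head='itm')   # (1, 2) logits
        itm_score = F.softmax(itm_logits, dim=1)[:, 1]         # probability the pair matches
        itc_sim = model(image, caption, match_head='itc')      # cosine similarity of the projected features
    return itm_score.item(), itc_sim.item()

# model = blip_itm(pretrained='<checkpoint url or path>', image_size=384, vit='base')
# print(score_pair(model, image, 'a dog running on the beach'))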

103
ldm/models/blip_nlvr.py Normal file
View File

@ -0,0 +1,103 @@
from models.med import BertConfig
from models.nlvr_encoder import BertModel
from models.vit import interpolate_pos_embed
from models.blip import create_vit, init_tokenizer, is_url
from timm.models.hub import download_cached_file
import torch
from torch import nn
import torch.nn.functional as F
from transformers import BertTokenizer
import numpy as np
import os  # needed by load_checkpoint below
class BLIP_NLVR(nn.Module):
def __init__(self,
med_config = 'configs/med_config.json',
image_size = 480,
vit = 'base',
vit_grad_ckpt = False,
vit_ckpt_layer = 0,
):
"""
Args:
med_config (str): path for the mixture of encoder-decoder model's configuration file
image_size (int): input image size
vit (str): model size of vision transformer
"""
super().__init__()
self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer, drop_path_rate=0.1)
self.tokenizer = init_tokenizer()
med_config = BertConfig.from_json_file(med_config)
med_config.encoder_width = vision_width
self.text_encoder = BertModel(config=med_config, add_pooling_layer=False)
self.cls_head = nn.Sequential(
nn.Linear(self.text_encoder.config.hidden_size, self.text_encoder.config.hidden_size),
nn.ReLU(),
nn.Linear(self.text_encoder.config.hidden_size, 2)
)
def forward(self, image, text, targets, train=True):
image_embeds = self.visual_encoder(image)
image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
image0_embeds, image1_embeds = torch.split(image_embeds,targets.size(0))
text = self.tokenizer(text, padding='longest', return_tensors="pt").to(image.device)
text.input_ids[:,0] = self.tokenizer.enc_token_id
output = self.text_encoder(text.input_ids,
attention_mask = text.attention_mask,
encoder_hidden_states = [image0_embeds,image1_embeds],
encoder_attention_mask = [image_atts[:image0_embeds.size(0)],
image_atts[image0_embeds.size(0):]],
return_dict = True,
)
hidden_state = output.last_hidden_state[:,0,:]
prediction = self.cls_head(hidden_state)
if train:
loss = F.cross_entropy(prediction, targets)
return loss
else:
return prediction
def blip_nlvr(pretrained='',**kwargs):
model = BLIP_NLVR(**kwargs)
if pretrained:
model,msg = load_checkpoint(model,pretrained)
print("missing keys:")
print(msg.missing_keys)
return model
def load_checkpoint(model,url_or_filename):
if is_url(url_or_filename):
cached_file = download_cached_file(url_or_filename, check_hash=False, progress=True)
checkpoint = torch.load(cached_file, map_location='cpu')
elif os.path.isfile(url_or_filename):
checkpoint = torch.load(url_or_filename, map_location='cpu')
else:
raise RuntimeError('checkpoint url or path is invalid')
state_dict = checkpoint['model']
state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],model.visual_encoder)
for key in list(state_dict.keys()):
if 'crossattention.self.' in key:
new_key0 = key.replace('self','self0')
new_key1 = key.replace('self','self1')
state_dict[new_key0] = state_dict[key]
state_dict[new_key1] = state_dict[key]
elif 'crossattention.output.dense.' in key:
new_key0 = key.replace('dense','dense0')
new_key1 = key.replace('dense','dense1')
state_dict[new_key0] = state_dict[key]
state_dict[new_key1] = state_dict[key]
msg = model.load_state_dict(state_dict,strict=False)
print('load checkpoint from %s'%url_or_filename)
return model,msg
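# --- Inference sketch (editorial addition, not part of this file/diff) -----------------------
# BLIP_NLVR.forward expects the two images of an NLVR2 example concatenated along the batch
# dimension, so torch.split(image_embeds, targets.size(0)) recovers the two views; when
# train=False the targets tensor is only used for its batch size. Shapes and the class-index
# convention are assumptions.
def nlvr_predict(model, image0, image1, statements):
    # image0, image1: (B, 3, 480, 480); statements: list of B strings
    model.eval()
    with torch.no_grad():
        images = torch.cat([image0, image1], dim=0)
        dummy_targets = torch.zeros(image0.size(0), dtype=torch.long, device=images.device)
        logits = model(images, statements, dummy_targets, train=False)   # (B, 2)
    return logits.argmax(dim=1)   # predicted class index; the label mapping comes from the training data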

339
ldm/models/blip_pretrain.py Normal file
View File

@ -0,0 +1,339 @@
'''
* Copyright (c) 2022, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
* By Junnan Li
'''
from models.med import BertConfig, BertModel, BertLMHeadModel
from transformers import BertTokenizer
import transformers
transformers.logging.set_verbosity_error()
logger = transformers.logging.get_logger(__name__)  # used by tie_encoder_decoder_weights below
import torch
from torch import nn
import torch.nn.functional as F
from models.blip import create_vit, init_tokenizer, load_checkpoint
class BLIP_Pretrain(nn.Module):
def __init__(self,
med_config = 'configs/bert_config.json',
image_size = 224,
vit = 'base',
vit_grad_ckpt = False,
vit_ckpt_layer = 0,
embed_dim = 256,
queue_size = 57600,
momentum = 0.995,
):
"""
Args:
med_config (str): path for the mixture of encoder-decoder model's configuration file
image_size (int): input image size
vit (str): model size of vision transformer
"""
super().__init__()
self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer, 0)
if vit=='base':
checkpoint = torch.hub.load_state_dict_from_url(
url="https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth",
map_location="cpu", check_hash=True)
state_dict = checkpoint["model"]
msg = self.visual_encoder.load_state_dict(state_dict,strict=False)
elif vit=='large':
from timm.models.helpers import load_custom_pretrained
from timm.models.vision_transformer import default_cfgs
load_custom_pretrained(self.visual_encoder,default_cfgs['vit_large_patch16_224_in21k'])
self.tokenizer = init_tokenizer()
encoder_config = BertConfig.from_json_file(med_config)
encoder_config.encoder_width = vision_width
self.text_encoder = BertModel.from_pretrained('bert-base-uncased',config=encoder_config, add_pooling_layer=False)
self.text_encoder.resize_token_embeddings(len(self.tokenizer))
text_width = self.text_encoder.config.hidden_size
self.vision_proj = nn.Linear(vision_width, embed_dim)
self.text_proj = nn.Linear(text_width, embed_dim)
self.itm_head = nn.Linear(text_width, 2)
# create momentum encoders
self.visual_encoder_m, vision_width = create_vit(vit,image_size)
self.vision_proj_m = nn.Linear(vision_width, embed_dim)
self.text_encoder_m = BertModel(config=encoder_config, add_pooling_layer=False)
self.text_proj_m = nn.Linear(text_width, embed_dim)
self.model_pairs = [[self.visual_encoder,self.visual_encoder_m],
[self.vision_proj,self.vision_proj_m],
[self.text_encoder,self.text_encoder_m],
[self.text_proj,self.text_proj_m],
]
self.copy_params()
# create the queue
self.register_buffer("image_queue", torch.randn(embed_dim, queue_size))
self.register_buffer("text_queue", torch.randn(embed_dim, queue_size))
self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))
self.image_queue = nn.functional.normalize(self.image_queue, dim=0)
self.text_queue = nn.functional.normalize(self.text_queue, dim=0)
self.queue_size = queue_size
self.momentum = momentum
self.temp = nn.Parameter(0.07*torch.ones([]))
# create the decoder
decoder_config = BertConfig.from_json_file(med_config)
decoder_config.encoder_width = vision_width
self.text_decoder = BertLMHeadModel.from_pretrained('bert-base-uncased',config=decoder_config)
self.text_decoder.resize_token_embeddings(len(self.tokenizer))
tie_encoder_decoder_weights(self.text_encoder,self.text_decoder.bert,'','/attention')
def forward(self, image, caption, alpha):
with torch.no_grad():
self.temp.clamp_(0.001,0.5)
image_embeds = self.visual_encoder(image)
image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
image_feat = F.normalize(self.vision_proj(image_embeds[:,0,:]),dim=-1)
text = self.tokenizer(caption, padding='max_length', truncation=True, max_length=30,
return_tensors="pt").to(image.device)
text_output = self.text_encoder(text.input_ids, attention_mask = text.attention_mask,
return_dict = True, mode = 'text')
text_feat = F.normalize(self.text_proj(text_output.last_hidden_state[:,0,:]),dim=-1)
# get momentum features
with torch.no_grad():
self._momentum_update()
image_embeds_m = self.visual_encoder_m(image)
image_feat_m = F.normalize(self.vision_proj_m(image_embeds_m[:,0,:]),dim=-1)
image_feat_all = torch.cat([image_feat_m.t(),self.image_queue.clone().detach()],dim=1)
text_output_m = self.text_encoder_m(text.input_ids, attention_mask = text.attention_mask,
return_dict = True, mode = 'text')
text_feat_m = F.normalize(self.text_proj_m(text_output_m.last_hidden_state[:,0,:]),dim=-1)
text_feat_all = torch.cat([text_feat_m.t(),self.text_queue.clone().detach()],dim=1)
sim_i2t_m = image_feat_m @ text_feat_all / self.temp
sim_t2i_m = text_feat_m @ image_feat_all / self.temp
sim_targets = torch.zeros(sim_i2t_m.size()).to(image.device)
sim_targets.fill_diagonal_(1)
sim_i2t_targets = alpha * F.softmax(sim_i2t_m, dim=1) + (1 - alpha) * sim_targets
sim_t2i_targets = alpha * F.softmax(sim_t2i_m, dim=1) + (1 - alpha) * sim_targets
sim_i2t = image_feat @ text_feat_all / self.temp
sim_t2i = text_feat @ image_feat_all / self.temp
loss_i2t = -torch.sum(F.log_softmax(sim_i2t, dim=1)*sim_i2t_targets,dim=1).mean()
loss_t2i = -torch.sum(F.log_softmax(sim_t2i, dim=1)*sim_t2i_targets,dim=1).mean()
loss_ita = (loss_i2t+loss_t2i)/2
self._dequeue_and_enqueue(image_feat_m, text_feat_m)
###============== Image-text Matching ===================###
encoder_input_ids = text.input_ids.clone()
encoder_input_ids[:,0] = self.tokenizer.enc_token_id
# forward the positive image-text pair
bs = image.size(0)
output_pos = self.text_encoder(encoder_input_ids,
attention_mask = text.attention_mask,
encoder_hidden_states = image_embeds,
encoder_attention_mask = image_atts,
return_dict = True,
)
with torch.no_grad():
weights_t2i = F.softmax(sim_t2i[:,:bs],dim=1)+1e-4
weights_t2i.fill_diagonal_(0)
weights_i2t = F.softmax(sim_i2t[:,:bs],dim=1)+1e-4
weights_i2t.fill_diagonal_(0)
# select a negative image for each text
image_embeds_neg = []
for b in range(bs):
neg_idx = torch.multinomial(weights_t2i[b], 1).item()
image_embeds_neg.append(image_embeds[neg_idx])
image_embeds_neg = torch.stack(image_embeds_neg,dim=0)
# select a negative text for each image
text_ids_neg = []
text_atts_neg = []
for b in range(bs):
neg_idx = torch.multinomial(weights_i2t[b], 1).item()
text_ids_neg.append(encoder_input_ids[neg_idx])
text_atts_neg.append(text.attention_mask[neg_idx])
text_ids_neg = torch.stack(text_ids_neg,dim=0)
text_atts_neg = torch.stack(text_atts_neg,dim=0)
text_ids_all = torch.cat([encoder_input_ids, text_ids_neg],dim=0)
text_atts_all = torch.cat([text.attention_mask, text_atts_neg],dim=0)
image_embeds_all = torch.cat([image_embeds_neg,image_embeds],dim=0)
image_atts_all = torch.cat([image_atts,image_atts],dim=0)
output_neg = self.text_encoder(text_ids_all,
attention_mask = text_atts_all,
encoder_hidden_states = image_embeds_all,
encoder_attention_mask = image_atts_all,
return_dict = True,
)
vl_embeddings = torch.cat([output_pos.last_hidden_state[:,0,:], output_neg.last_hidden_state[:,0,:]],dim=0)
vl_output = self.itm_head(vl_embeddings)
itm_labels = torch.cat([torch.ones(bs,dtype=torch.long),torch.zeros(2*bs,dtype=torch.long)],
dim=0).to(image.device)
loss_itm = F.cross_entropy(vl_output, itm_labels)
##================= LM ========================##
decoder_input_ids = text.input_ids.clone()
decoder_input_ids[:,0] = self.tokenizer.bos_token_id
decoder_targets = decoder_input_ids.masked_fill(decoder_input_ids == self.tokenizer.pad_token_id, -100)
decoder_output = self.text_decoder(decoder_input_ids,
attention_mask = text.attention_mask,
encoder_hidden_states = image_embeds,
encoder_attention_mask = image_atts,
labels = decoder_targets,
return_dict = True,
)
loss_lm = decoder_output.loss
return loss_ita, loss_itm, loss_lm
@torch.no_grad()
def copy_params(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(), model_pair[1].parameters()):
param_m.data.copy_(param.data) # initialize
param_m.requires_grad = False # not update by gradient
@torch.no_grad()
def _momentum_update(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(), model_pair[1].parameters()):
param_m.data = param_m.data * self.momentum + param.data * (1. - self.momentum)
@torch.no_grad()
def _dequeue_and_enqueue(self, image_feat, text_feat):
# gather keys before updating queue
image_feats = concat_all_gather(image_feat)
text_feats = concat_all_gather(text_feat)
batch_size = image_feats.shape[0]
ptr = int(self.queue_ptr)
assert self.queue_size % batch_size == 0 # for simplicity
# replace the keys at ptr (dequeue and enqueue)
self.image_queue[:, ptr:ptr + batch_size] = image_feats.T
self.text_queue[:, ptr:ptr + batch_size] = text_feats.T
ptr = (ptr + batch_size) % self.queue_size # move pointer
self.queue_ptr[0] = ptr
def blip_pretrain(**kwargs):
model = BLIP_Pretrain(**kwargs)
return model
@torch.no_grad()
def concat_all_gather(tensor):
"""
Performs all_gather operation on the provided tensors.
*** Warning ***: torch.distributed.all_gather has no gradient.
"""
tensors_gather = [torch.ones_like(tensor)
for _ in range(torch.distributed.get_world_size())]
torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
output = torch.cat(tensors_gather, dim=0)
return output
from typing import List
def tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str, skip_key:str):
uninitialized_encoder_weights: List[str] = []
if decoder.__class__ != encoder.__class__:
logger.info(
f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
)
def tie_encoder_to_decoder_recursively(
decoder_pointer: nn.Module,
encoder_pointer: nn.Module,
module_name: str,
uninitialized_encoder_weights: List[str],
skip_key: str,
depth=0,
):
assert isinstance(decoder_pointer, nn.Module) and isinstance(
encoder_pointer, nn.Module
), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"
if hasattr(decoder_pointer, "weight") and skip_key not in module_name:
assert hasattr(encoder_pointer, "weight")
encoder_pointer.weight = decoder_pointer.weight
if hasattr(decoder_pointer, "bias"):
assert hasattr(encoder_pointer, "bias")
encoder_pointer.bias = decoder_pointer.bias
print(module_name+' is tied')
return
encoder_modules = encoder_pointer._modules
decoder_modules = decoder_pointer._modules
if len(decoder_modules) > 0:
assert (
len(encoder_modules) > 0
), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()])
encoder_layer_pos = 0
for name, module in decoder_modules.items():
if name.isdigit():
encoder_name = str(int(name) + encoder_layer_pos)
decoder_name = name
if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len(
encoder_modules
) != len(decoder_modules):
# this can happen if the name corresponds to the position in a module list of layers
# in this case the decoder has added a cross-attention that the encoder does not have
# thus skip this step and subtract one layer pos from encoder
encoder_layer_pos -= 1
continue
elif name not in encoder_modules:
continue
elif depth > 500:
raise ValueError(
"Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model."
)
else:
decoder_name = encoder_name = name
tie_encoder_to_decoder_recursively(
decoder_modules[decoder_name],
encoder_modules[encoder_name],
module_name + "/" + name,
uninitialized_encoder_weights,
skip_key,
depth=depth + 1,
)
all_encoder_weights.remove(module_name + "/" + encoder_name)
uninitialized_encoder_weights += list(all_encoder_weights)
# tie weights recursively
tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, uninitialized_encoder_weights, skip_key)
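# --- Training-step sketch (editorial addition, not part of this file/diff) -------------------
# BLIP_Pretrain.forward returns three losses (image-text contrastive, image-text matching,
# language modelling) that are typically just summed. The queue update calls concat_all_gather,
# so this only runs with torch.distributed initialized; the alpha argument weights the
# momentum-distilled soft targets and its schedule is left to the caller.
def pretrain_step(model, optimizer, image, caption, alpha):
    loss_ita, loss_itm, loss_lm = model(image, caption, alpha)
    loss = loss_ita + loss_itm + loss_lm
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()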

319
ldm/models/blip_retrieval.py Normal file
View File

@ -0,0 +1,319 @@
from models.med import BertConfig, BertModel
from transformers import BertTokenizer
import torch
from torch import nn
import torch.nn.functional as F
from models.blip import create_vit, init_tokenizer, load_checkpoint
class BLIP_Retrieval(nn.Module):
def __init__(self,
med_config = 'configs/med_config.json',
image_size = 384,
vit = 'base',
vit_grad_ckpt = False,
vit_ckpt_layer = 0,
embed_dim = 256,
queue_size = 57600,
momentum = 0.995,
negative_all_rank = False,
):
"""
Args:
med_config (str): path for the mixture of encoder-decoder model's configuration file
image_size (int): input image size
vit (str): model size of vision transformer
"""
super().__init__()
self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
self.tokenizer = init_tokenizer()
med_config = BertConfig.from_json_file(med_config)
med_config.encoder_width = vision_width
self.text_encoder = BertModel(config=med_config, add_pooling_layer=False)
text_width = self.text_encoder.config.hidden_size
self.vision_proj = nn.Linear(vision_width, embed_dim)
self.text_proj = nn.Linear(text_width, embed_dim)
self.itm_head = nn.Linear(text_width, 2)
# create momentum encoders
self.visual_encoder_m, vision_width = create_vit(vit,image_size)
self.vision_proj_m = nn.Linear(vision_width, embed_dim)
self.text_encoder_m = BertModel(config=med_config, add_pooling_layer=False)
self.text_proj_m = nn.Linear(text_width, embed_dim)
self.model_pairs = [[self.visual_encoder,self.visual_encoder_m],
[self.vision_proj,self.vision_proj_m],
[self.text_encoder,self.text_encoder_m],
[self.text_proj,self.text_proj_m],
]
self.copy_params()
# create the queue
self.register_buffer("image_queue", torch.randn(embed_dim, queue_size))
self.register_buffer("text_queue", torch.randn(embed_dim, queue_size))
self.register_buffer("idx_queue", torch.full((1,queue_size),-100))
self.register_buffer("ptr_queue", torch.zeros(1, dtype=torch.long))
self.image_queue = nn.functional.normalize(self.image_queue, dim=0)
self.text_queue = nn.functional.normalize(self.text_queue, dim=0)
self.queue_size = queue_size
self.momentum = momentum
self.temp = nn.Parameter(0.07*torch.ones([]))
self.negative_all_rank = negative_all_rank
def forward(self, image, caption, alpha, idx):
with torch.no_grad():
self.temp.clamp_(0.001,0.5)
image_embeds = self.visual_encoder(image)
image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
image_feat = F.normalize(self.vision_proj(image_embeds[:,0,:]),dim=-1)
text = self.tokenizer(caption, padding='max_length', truncation=True, max_length=35,
return_tensors="pt").to(image.device)
text_output = self.text_encoder(text.input_ids, attention_mask = text.attention_mask,
return_dict = True, mode = 'text')
text_feat = F.normalize(self.text_proj(text_output.last_hidden_state[:,0,:]),dim=-1)
###============== Image-text Contrastive Learning ===================###
idx = idx.view(-1,1)
idx_all = torch.cat([idx.t(), self.idx_queue.clone().detach()],dim=1)
pos_idx = torch.eq(idx, idx_all).float()
sim_targets = pos_idx / pos_idx.sum(1,keepdim=True)
# get momentum features
with torch.no_grad():
self._momentum_update()
image_embeds_m = self.visual_encoder_m(image)
image_feat_m = F.normalize(self.vision_proj_m(image_embeds_m[:,0,:]),dim=-1)
image_feat_m_all = torch.cat([image_feat_m.t(),self.image_queue.clone().detach()],dim=1)
text_output_m = self.text_encoder_m(text.input_ids, attention_mask = text.attention_mask,
return_dict = True, mode = 'text')
text_feat_m = F.normalize(self.text_proj_m(text_output_m.last_hidden_state[:,0,:]),dim=-1)
text_feat_m_all = torch.cat([text_feat_m.t(),self.text_queue.clone().detach()],dim=1)
sim_i2t_m = image_feat_m @ text_feat_m_all / self.temp
sim_t2i_m = text_feat_m @ image_feat_m_all / self.temp
sim_i2t_targets = alpha * F.softmax(sim_i2t_m, dim=1) + (1 - alpha) * sim_targets
sim_t2i_targets = alpha * F.softmax(sim_t2i_m, dim=1) + (1 - alpha) * sim_targets
sim_i2t = image_feat @ text_feat_m_all / self.temp
sim_t2i = text_feat @ image_feat_m_all / self.temp
loss_i2t = -torch.sum(F.log_softmax(sim_i2t, dim=1)*sim_i2t_targets,dim=1).mean()
loss_t2i = -torch.sum(F.log_softmax(sim_t2i, dim=1)*sim_t2i_targets,dim=1).mean()
loss_ita = (loss_i2t+loss_t2i)/2
idxs = concat_all_gather(idx)
self._dequeue_and_enqueue(image_feat_m, text_feat_m, idxs)
###============== Image-text Matching ===================###
encoder_input_ids = text.input_ids.clone()
encoder_input_ids[:,0] = self.tokenizer.enc_token_id
# forward the positive image-text pair
bs = image.size(0)
output_pos = self.text_encoder(encoder_input_ids,
attention_mask = text.attention_mask,
encoder_hidden_states = image_embeds,
encoder_attention_mask = image_atts,
return_dict = True,
)
if self.negative_all_rank:
# compute sample similarity
with torch.no_grad():
mask = torch.eq(idx, idxs.t())
image_feat_world = concat_all_gather(image_feat)
text_feat_world = concat_all_gather(text_feat)
sim_i2t = image_feat @ text_feat_world.t() / self.temp
sim_t2i = text_feat @ image_feat_world.t() / self.temp
weights_i2t = F.softmax(sim_i2t,dim=1)
weights_i2t.masked_fill_(mask, 0)
weights_t2i = F.softmax(sim_t2i,dim=1)
weights_t2i.masked_fill_(mask, 0)
image_embeds_world = all_gather_with_grad(image_embeds)
# select a negative image (from all ranks) for each text
image_embeds_neg = []
for b in range(bs):
neg_idx = torch.multinomial(weights_t2i[b], 1).item()
image_embeds_neg.append(image_embeds_world[neg_idx])
image_embeds_neg = torch.stack(image_embeds_neg,dim=0)
# select a negative text (from all ranks) for each image
input_ids_world = concat_all_gather(encoder_input_ids)
att_mask_world = concat_all_gather(text.attention_mask)
text_ids_neg = []
text_atts_neg = []
for b in range(bs):
neg_idx = torch.multinomial(weights_i2t[b], 1).item()
text_ids_neg.append(input_ids_world[neg_idx])
text_atts_neg.append(att_mask_world[neg_idx])
else:
with torch.no_grad():
mask = torch.eq(idx, idx.t())
sim_i2t = image_feat @ text_feat.t() / self.temp
sim_t2i = text_feat @ image_feat.t() / self.temp
weights_i2t = F.softmax(sim_i2t,dim=1)
weights_i2t.masked_fill_(mask, 0)
weights_t2i = F.softmax(sim_t2i,dim=1)
weights_t2i.masked_fill_(mask, 0)
# select a negative image (from same rank) for each text
image_embeds_neg = []
for b in range(bs):
neg_idx = torch.multinomial(weights_t2i[b], 1).item()
image_embeds_neg.append(image_embeds[neg_idx])
image_embeds_neg = torch.stack(image_embeds_neg,dim=0)
# select a negative text (from same rank) for each image
text_ids_neg = []
text_atts_neg = []
for b in range(bs):
neg_idx = torch.multinomial(weights_i2t[b], 1).item()
text_ids_neg.append(encoder_input_ids[neg_idx])
text_atts_neg.append(text.attention_mask[neg_idx])
text_ids_neg = torch.stack(text_ids_neg,dim=0)
text_atts_neg = torch.stack(text_atts_neg,dim=0)
text_ids_all = torch.cat([encoder_input_ids, text_ids_neg],dim=0)
text_atts_all = torch.cat([text.attention_mask, text_atts_neg],dim=0)
image_embeds_all = torch.cat([image_embeds_neg,image_embeds],dim=0)
image_atts_all = torch.cat([image_atts,image_atts],dim=0)
output_neg = self.text_encoder(text_ids_all,
attention_mask = text_atts_all,
encoder_hidden_states = image_embeds_all,
encoder_attention_mask = image_atts_all,
return_dict = True,
)
vl_embeddings = torch.cat([output_pos.last_hidden_state[:,0,:], output_neg.last_hidden_state[:,0,:]],dim=0)
vl_output = self.itm_head(vl_embeddings)
itm_labels = torch.cat([torch.ones(bs,dtype=torch.long),torch.zeros(2*bs,dtype=torch.long)],
dim=0).to(image.device)
loss_itm = F.cross_entropy(vl_output, itm_labels)
return loss_ita, loss_itm
@torch.no_grad()
def copy_params(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(), model_pair[1].parameters()):
param_m.data.copy_(param.data) # initialize
param_m.requires_grad = False # not update by gradient
@torch.no_grad()
def _momentum_update(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(), model_pair[1].parameters()):
param_m.data = param_m.data * self.momentum + param.data * (1. - self.momentum)
@torch.no_grad()
def _dequeue_and_enqueue(self, image_feat, text_feat, idxs):
# gather keys before updating queue
image_feats = concat_all_gather(image_feat)
text_feats = concat_all_gather(text_feat)
batch_size = image_feats.shape[0]
ptr = int(self.ptr_queue)
assert self.queue_size % batch_size == 0 # for simplicity
# replace the keys at ptr (dequeue and enqueue)
self.image_queue[:, ptr:ptr + batch_size] = image_feats.T
self.text_queue[:, ptr:ptr + batch_size] = text_feats.T
self.idx_queue[:, ptr:ptr + batch_size] = idxs.T
ptr = (ptr + batch_size) % self.queue_size # move pointer
self.ptr_queue[0] = ptr
def blip_retrieval(pretrained='',**kwargs):
model = BLIP_Retrieval(**kwargs)
if pretrained:
model,msg = load_checkpoint(model,pretrained)
print("missing keys:")
print(msg.missing_keys)
return model
@torch.no_grad()
def concat_all_gather(tensor):
"""
Performs all_gather operation on the provided tensors.
*** Warning ***: torch.distributed.all_gather has no gradient.
"""
tensors_gather = [torch.ones_like(tensor)
for _ in range(torch.distributed.get_world_size())]
torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
output = torch.cat(tensors_gather, dim=0)
return output
class GatherLayer(torch.autograd.Function):
"""
Gather tensors from all workers with support for backward propagation:
This implementation does not cut the gradients as torch.distributed.all_gather does.
"""
@staticmethod
def forward(ctx, x):
output = [torch.zeros_like(x) for _ in range(torch.distributed.get_world_size())]
torch.distributed.all_gather(output, x)
return tuple(output)
@staticmethod
def backward(ctx, *grads):
all_gradients = torch.stack(grads)
torch.distributed.all_reduce(all_gradients)
return all_gradients[torch.distributed.get_rank()]
def all_gather_with_grad(tensors):
"""
Performs all_gather operation on the provided tensors.
Graph remains connected for backward grad computation.
"""
# Queue the gathered tensors
world_size = torch.distributed.get_world_size()
# There is no need for reduction in the single-proc case
if world_size == 1:
return tensors
tensor_all = GatherLayer.apply(tensors)
return torch.cat(tensor_all, dim=0)
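# --- Sanity-check sketch (editorial addition, not part of this file/diff) --------------------
# concat_all_gather() detaches gradients (plain all_gather), which is fine for the feature
# queues, while all_gather_with_grad() keeps the graph connected through GatherLayer so the
# cross-rank hard-negative mining can still backpropagate. A one-process check of both paths,
# assuming a throwaway single-rank gloo group:
import os as _os
import torch.distributed as dist

def single_process_gather_check():
    _os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    _os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=0, world_size=1)   # one-rank group so the collectives are valid
    x = torch.randn(4, 8, requires_grad=True)
    y = all_gather_with_grad(x)            # identity when world_size == 1; graph stays connected
    y.sum().backward()                     # gradients reach x
    z = concat_all_gather(x.detach())      # plain all_gather; no gradient tracking
    assert x.grad is not None and torch.equal(z, x.detach())
    dist.destroy_process_group()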

186
ldm/models/blip_vqa.py Normal file
View File

@ -0,0 +1,186 @@
from models.med import BertConfig, BertModel, BertLMHeadModel
from models.blip import create_vit, init_tokenizer, load_checkpoint
import torch
from torch import nn
import torch.nn.functional as F
from transformers import BertTokenizer
import numpy as np
class BLIP_VQA(nn.Module):
def __init__(self,
med_config = 'configs/med_config.json',
image_size = 480,
vit = 'base',
vit_grad_ckpt = False,
vit_ckpt_layer = 0,
):
"""
Args:
med_config (str): path for the mixture of encoder-decoder model's configuration file
image_size (int): input image size
vit (str): model size of vision transformer
"""
super().__init__()
self.visual_encoder, vision_width = create_vit(vit, image_size, vit_grad_ckpt, vit_ckpt_layer, drop_path_rate=0.1)
self.tokenizer = init_tokenizer()
encoder_config = BertConfig.from_json_file(med_config)
encoder_config.encoder_width = vision_width
self.text_encoder = BertModel(config=encoder_config, add_pooling_layer=False)
decoder_config = BertConfig.from_json_file(med_config)
self.text_decoder = BertLMHeadModel(config=decoder_config)
def forward(self, image, question, answer=None, n=None, weights=None, train=True, inference='rank', k_test=128):
image_embeds = self.visual_encoder(image)
image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
question = self.tokenizer(question, padding='longest', truncation=True, max_length=35,
return_tensors="pt").to(image.device)
question.input_ids[:,0] = self.tokenizer.enc_token_id
if train:
'''
n: number of answers for each question
weights: weight for each answer
'''
answer = self.tokenizer(answer, padding='longest', return_tensors="pt").to(image.device)
answer.input_ids[:,0] = self.tokenizer.bos_token_id
answer_targets = answer.input_ids.masked_fill(answer.input_ids == self.tokenizer.pad_token_id, -100)
question_output = self.text_encoder(question.input_ids,
attention_mask = question.attention_mask,
encoder_hidden_states = image_embeds,
encoder_attention_mask = image_atts,
return_dict = True)
question_states = []
question_atts = []
for b, n in enumerate(n):  # the loop variable shadows the argument n (per-question answer counts)
question_states += [question_output.last_hidden_state[b]]*n
question_atts += [question.attention_mask[b]]*n
question_states = torch.stack(question_states,0)
question_atts = torch.stack(question_atts,0)
answer_output = self.text_decoder(answer.input_ids,
attention_mask = answer.attention_mask,
encoder_hidden_states = question_states,
encoder_attention_mask = question_atts,
labels = answer_targets,
return_dict = True,
reduction = 'none',
)
loss = weights * answer_output.loss
loss = loss.sum()/image.size(0)
return loss
else:
question_output = self.text_encoder(question.input_ids,
attention_mask = question.attention_mask,
encoder_hidden_states = image_embeds,
encoder_attention_mask = image_atts,
return_dict = True)
if inference=='generate':
num_beams = 3
question_states = question_output.last_hidden_state.repeat_interleave(num_beams,dim=0)
question_atts = torch.ones(question_states.size()[:-1],dtype=torch.long).to(question_states.device)
model_kwargs = {"encoder_hidden_states": question_states, "encoder_attention_mask":question_atts}
bos_ids = torch.full((image.size(0),1),fill_value=self.tokenizer.bos_token_id,device=image.device)
outputs = self.text_decoder.generate(input_ids=bos_ids,
max_length=10,
min_length=1,
num_beams=num_beams,
eos_token_id=self.tokenizer.sep_token_id,
pad_token_id=self.tokenizer.pad_token_id,
**model_kwargs)
answers = []
for output in outputs:
answer = self.tokenizer.decode(output, skip_special_tokens=True)
answers.append(answer)
return answers
elif inference=='rank':
max_ids = self.rank_answer(question_output.last_hidden_state, question.attention_mask,
answer.input_ids, answer.attention_mask, k_test)
return max_ids
def rank_answer(self, question_states, question_atts, answer_ids, answer_atts, k):
num_ques = question_states.size(0)
start_ids = answer_ids[0,0].repeat(num_ques,1) # bos token
start_output = self.text_decoder(start_ids,
encoder_hidden_states = question_states,
encoder_attention_mask = question_atts,
return_dict = True,
reduction = 'none')
logits = start_output.logits[:,0,:] # first token's logit
# topk_probs: top-k probability
# topk_ids: [num_question, k]
answer_first_token = answer_ids[:,1]
prob_first_token = F.softmax(logits,dim=1).index_select(dim=1, index=answer_first_token)
topk_probs, topk_ids = prob_first_token.topk(k,dim=1)
# answer input: [num_question*k, answer_len]
input_ids = []
input_atts = []
for b, topk_id in enumerate(topk_ids):
input_ids.append(answer_ids.index_select(dim=0, index=topk_id))
input_atts.append(answer_atts.index_select(dim=0, index=topk_id))
input_ids = torch.cat(input_ids,dim=0)
input_atts = torch.cat(input_atts,dim=0)
targets_ids = input_ids.masked_fill(input_ids == self.tokenizer.pad_token_id, -100)
# repeat encoder's output for top-k answers
question_states = tile(question_states, 0, k)
question_atts = tile(question_atts, 0, k)
output = self.text_decoder(input_ids,
attention_mask = input_atts,
encoder_hidden_states = question_states,
encoder_attention_mask = question_atts,
labels = targets_ids,
return_dict = True,
reduction = 'none')
log_probs_sum = -output.loss
log_probs_sum = log_probs_sum.view(num_ques,k)
max_topk_ids = log_probs_sum.argmax(dim=1)
max_ids = topk_ids[max_topk_ids>=0,max_topk_ids] # the all-True mask acts as arange(num_ques), so this picks topk_ids[i, max_topk_ids[i]]
return max_ids
def blip_vqa(pretrained='',**kwargs):
model = BLIP_VQA(**kwargs)
if pretrained:
model,msg = load_checkpoint(model,pretrained)
# assert(len(msg.missing_keys)==0)
return model
def tile(x, dim, n_tile):
init_dim = x.size(dim)
repeat_idx = [1] * x.dim()
repeat_idx[dim] = n_tile
x = x.repeat(*(repeat_idx))
order_index = torch.LongTensor(np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
return torch.index_select(x, dim, order_index.to(x.device))
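# --- Illustration (editorial addition, not part of this file/diff) ---------------------------
# tile() repeats each slice along `dim` n_tile times consecutively, i.e. it behaves like
# torch.repeat_interleave; rank_answer() uses it to pair every question state with its k
# candidate answers. Run in this module so tile() and torch are in scope:
# >>> x = torch.tensor([[1, 1], [2, 2]])
# >>> tile(x, 0, 3)
# tensor([[1, 1],
#         [1, 1],
#         [1, 1],
#         [2, 2],
#         [2, 2],
#         [2, 2]])
# >>> torch.equal(tile(x, 0, 3), torch.repeat_interleave(x, 3, dim=0))
# True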

955
ldm/models/med.py Normal file
View File

@ -0,0 +1,955 @@
'''
* Copyright (c) 2022, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
* By Junnan Li
* Based on huggingface code base
* https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
'''
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from torch import Tensor, device, dtype, nn
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from transformers.activations import ACT2FN
from transformers.file_utils import (
ModelOutput,
)
from transformers.modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
NextSentencePredictorOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from transformers.modeling_utils import (
PreTrainedModel,
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from transformers.utils import logging
from transformers.models.bert.configuration_bert import BertConfig
logger = logging.get_logger(__name__)
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word and position embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.config = config
def forward(
self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
embeddings = inputs_embeds
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class BertSelfAttention(nn.Module):
def __init__(self, config, is_cross_attention):
super().__init__()
self.config = config
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
if is_cross_attention:
self.key = nn.Linear(config.encoder_width, self.all_head_size)
self.value = nn.Linear(config.encoder_width, self.all_head_size)
else:
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.save_attention = False
def save_attn_gradients(self, attn_gradients):
self.attn_gradients = attn_gradients
def get_attn_gradients(self):
return self.attn_gradients
def save_attention_map(self, attention_map):
self.attention_map = attention_map
def get_attention_map(self):
return self.attention_map
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
mixed_query_layer = self.query(hidden_states)
# If this is instantiated as a cross-attention module, the keys
# and values come from an encoder; the attention mask needs to be
# such that the encoder's padding tokens are not attended to.
is_cross_attention = encoder_hidden_states is not None
if is_cross_attention:
key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
attention_mask = encoder_attention_mask
elif past_key_value is not None:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
else:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
past_key_value = (key_layer, value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
seq_length = hidden_states.size()[1]
position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
distance = position_ids_l - position_ids_r
positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
if self.position_embedding_type == "relative_key":
relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores
elif self.position_embedding_type == "relative_key_query":
relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)
if is_cross_attention and self.save_attention:
self.save_attention_map(attention_probs)
attention_probs.register_hook(self.save_attn_gradients)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs_dropped = self.dropout(attention_probs)
# Mask heads if we want to
if head_mask is not None:
attention_probs_dropped = attention_probs_dropped * head_mask
context_layer = torch.matmul(attention_probs_dropped, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
outputs = outputs + (past_key_value,)
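# (added note, not in the upstream file) outputs is (context_layer, [attention_probs,] past_key_value):
# context_layer has shape (batch, seq_len, all_head_size); past_key_value caches the (key, value)
# tensors of shape (batch, num_heads, seq_len, head_size) so the decoder can reuse them when generating.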
return outputs
class BertSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BertAttention(nn.Module):
def __init__(self, config, is_cross_attention=False):
super().__init__()
self.self = BertSelfAttention(config, is_cross_attention)
self.output = BertSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update hyper params and store pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
class BertIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class BertOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BertLayer(nn.Module):
def __init__(self, config, layer_num):
super().__init__()
self.config = config
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = BertAttention(config)
self.layer_num = layer_num
if self.config.add_cross_attention:
self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention)
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
mode=None,
):
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
if mode=='multimodal':
assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers"
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions=output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class BertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([BertLayer(config,i) for i in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
mode='multimodal',
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
next_decoder_cache = () if use_cache else None
for i in range(self.config.num_hidden_layers):
layer_module = self.layer[i]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warn(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs, past_key_value, output_attentions)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(layer_module),
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
mode=mode,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
mode=mode,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class BertPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class BertLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = BertPredictionHeadTransform(config)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class BertOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = BertLMPredictionHead(config)
def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class BertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BertConfig
base_model_prefix = "bert"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
""" Initialize the weights """
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
class BertModel(BertPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in `Attention is
all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
To behave as a decoder the model needs to be initialized with the :obj:`is_decoder` argument of the
configuration set to :obj:`True`. To be used in a Seq2Seq model, the model needs to be initialized with
both the :obj:`is_decoder` argument and :obj:`add_cross_attention` set to :obj:`True`; an
:obj:`encoder_hidden_states` is then expected as an input to the forward pass.
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = BertEmbeddings(config)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config) if add_pooling_layer else None
self.init_weights()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor:
"""
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
attention_mask (:obj:`torch.Tensor`):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (:obj:`Tuple[int]`):
The shape of the input to the model.
device: (:obj:`torch.device`):
The device of the input to the model.
Returns:
:obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
"""
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
# Provided a padding mask of dimensions [batch_size, seq_length]
# - if the model is a decoder, apply a causal mask in addition to the padding mask
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
if is_decoder:
batch_size, seq_length = input_shape
seq_ids = torch.arange(seq_length, device=device)
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
# in case past_key_values are used we need to add a prefix ones mask to the causal mask
# causal and attention masks must have same type with pytorch version < 1.3
causal_mask = causal_mask.to(attention_mask.dtype)
if causal_mask.shape[1] < attention_mask.shape[1]:
prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
causal_mask = torch.cat(
[
torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype),
causal_mask,
],
axis=-1,
)
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
else:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
input_shape, attention_mask.shape
)
)
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
return extended_attention_mask
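# Illustrative sketch (not used by this module; toy shapes are assumptions) of what
# get_extended_attention_mask produces for a decoder with a batch of 2 sequences of length 4:
#   attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])                 # [batch, seq]
#   seq_ids = torch.arange(4)
#   causal = (seq_ids[None, None, :].repeat(2, 4, 1) <= seq_ids[None, :, None]).float()
#   extended = causal[:, None, :, :] * attention_mask[:, None, None, :].float()
#   extended = (1.0 - extended) * -10000.0                                       # [batch, 1, seq, seq]
# The result is added to the raw attention scores, so padded and future positions receive -10000.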
def forward(
self,
input_ids=None,
attention_mask=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
is_decoder=False,
mode='multimodal',
):
r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
use_cache (:obj:`bool`, `optional`):
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
decoding (see :obj:`past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if is_decoder:
use_cache = use_cache if use_cache is not None else self.config.use_cache
else:
use_cache = False
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
batch_size, seq_length = input_shape
device = input_ids.device
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
batch_size, seq_length = input_shape
device = inputs_embeds.device
elif encoder_embeds is not None:
input_shape = encoder_embeds.size()[:-1]
batch_size, seq_length = input_shape
device = encoder_embeds.device
else:
raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")
# past_key_values_length
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
if attention_mask is None:
attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape,
device, is_decoder)
# If a 2D or 3D attention mask is provided for the cross-attention
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
if encoder_hidden_states is not None:
if type(encoder_hidden_states) == list:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
else:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
if type(encoder_attention_mask) == list:
encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
elif encoder_attention_mask is None:
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
else:
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
else:
encoder_extended_attention_mask = None
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
if encoder_embeds is None:
embedding_output = self.embeddings(
input_ids=input_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
past_key_values_length=past_key_values_length,
)
else:
embedding_output = encoder_embeds
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
mode=mode,
)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
past_key_values=encoder_outputs.past_key_values,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
cross_attentions=encoder_outputs.cross_attentions,
)
class BertLMHeadModel(BertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
self.bert = BertModel(config, add_pooling_layer=False)
self.cls = BertOnlyMLMHead(config)
self.init_weights()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
def forward(
self,
input_ids=None,
attention_mask=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
labels=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
return_logits=False,
is_decoder=True,
reduction='mean',
mode='multimodal',
):
r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
use_cache (:obj:`bool`, `optional`):
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
decoding (see :obj:`past_key_values`).
Returns:
Example::
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
>>> import torch
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
>>> config = BertConfig.from_pretrained("bert-base-cased")
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> prediction_logits = outputs.logits
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
is_decoder=is_decoder,
mode=mode,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
if return_logits:
return prediction_scores[:, :-1, :].contiguous()
lm_loss = None
if labels is not None:
# we are doing next-token prediction; shift prediction scores and input ids by one
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
labels = labels[:, 1:].contiguous()
loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if reduction=='none':
lm_loss = lm_loss.view(prediction_scores.size(0),-1).sum(1)
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((lm_loss,) + output) if lm_loss is not None else output
return CausalLMOutputWithCrossAttentions(
loss=lm_loss,
logits=prediction_scores,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
# if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# cut decoder_input_ids if past is used
if past is not None:
input_ids = input_ids[:, -1:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past,
"encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
"encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
"is_decoder": True,
}
def _reorder_cache(self, past, beam_idx):
reordered_past = ()
for layer_past in past:
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
return reordered_past
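# Hedged usage sketch for BertLMHeadModel (illustrative only; the config values and the
# mode='text' setting are assumptions, not part of this file):
#   from transformers import BertConfig
#   cfg = BertConfig(vocab_size=1000, hidden_size=64, num_hidden_layers=2,
#                    num_attention_heads=2, intermediate_size=128, is_decoder=True)
#   lm = BertLMHeadModel(cfg)
#   ids = torch.randint(0, 1000, (1, 8))
#   out = lm(input_ids=ids, labels=ids, mode='text')   # next-token loss + logits over the vocabulary
# With labels given, prediction scores are shifted by one position before the cross-entropy loss.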

843 ldm/models/nlvr_encoder.py Normal file
View File

@@ -0,0 +1,843 @@
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from torch import Tensor, device, dtype, nn
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from transformers.activations import ACT2FN
from transformers.file_utils import (
ModelOutput,
)
from transformers.modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
NextSentencePredictorOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from transformers.modeling_utils import (
PreTrainedModel,
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from transformers.utils import logging
from transformers.models.bert.configuration_bert import BertConfig
logger = logging.get_logger(__name__)
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word and position embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.config = config
def forward(
self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
embeddings = inputs_embeds
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class BertSelfAttention(nn.Module):
def __init__(self, config, is_cross_attention):
super().__init__()
self.config = config
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
if is_cross_attention:
self.key = nn.Linear(config.encoder_width, self.all_head_size)
self.value = nn.Linear(config.encoder_width, self.all_head_size)
else:
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.save_attention = False
def save_attn_gradients(self, attn_gradients):
self.attn_gradients = attn_gradients
def get_attn_gradients(self):
return self.attn_gradients
def save_attention_map(self, attention_map):
self.attention_map = attention_map
def get_attention_map(self):
return self.attention_map
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
mixed_query_layer = self.query(hidden_states)
# If this is instantiated as a cross-attention module, the keys
# and values come from an encoder; the attention mask needs to be
# such that the encoder's padding tokens are not attended to.
is_cross_attention = encoder_hidden_states is not None
if is_cross_attention:
key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
attention_mask = encoder_attention_mask
elif past_key_value is not None:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
else:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
past_key_value = (key_layer, value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
seq_length = hidden_states.size()[1]
position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
distance = position_ids_l - position_ids_r
positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
if self.position_embedding_type == "relative_key":
relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores
elif self.position_embedding_type == "relative_key_query":
relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)
if is_cross_attention and self.save_attention:
self.save_attention_map(attention_probs)
attention_probs.register_hook(self.save_attn_gradients)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs_dropped = self.dropout(attention_probs)
# Mask heads if we want to
if head_mask is not None:
attention_probs_dropped = attention_probs_dropped * head_mask
context_layer = torch.matmul(attention_probs_dropped, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
outputs = outputs + (past_key_value,)
return outputs
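# Shape sketch (illustrative; 768/12 are assumed example values): with hidden_size=768 and 12 heads,
#   hidden_states            [batch, seq, 768]
#   query/key/value layers   [batch, 12, seq, 64]   (after transpose_for_scores)
#   attention_scores         [batch, 12, seq, seq]  (q @ k^T / sqrt(64))
#   context_layer            [batch, seq, 768]      (heads merged back)
# For cross-attention the key/value sequence length is the encoder's, not the decoder's.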
class BertSelfOutput(nn.Module):
def __init__(self, config, twin=False, merge=False):
super().__init__()
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if twin:
self.dense0 = nn.Linear(config.hidden_size, config.hidden_size)
self.dense1 = nn.Linear(config.hidden_size, config.hidden_size)
else:
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if merge:
self.act = ACT2FN[config.hidden_act]
self.merge_layer = nn.Linear(config.hidden_size * 2, config.hidden_size)
self.merge = True
else:
self.merge = False
def forward(self, hidden_states, input_tensor):
if type(hidden_states) == list:
hidden_states0 = self.dense0(hidden_states[0])
hidden_states1 = self.dense1(hidden_states[1])
if self.merge:
#hidden_states = self.merge_layer(self.act(torch.cat([hidden_states0,hidden_states1],dim=-1)))
hidden_states = self.merge_layer(torch.cat([hidden_states0,hidden_states1],dim=-1))
else:
hidden_states = (hidden_states0+hidden_states1)/2
else:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BertAttention(nn.Module):
def __init__(self, config, is_cross_attention=False, layer_num=-1):
super().__init__()
if is_cross_attention:
self.self0 = BertSelfAttention(config, is_cross_attention)
self.self1 = BertSelfAttention(config, is_cross_attention)
else:
self.self = BertSelfAttention(config, is_cross_attention)
self.output = BertSelfOutput(config, twin=is_cross_attention, merge=(is_cross_attention and layer_num>=6))
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update hyper params and store pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
):
if type(encoder_hidden_states)==list:
self_outputs0 = self.self0(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states[0],
encoder_attention_mask[0],
past_key_value,
output_attentions,
)
self_outputs1 = self.self1(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states[1],
encoder_attention_mask[1],
past_key_value,
output_attentions,
)
attention_output = self.output([self_outputs0[0],self_outputs1[0]], hidden_states)
outputs = (attention_output,) + self_outputs0[1:] # add attentions if we output them
else:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
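# When encoder_hidden_states is a list of two feature tensors (one per image stream),
# each stream gets its own cross-attention (self0/self1) and BertSelfOutput merges the two
# outputs: averaged for early layers, concatenated and linearly merged from layer 6 onwards.
# Illustrative call shape (names and sizes are assumptions, not part of this file):
#   attn = BertAttention(config, is_cross_attention=True, layer_num=6)
#   out = attn(hidden_states,                        # [batch, seq, hidden]
#              encoder_hidden_states=[img0, img1],   # each [batch, n_patches, encoder_width]
#              encoder_attention_mask=[m0, m1])      # each a broadcastable additive mask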
class BertIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class BertOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BertLayer(nn.Module):
def __init__(self, config, layer_num):
super().__init__()
self.config = config
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = BertAttention(config)
self.layer_num = layer_num
if self.config.add_cross_attention:
self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention, layer_num=layer_num)
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
mode=None,
):
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
if mode=='multimodal':
assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers"
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions=output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
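# apply_chunking_to_forward above is roughly equivalent to the following sketch
# (chunking only trades compute layout for memory; with chunk_size_feed_forward == 0 it is a plain call):
#   if self.chunk_size_feed_forward == 0:
#       layer_output = self.feed_forward_chunk(attention_output)
#   else:
#       chunks = attention_output.split(self.chunk_size_feed_forward, dim=self.seq_len_dim)
#       layer_output = torch.cat([self.feed_forward_chunk(c) for c in chunks], dim=self.seq_len_dim)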
class BertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([BertLayer(config,i) for i in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
mode='multimodal',
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
next_decoder_cache = () if use_cache else None
for i in range(self.config.num_hidden_layers):
layer_module = self.layer[i]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs, past_key_value, output_attentions)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(layer_module),
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
mode=mode,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
mode=mode,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class BertPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class BertLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = BertPredictionHeadTransform(config)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class BertOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = BertLMPredictionHead(config)
def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class BertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BertConfig
base_model_prefix = "bert"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
""" Initialize the weights """
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
class BertModel(BertPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in `Attention is
all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
To behave as a decoder the model needs to be initialized with the :obj:`is_decoder` argument of the
configuration set to :obj:`True`; to be used in a Seq2Seq model it additionally needs the
:obj:`add_cross_attention` argument set to :obj:`True`. An :obj:`encoder_hidden_states` is then expected as an
input to the forward pass.
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = BertEmbeddings(config)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config) if add_pooling_layer else None
self.init_weights()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor:
"""
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
attention_mask (:obj:`torch.Tensor`):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (:obj:`Tuple[int]`):
The shape of the input to the model.
device: (:obj:`torch.device`):
The device of the input to the model.
Returns:
:obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
"""
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
# Provided a padding mask of dimensions [batch_size, seq_length]
# - if the model is a decoder, apply a causal mask in addition to the padding mask
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
if is_decoder:
batch_size, seq_length = input_shape
seq_ids = torch.arange(seq_length, device=device)
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
# in case past_key_values are used we need to add a prefix ones mask to the causal mask
# causal and attention masks must have same type with pytorch version < 1.3
causal_mask = causal_mask.to(attention_mask.dtype)
if causal_mask.shape[1] < attention_mask.shape[1]:
prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
causal_mask = torch.cat(
[
torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype),
causal_mask,
],
axis=-1,
)
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
else:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
input_shape, attention_mask.shape
)
)
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
return extended_attention_mask
def forward(
self,
input_ids=None,
attention_mask=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
is_decoder=False,
mode='multimodal',
):
r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
use_cache (:obj:`bool`, `optional`):
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
decoding (see :obj:`past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if is_decoder:
use_cache = use_cache if use_cache is not None else self.config.use_cache
else:
use_cache = False
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
batch_size, seq_length = input_shape
device = input_ids.device
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
batch_size, seq_length = input_shape
device = inputs_embeds.device
elif encoder_embeds is not None:
input_shape = encoder_embeds.size()[:-1]
batch_size, seq_length = input_shape
device = encoder_embeds.device
else:
raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")
# past_key_values_length
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
if attention_mask is None:
attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape,
device, is_decoder)
# If a 2D or 3D attention mask is provided for the cross-attention
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
if encoder_hidden_states is not None:
if type(encoder_hidden_states) == list:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
else:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
if type(encoder_attention_mask) == list:
encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
elif encoder_attention_mask is None:
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
else:
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
else:
encoder_extended_attention_mask = None
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
if encoder_embeds is None:
embedding_output = self.embeddings(
input_ids=input_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
past_key_values_length=past_key_values_length,
)
else:
embedding_output = encoder_embeds
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
mode=mode,
)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
past_key_values=encoder_outputs.past_key_values,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
cross_attentions=encoder_outputs.cross_attentions,
)

305 ldm/models/vit.py Normal file
View File

@@ -0,0 +1,305 @@
'''
* Copyright (c) 2022, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
* By Junnan Li
* Based on timm code base
* https://github.com/rwightman/pytorch-image-models/tree/master/timm
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import partial
from timm.models.vision_transformer import _cfg, PatchEmbed
from timm.models.registry import register_model
from timm.models.layers import trunc_normal_, DropPath
from timm.models.helpers import named_apply, adapt_input_conv
from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper
class Mlp(nn.Module):
""" MLP as used in Vision Transformer, MLP-Mixer and related networks
"""
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
self.scale = qk_scale or head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
self.attn_gradients = None
self.attention_map = None
def save_attn_gradients(self, attn_gradients):
self.attn_gradients = attn_gradients
def get_attn_gradients(self):
return self.attn_gradients
def save_attention_map(self, attention_map):
self.attention_map = attention_map
def get_attention_map(self):
return self.attention_map
def forward(self, x, register_hook=False):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
if register_hook:
self.save_attention_map(attn)
attn.register_hook(self.save_attn_gradients)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_grad_checkpointing=False):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
if use_grad_checkpointing:
self.attn = checkpoint_wrapper(self.attn)
self.mlp = checkpoint_wrapper(self.mlp)
def forward(self, x, register_hook=False):
x = x + self.drop_path(self.attn(self.norm1(x), register_hook=register_hook))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
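# The block is the standard pre-norm transformer update, i.e.
#   x <- x + DropPath(Attn(LN1(x)))
#   x <- x + DropPath(MLP(LN2(x)))
# with attn/mlp optionally wrapped in activation checkpointing when use_grad_checkpointing is set.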
class VisionTransformer(nn.Module):
""" Vision Transformer
A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
https://arxiv.org/abs/2010.11929
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=None,
use_grad_checkpointing=False, ckpt_layer=0):
"""
Args:
img_size (int, tuple): input image size
patch_size (int, tuple): patch size
in_chans (int): number of input channels
num_classes (int): number of classes for classification head
embed_dim (int): embedding dimension
depth (int): depth of transformer
num_heads (int): number of attention heads
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
qkv_bias (bool): enable bias for qkv if True
qk_scale (float): override default qk scale of head_dim ** -0.5 if set
representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
drop_rate (float): dropout rate
attn_drop_rate (float): attention dropout rate
drop_path_rate (float): stochastic depth rate
norm_layer: (nn.Module): normalization layer
"""
super().__init__()
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
self.patch_embed = PatchEmbed(
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
num_patches = self.patch_embed.num_patches
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
self.pos_drop = nn.Dropout(p=drop_rate)
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
self.blocks = nn.ModuleList([
Block(
dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
use_grad_checkpointing=(use_grad_checkpointing and i>=depth-ckpt_layer)
)
for i in range(depth)])
self.norm = norm_layer(embed_dim)
trunc_normal_(self.pos_embed, std=.02)
trunc_normal_(self.cls_token, std=.02)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@torch.jit.ignore
def no_weight_decay(self):
return {'pos_embed', 'cls_token'}
def forward(self, x, register_blk=-1):
B = x.shape[0]
x = self.patch_embed(x)
cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
x = torch.cat((cls_tokens, x), dim=1)
x = x + self.pos_embed[:,:x.size(1),:]
x = self.pos_drop(x)
for i,blk in enumerate(self.blocks):
x = blk(x, register_blk==i)
x = self.norm(x)
return x
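# Token-count sketch (illustrative, using the default arguments img_size=224, patch_size=16):
#   num_patches = (224 // 16) ** 2 = 196
#   x after patch_embed: [B, 196, 768]; after prepending cls_token: [B, 197, 768]
#   pos_embed is sliced to the actual token count x.size(1) before being added.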
@torch.jit.ignore()
def load_pretrained(self, checkpoint_path, prefix=''):
_load_weights(self, checkpoint_path, prefix)
@torch.no_grad()
def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ''):
""" Load weights from .npz checkpoints for official Google Brain Flax implementation
"""
import numpy as np
def _n2p(w, t=True):
if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
w = w.flatten()
if t:
if w.ndim == 4:
w = w.transpose([3, 2, 0, 1])
elif w.ndim == 3:
w = w.transpose([2, 0, 1])
elif w.ndim == 2:
w = w.transpose([1, 0])
return torch.from_numpy(w)
w = np.load(checkpoint_path)
if not prefix and 'opt/target/embedding/kernel' in w:
prefix = 'opt/target/'
if hasattr(model.patch_embed, 'backbone'):
# hybrid
backbone = model.patch_embed.backbone
stem_only = not hasattr(backbone, 'stem')
stem = backbone if stem_only else backbone.stem
stem.conv.weight.copy_(adapt_input_conv(stem.conv.weight.shape[1], _n2p(w[f'{prefix}conv_root/kernel'])))
stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
if not stem_only:
for i, stage in enumerate(backbone.stages):
for j, block in enumerate(stage.blocks):
bp = f'{prefix}block{i + 1}/unit{j + 1}/'
for r in range(3):
getattr(block, f'conv{r + 1}').weight.copy_(_n2p(w[f'{bp}conv{r + 1}/kernel']))
getattr(block, f'norm{r + 1}').weight.copy_(_n2p(w[f'{bp}gn{r + 1}/scale']))
getattr(block, f'norm{r + 1}').bias.copy_(_n2p(w[f'{bp}gn{r + 1}/bias']))
if block.downsample is not None:
block.downsample.conv.weight.copy_(_n2p(w[f'{bp}conv_proj/kernel']))
block.downsample.norm.weight.copy_(_n2p(w[f'{bp}gn_proj/scale']))
block.downsample.norm.bias.copy_(_n2p(w[f'{bp}gn_proj/bias']))
embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
else:
embed_conv_w = adapt_input_conv(
model.patch_embed.proj.weight.shape[1], _n2p(w[f'{prefix}embedding/kernel']))
model.patch_embed.proj.weight.copy_(embed_conv_w)
model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
pos_embed_w = _n2p(w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
if pos_embed_w.shape != model.pos_embed.shape:
pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights
pos_embed_w, model.pos_embed, getattr(model, 'num_tokens', 1), model.patch_embed.grid_size)
model.pos_embed.copy_(pos_embed_w)
model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
# if isinstance(model.head, nn.Linear) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
# model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
# model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
# if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
# model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
# model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
for i, block in enumerate(model.blocks.children()):
block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
block.attn.qkv.weight.copy_(torch.cat([
_n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
block.attn.qkv.bias.copy_(torch.cat([
_n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
for r in range(2):
getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))
def interpolate_pos_embed(pos_embed_checkpoint, visual_encoder):
# interpolate position embedding
embedding_size = pos_embed_checkpoint.shape[-1]
num_patches = visual_encoder.patch_embed.num_patches
num_extra_tokens = visual_encoder.pos_embed.shape[-2] - num_patches
# height (== width) for the checkpoint position embedding
orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
# height (== width) for the new position embedding
new_size = int(num_patches ** 0.5)
if orig_size!=new_size:
# class_token and dist_token are kept unchanged
extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
# only the position tokens are interpolated
pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
pos_tokens = torch.nn.functional.interpolate(
pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
print('reshape position embedding from %d to %d'%(orig_size ** 2,new_size ** 2))
return new_pos_embed
else:
return pos_embed_checkpoint
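# Example of the resize performed above (illustrative numbers): a checkpoint trained at 224px with
# 16px patches has 14*14 = 196 position tokens (+1 cls). Fine-tuning at 384px needs 24*24 = 576,
# so the 196 tokens are reshaped to 14x14, bicubically interpolated to 24x24, flattened back to
# 576, and re-concatenated with the untouched extra (cls) tokens.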

View File

@@ -5,7 +5,7 @@ import clip
from einops import rearrange, repeat
from transformers import CLIPTokenizer, CLIPTextModel
import kornia
import os
from ldm.modules.x_transformer import Encoder, TransformerWrapper # TODO: can we directly rely on lucidrains code and simply add this as a requirement? --> test
@@ -138,8 +138,12 @@ class FrozenCLIPEmbedder(AbstractEncoder):
"""Uses the CLIP transformer encoder for text (from Hugging Face)"""
def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
super().__init__()
self.tokenizer = CLIPTokenizer.from_pretrained(version)
self.transformer = CLIPTextModel.from_pretrained(version)
if os.path.exists("models/clip-vit-large-patch14"):
self.tokenizer = CLIPTokenizer.from_pretrained("models/clip-vit-large-patch14")
self.transformer = CLIPTextModel.from_pretrained("models/clip-vit-large-patch14")
else:
self.tokenizer = CLIPTokenizer.from_pretrained(version)
self.transformer = CLIPTextModel.from_pretrained(version)
self.device = device
self.max_length = max_length
self.freeze()
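# To populate the local models/clip-vit-large-patch14 directory checked above, one option
# (assumed workflow, not part of this commit) is to save the Hugging Face weights locally once:
#   from transformers import CLIPTokenizer, CLIPTextModel
#   CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14").save_pretrained("models/clip-vit-large-patch14")
#   CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").save_pretrained("models/clip-vit-large-patch14")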

Some files were not shown because too many files have changed in this diff.