diff --git a/Cargo.toml b/Cargo.toml
index 3400588..ecd1cfa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,7 +30,7 @@ all-tests = []
 features = [ "doc-only" ]
 
 [dependencies]
-rust_tokenizers = "~3.1.5"
+rust_tokenizers = "~3.1.6"
 tch = "~0.1.7"
 serde_json = "1.0.51"
 serde = {version = "1.0.106", features = ["derive"]}
diff --git a/requirements.txt b/requirements.txt
index 000fefe..4663c9f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 torch == 1.5.0
-transformers == 2.8.0
\ No newline at end of file
+transformers == 2.10.0
\ No newline at end of file
diff --git a/utils/download-dependencies_albert.py b/utils/download-dependencies_albert.py
index 64bc432..58388a3 100644
--- a/utils/download-dependencies_albert.py
+++ b/utils/download-dependencies_albert.py
@@ -1,7 +1,6 @@
-from transformers import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 from transformers.configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_albert import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -11,13 +10,13 @@ import subprocess
 
 config_path = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP["albert-base-v2"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["albert-base-v2"]
-weights_path = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP["albert-base-v2"]
+weights_path = "albert-base-v2"
 
 target_path = Path.home() / 'rustbert' / 'albert-base-v2'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_bart.py b/utils/download-dependencies_bart.py
index fbaa196..0361312 100644
--- a/utils/download-dependencies_bart.py
+++ b/utils/download-dependencies_bart.py
@@ -1,7 +1,6 @@
-from transformers import BART_PRETRAINED_MODEL_ARCHIVE_MAP
 from transformers.configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_bart import vocab_url, merges_url
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -12,14 +11,14 @@ import subprocess
 config_path = BART_PRETRAINED_CONFIG_ARCHIVE_MAP['bart-large']
 vocab_path = vocab_url
 merges_path = merges_url
-weights_path = BART_PRETRAINED_MODEL_ARCHIVE_MAP['bart-large']
+weights_path = 'bart-large'
 
 target_path = Path.home() / 'rustbert' / 'bart-large'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
 temp_merges = get_from_cache(merges_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_bart_cnn.py b/utils/download-dependencies_bart_cnn.py
index 1ec119c..b524cf8 100644
--- a/utils/download-dependencies_bart_cnn.py
+++ b/utils/download-dependencies_bart_cnn.py
@@ -1,7 +1,6 @@
-from transformers import BART_PRETRAINED_MODEL_ARCHIVE_MAP
 from transformers.configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_bart import vocab_url, merges_url
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -12,14 +11,14 @@ import subprocess
 config_path = BART_PRETRAINED_CONFIG_ARCHIVE_MAP['bart-large-cnn']
 vocab_path = vocab_url
 merges_path = merges_url
-weights_path = BART_PRETRAINED_MODEL_ARCHIVE_MAP['bart-large-cnn']
+weights_path = 'bart-large-cnn'
 
 target_path = Path.home() / 'rustbert' / 'bart-large-cnn'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
 temp_merges = get_from_cache(merges_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_bart_xsum.py b/utils/download-dependencies_bart_xsum.py
index c1e79a8..8aff45f 100644
--- a/utils/download-dependencies_bart_xsum.py
+++ b/utils/download-dependencies_bart_xsum.py
@@ -1,7 +1,6 @@
-from transformers import BART_PRETRAINED_MODEL_ARCHIVE_MAP
 from transformers.configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_bart import vocab_url, merges_url
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -12,14 +11,14 @@ import subprocess
 config_path = BART_PRETRAINED_CONFIG_ARCHIVE_MAP['bart-large-xsum']
 vocab_path = vocab_url
 merges_path = merges_url
-weights_path = BART_PRETRAINED_MODEL_ARCHIVE_MAP['bart-large-xsum']
+weights_path = 'bart-large-xsum'
 
 target_path = Path.home() / 'rustbert' / 'bart-large-xsum'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
 temp_merges = get_from_cache(merges_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_bert.py b/utils/download-dependencies_bert.py
index 2078407..2c79784 100644
--- a/utils/download-dependencies_bert.py
+++ b/utils/download-dependencies_bert.py
@@ -1,6 +1,6 @@
-from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_bert import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -10,13 +10,13 @@ import subprocess
 
 config_path = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP["bert-base-uncased"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["bert-base-uncased"]
-weights_path = BERT_PRETRAINED_MODEL_ARCHIVE_MAP["bert-base-uncased"]
+weights_path = "bert-base-uncased"
 
 target_path = Path.home() / 'rustbert' / 'bert'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_bert_qa.py b/utils/download-dependencies_bert_qa.py
index 7d68baa..f0595d9 100644
--- a/utils/download-dependencies_bert_qa.py
+++ b/utils/download-dependencies_bert_qa.py
@@ -1,6 +1,6 @@
-from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_bert import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -10,13 +10,13 @@ import subprocess
 
 config_path = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP["bert-large-cased-whole-word-masking-finetuned-squad"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["bert-large-cased-whole-word-masking-finetuned-squad"]
-weights_path = BERT_PRETRAINED_MODEL_ARCHIVE_MAP["bert-large-cased-whole-word-masking-finetuned-squad"]
+weights_path = "bert-large-cased-whole-word-masking-finetuned-squad"
 
 target_path = Path.home() / 'rustbert' / 'bert-qa'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_distilbert-qa.py b/utils/download-dependencies_distilbert-qa.py
index 5141881..29b026e 100644
--- a/utils/download-dependencies_distilbert-qa.py
+++ b/utils/download-dependencies_distilbert-qa.py
@@ -1,6 +1,6 @@
-from transformers import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_distilbert import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -10,13 +10,13 @@ import subprocess
 
 config_path = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP["distilbert-base-cased-distilled-squad"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["distilbert-base-cased-distilled-squad"]
-weights_path = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP["distilbert-base-cased-distilled-squad"]
+weights_path = "distilbert-base-cased-distilled-squad"
 
 target_path = Path.home() / 'rustbert' / 'distilbert-qa'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_distilbert.py b/utils/download-dependencies_distilbert.py
index 4f2b524..11cc339 100644
--- a/utils/download-dependencies_distilbert.py
+++ b/utils/download-dependencies_distilbert.py
@@ -1,6 +1,6 @@
-from transformers import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_distilbert import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -10,13 +10,13 @@ import subprocess
 
 config_path = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP["distilbert-base-uncased"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["distilbert-base-uncased"]
-weights_path = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP["distilbert-base-uncased"]
+weights_path = "distilbert-base-uncased"
 
 target_path = Path.home() / 'rustbert' / 'distilbert'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_distilgpt2.py b/utils/download-dependencies_distilgpt2.py
index 6123c7a..52ce411 100644
--- a/utils/download-dependencies_distilgpt2.py
+++ b/utils/download-dependencies_distilgpt2.py
@@ -1,6 +1,6 @@
-from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_gpt2 import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -11,14 +11,14 @@ import subprocess
 config_path = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP["distilgpt2"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["distilgpt2"]
 merges_path = PRETRAINED_VOCAB_FILES_MAP["merges_file"]["distilgpt2"]
-weights_path = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP["distilgpt2"]
+weights_path = "distilgpt2"
 
 target_path = Path.home() / 'rustbert' / 'distilgpt2'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
 temp_merges = get_from_cache(merges_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_electra-discriminator.py b/utils/download-dependencies_electra-discriminator.py
index d687e31..b9c4435 100644
--- a/utils/download-dependencies_electra-discriminator.py
+++ b/utils/download-dependencies_electra-discriminator.py
@@ -1,6 +1,6 @@
-from transformers import ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from transformers import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_electra import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -10,13 +10,13 @@ import subprocess
 
 config_path = ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP["google/electra-base-discriminator"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["google/electra-base-discriminator"]
-weights_path = ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP["google/electra-base-discriminator"]
+weights_path = "google/electra-base-discriminator"
 
 target_path = Path.home() / 'rustbert' / 'electra-discriminator'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_electra-generator.py b/utils/download-dependencies_electra-generator.py
index 9834acf..ee7fb44 100644
--- a/utils/download-dependencies_electra-generator.py
+++ b/utils/download-dependencies_electra-generator.py
@@ -1,6 +1,6 @@
-from transformers import ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from transformers import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_electra import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -10,13 +10,13 @@ import subprocess
 
 config_path = ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP["google/electra-base-generator"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["google/electra-base-generator"]
-weights_path = ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP["google/electra-base-generator"]
+weights_path = "google/electra-base-generator"
 
 target_path = Path.home() / 'rustbert' / 'electra-generator'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_gpt2-large.py b/utils/download-dependencies_gpt2-large.py
index 7a447c6..1cf7290 100644
--- a/utils/download-dependencies_gpt2-large.py
+++ b/utils/download-dependencies_gpt2-large.py
@@ -1,6 +1,6 @@
-from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_gpt2 import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -11,14 +11,14 @@ import subprocess
 config_path = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP["gpt2-large"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["gpt2-large"]
 merges_path = PRETRAINED_VOCAB_FILES_MAP["merges_file"]["gpt2-large"]
-weights_path = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP["gpt2-large"]
+weights_path = "gpt2-large"
 
 target_path = Path.home() / 'rustbert' / 'gpt2-large'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
 temp_merges = get_from_cache(merges_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_gpt2-medium.py b/utils/download-dependencies_gpt2-medium.py
index 9dc6291..0c75847 100644
--- a/utils/download-dependencies_gpt2-medium.py
+++ b/utils/download-dependencies_gpt2-medium.py
@@ -1,6 +1,6 @@
-from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_gpt2 import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -11,14 +11,14 @@ import subprocess
 config_path = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP["gpt2-medium"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["gpt2-medium"]
 merges_path = PRETRAINED_VOCAB_FILES_MAP["merges_file"]["gpt2-medium"]
-weights_path = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP["gpt2-medium"]
+weights_path = "gpt2-medium"
 
 target_path = Path.home() / 'rustbert' / 'gpt2-medium'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
 temp_merges = get_from_cache(merges_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_gpt2-xl.py b/utils/download-dependencies_gpt2-xl.py
index 4b0916a..e95bc44 100644
--- a/utils/download-dependencies_gpt2-xl.py
+++ b/utils/download-dependencies_gpt2-xl.py
@@ -1,6 +1,6 @@
-from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_gpt2 import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -11,14 +11,14 @@ import subprocess
 config_path = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP["gpt2-xl"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["gpt2-xl"]
 merges_path = PRETRAINED_VOCAB_FILES_MAP["merges_file"]["gpt2-xl"]
-weights_path = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP["gpt2-xl"]
+weights_path = "gpt2-xl"
 
 target_path = Path.home() / 'rustbert' / 'gpt2-xl'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
 temp_merges = get_from_cache(merges_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_gpt2.py b/utils/download-dependencies_gpt2.py
index 34f0b48..75193e1 100644
--- a/utils/download-dependencies_gpt2.py
+++ b/utils/download-dependencies_gpt2.py
@@ -1,6 +1,6 @@
-from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_gpt2 import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -11,14 +11,14 @@ import subprocess
 config_path = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP["gpt2"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["gpt2"]
 merges_path = PRETRAINED_VOCAB_FILES_MAP["merges_file"]["gpt2"]
-weights_path = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP["gpt2"]
+weights_path = "gpt2"
 
 target_path = Path.home() / 'rustbert' / 'gpt2'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
 temp_merges = get_from_cache(merges_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_openaigpt.py b/utils/download-dependencies_openaigpt.py
index 8d2cc84..5a3335b 100644
--- a/utils/download-dependencies_openaigpt.py
+++ b/utils/download-dependencies_openaigpt.py
@@ -1,6 +1,6 @@
-from transformers import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_openai import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -11,14 +11,14 @@ import subprocess
 config_path = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP["openai-gpt"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["openai-gpt"]
 merges_path = PRETRAINED_VOCAB_FILES_MAP["merges_file"]["openai-gpt"]
-weights_path = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP["openai-gpt"]
+weights_path = "openai-gpt"
 
 target_path = Path.home() / 'rustbert' / 'openai-gpt'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
 temp_merges = get_from_cache(merges_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_roberta.py b/utils/download-dependencies_roberta.py
index 4ef29c9..8f7b024 100644
--- a/utils/download-dependencies_roberta.py
+++ b/utils/download-dependencies_roberta.py
@@ -1,6 +1,6 @@
-from transformers import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_roberta import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -11,14 +11,14 @@ import subprocess
 config_path = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP["roberta-base"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["roberta-base"]
 merges_path = PRETRAINED_VOCAB_FILES_MAP["merges_file"]["roberta-base"]
-weights_path = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP["roberta-base"]
+weights_path = "roberta-base"
 
 target_path = Path.home() / 'rustbert' / 'roberta'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
 temp_merges = get_from_cache(merges_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_roberta_qa.py b/utils/download-dependencies_roberta_qa.py
new file mode 100644
index 0000000..c819f99
--- /dev/null
+++ b/utils/download-dependencies_roberta_qa.py
@@ -0,0 +1,52 @@
+from transformers.file_utils import get_from_cache, S3_BUCKET_PREFIX
+from pathlib import Path
+import shutil
+import os
+import numpy as np
+import torch
+import subprocess
+
+ROOT_PATH = S3_BUCKET_PREFIX + '/deepset/roberta-base-squad2'
+
+config_path = ROOT_PATH + '/config.json'
+vocab_path = ROOT_PATH + '/vocab.json'
+merges_path = ROOT_PATH + '/merges.txt'
+weights_path = ROOT_PATH + '/pytorch_model.bin'
+
+target_path = Path.home() / 'rustbert' / 'roberta-qa'
+
+temp_config = get_from_cache(config_path)
+temp_vocab = get_from_cache(vocab_path)
+temp_merges = get_from_cache(merges_path)
+temp_weights = get_from_cache(weights_path)
+
+os.makedirs(str(target_path), exist_ok=True)
+
+config_path = str(target_path / 'config.json')
+vocab_path = str(target_path / 'vocab.json')
+merges_path = str(target_path / 'merges.txt')
+model_path = str(target_path / 'model.bin')
+
+shutil.copy(temp_config, config_path)
+shutil.copy(temp_vocab, vocab_path)
+shutil.copy(temp_merges, merges_path)
+shutil.copy(temp_weights, model_path)
+
+weights = torch.load(temp_weights, map_location='cpu')
+nps = {}
+for k, v in weights.items():
+    k = k.replace("gamma", "weight").replace("beta", "bias")
+    nps[k] = np.ascontiguousarray(v.cpu().numpy())
+
+np.savez(target_path / 'model.npz', **nps)
+
+source = str(target_path / 'model.npz')
+target = str(target_path / 'model.ot')
+
+toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()
+
+subprocess.call(
+    ['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target])
+
+os.remove(str(target_path / 'model.bin'))
+os.remove(str(target_path / 'model.npz'))
diff --git a/utils/download-dependencies_sst2_sentiment.py b/utils/download-dependencies_sst2_sentiment.py
index c9efd3b..fd5a430 100644
--- a/utils/download-dependencies_sst2_sentiment.py
+++ b/utils/download-dependencies_sst2_sentiment.py
@@ -1,6 +1,6 @@
-from transformers import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_distilbert import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -10,13 +10,13 @@ import subprocess
 
 config_path = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP["distilbert-base-uncased-finetuned-sst-2-english"]
 vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["distilbert-base-uncased"]
-weights_path = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP["distilbert-base-uncased-finetuned-sst-2-english"]
+weights_path = "distilbert-base-uncased-finetuned-sst-2-english"
 
 target_path = Path.home() / 'rustbert' / 'distilbert-sst2'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_t5_base.py b/utils/download-dependencies_t5_base.py
index 4274ea1..c711e61 100644
--- a/utils/download-dependencies_t5_base.py
+++ b/utils/download-dependencies_t5_base.py
@@ -1,6 +1,6 @@
-from transformers import T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+from transformers import T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_t5 import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -10,13 +10,13 @@ import subprocess
 
 config_path = T5_PRETRAINED_CONFIG_ARCHIVE_MAP['t5-base']
 vocab_path = PRETRAINED_VOCAB_FILES_MAP['vocab_file']['t5-base']
-weights_path = T5_PRETRAINED_MODEL_ARCHIVE_MAP['t5-base']
+weights_path = 't5-base'
 
 target_path = Path.home() / 'rustbert' / 't5-base'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_t5_small.py b/utils/download-dependencies_t5_small.py
index 31e754d..bcb47e3 100644
--- a/utils/download-dependencies_t5_small.py
+++ b/utils/download-dependencies_t5_small.py
@@ -1,6 +1,6 @@
-from transformers import T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+from transformers import T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 from transformers.tokenization_t5 import PRETRAINED_VOCAB_FILES_MAP
-from transformers.file_utils import get_from_cache
+from transformers.file_utils import get_from_cache, hf_bucket_url
 from pathlib import Path
 import shutil
 import os
@@ -10,13 +10,13 @@ import subprocess
 
 config_path = T5_PRETRAINED_CONFIG_ARCHIVE_MAP['t5-small']
 vocab_path = PRETRAINED_VOCAB_FILES_MAP['vocab_file']['t5-small']
-weights_path = T5_PRETRAINED_MODEL_ARCHIVE_MAP['t5-small']
+weights_path = 't5-small'
 
 target_path = Path.home() / 'rustbert' / 't5-small'
 
 temp_config = get_from_cache(config_path)
 temp_vocab = get_from_cache(vocab_path)
-temp_weights = get_from_cache(weights_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
 
 os.makedirs(str(target_path), exist_ok=True)
 
diff --git a/utils/download-dependencies_xlm_roberta_ner_dutch.py b/utils/download-dependencies_xlm_roberta_ner_dutch.py
new file mode 100644
index 0000000..c63ee84
--- /dev/null
+++ b/utils/download-dependencies_xlm_roberta_ner_dutch.py
@@ -0,0 +1,48 @@
+from transformers import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from transformers.tokenization_xlm_roberta import PRETRAINED_VOCAB_FILES_MAP
+from transformers.file_utils import get_from_cache, hf_bucket_url
+from pathlib import Path
+import shutil
+import os
+import numpy as np
+import torch
+import subprocess
+
+config_path = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP["xlm-roberta-large-finetuned-conll02-dutch"]
+vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["xlm-roberta-large-finetuned-conll02-dutch"]
+weights_path = "xlm-roberta-large-finetuned-conll02-dutch"
+
+target_path = Path.home() / 'rustbert' / 'xlm-roberta-ner-nl'
+
+temp_config = get_from_cache(config_path)
+temp_vocab = get_from_cache(vocab_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
+
+os.makedirs(str(target_path), exist_ok=True)
+
+config_path = str(target_path / 'config.json')
+vocab_path = str(target_path / 'spiece.model')
+model_path = str(target_path / 'model.bin')
+
+shutil.copy(temp_config, config_path)
+shutil.copy(temp_vocab, vocab_path)
+shutil.copy(temp_weights, model_path)
+
+weights = torch.load(temp_weights, map_location='cpu')
+nps = {}
+for k, v in weights.items():
+    k = k.replace("gamma", "weight").replace("beta", "bias")
+    nps[k] = np.ascontiguousarray(v.cpu().numpy())
+
+np.savez(target_path / 'model.npz', **nps)
+
+source = str(target_path / 'model.npz')
+target = str(target_path / 'model.ot')
+
+toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()
+
+subprocess.call(
+    ['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target])
+
+os.remove(str(target_path / 'model.bin'))
+os.remove(str(target_path / 'model.npz'))
diff --git a/utils/download-dependencies_xlm_roberta_ner_english.py b/utils/download-dependencies_xlm_roberta_ner_english.py
new file mode 100644
index 0000000..33c60dc
--- /dev/null
+++ b/utils/download-dependencies_xlm_roberta_ner_english.py
@@ -0,0 +1,48 @@
+from transformers import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from transformers.tokenization_xlm_roberta import PRETRAINED_VOCAB_FILES_MAP
+from transformers.file_utils import get_from_cache, hf_bucket_url
+from pathlib import Path
+import shutil
+import os
+import numpy as np
+import torch
+import subprocess
+
+config_path = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP["xlm-roberta-large-finetuned-conll03-english"]
+vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["xlm-roberta-large-finetuned-conll03-english"]
+weights_path = "xlm-roberta-large-finetuned-conll03-english"
+
+target_path = Path.home() / 'rustbert' / 'xlm-roberta-ner-en'
+
+temp_config = get_from_cache(config_path)
+temp_vocab = get_from_cache(vocab_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
+
+os.makedirs(str(target_path), exist_ok=True)
+
+config_path = str(target_path / 'config.json')
+vocab_path = str(target_path / 'spiece.model')
+model_path = str(target_path / 'model.bin')
+
+shutil.copy(temp_config, config_path)
+shutil.copy(temp_vocab, vocab_path)
+shutil.copy(temp_weights, model_path)
+
+weights = torch.load(temp_weights, map_location='cpu')
+nps = {}
+for k, v in weights.items():
+    k = k.replace("gamma", "weight").replace("beta", "bias")
+    nps[k] = np.ascontiguousarray(v.cpu().numpy())
+
+np.savez(target_path / 'model.npz', **nps)
+
+source = str(target_path / 'model.npz')
+target = str(target_path / 'model.ot')
+
+toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()
+
+subprocess.call(
+    ['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target])
+
+os.remove(str(target_path / 'model.bin'))
+os.remove(str(target_path / 'model.npz'))
diff --git a/utils/download-dependencies_xlm_roberta_ner_german.py b/utils/download-dependencies_xlm_roberta_ner_german.py
new file mode 100644
index 0000000..42ef2f3
--- /dev/null
+++ b/utils/download-dependencies_xlm_roberta_ner_german.py
@@ -0,0 +1,48 @@
+from transformers import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from transformers.tokenization_xlm_roberta import PRETRAINED_VOCAB_FILES_MAP
+from transformers.file_utils import get_from_cache, hf_bucket_url
+from pathlib import Path
+import shutil
+import os
+import numpy as np
+import torch
+import subprocess
+
+config_path = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP["xlm-roberta-large-finetuned-conll03-german"]
+vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["xlm-roberta-large-finetuned-conll03-german"]
+weights_path = "xlm-roberta-large-finetuned-conll03-german"
+
+target_path = Path.home() / 'rustbert' / 'xlm-roberta-ner-de'
+
+temp_config = get_from_cache(config_path)
+temp_vocab = get_from_cache(vocab_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
+
+os.makedirs(str(target_path), exist_ok=True)
+
+config_path = str(target_path / 'config.json')
+vocab_path = str(target_path / 'spiece.model')
+model_path = str(target_path / 'model.bin')
+
+shutil.copy(temp_config, config_path)
+shutil.copy(temp_vocab, vocab_path)
+shutil.copy(temp_weights, model_path)
+
+weights = torch.load(temp_weights, map_location='cpu')
+nps = {}
+for k, v in weights.items():
+    k = k.replace("gamma", "weight").replace("beta", "bias")
+    nps[k] = np.ascontiguousarray(v.cpu().numpy())
+
+np.savez(target_path / 'model.npz', **nps)
+
+source = str(target_path / 'model.npz')
+target = str(target_path / 'model.ot')
+
+toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()
+
+subprocess.call(
+    ['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target])
+
+os.remove(str(target_path / 'model.bin'))
+os.remove(str(target_path / 'model.npz'))
diff --git a/utils/download-dependencies_xlm_roberta_ner_spanish.py b/utils/download-dependencies_xlm_roberta_ner_spanish.py
new file mode 100644
index 0000000..edacb95
--- /dev/null
+++ b/utils/download-dependencies_xlm_roberta_ner_spanish.py
@@ -0,0 +1,48 @@
+from transformers import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from transformers.tokenization_xlm_roberta import PRETRAINED_VOCAB_FILES_MAP
+from transformers.file_utils import get_from_cache, hf_bucket_url
+from pathlib import Path
+import shutil
+import os
+import numpy as np
+import torch
+import subprocess
+
+config_path = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP["xlm-roberta-large-finetuned-conll02-spanish"]
+vocab_path = PRETRAINED_VOCAB_FILES_MAP["vocab_file"]["xlm-roberta-large-finetuned-conll02-spanish"]
+weights_path = "xlm-roberta-large-finetuned-conll02-spanish"
+
+target_path = Path.home() / 'rustbert' / 'xlm-roberta-ner-es'
+
+temp_config = get_from_cache(config_path)
+temp_vocab = get_from_cache(vocab_path)
+temp_weights = get_from_cache(hf_bucket_url(weights_path, filename="pytorch_model.bin", use_cdn=True))
+
+os.makedirs(str(target_path), exist_ok=True)
+
+config_path = str(target_path / 'config.json')
+vocab_path = str(target_path / 'spiece.model')
+model_path = str(target_path / 'model.bin')
+
+shutil.copy(temp_config, config_path)
+shutil.copy(temp_vocab, vocab_path)
+shutil.copy(temp_weights, model_path)
+
+weights = torch.load(temp_weights, map_location='cpu')
+nps = {}
+for k, v in weights.items():
+    k = k.replace("gamma", "weight").replace("beta", "bias")
+    nps[k] = np.ascontiguousarray(v.cpu().numpy())
+
+np.savez(target_path / 'model.npz', **nps)
+
+source = str(target_path / 'model.npz')
+target = str(target_path / 'model.ot')
+
+toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()
+
+subprocess.call(
+    ['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target])
+
+os.remove(str(target_path / 'model.bin'))
+os.remove(str(target_path / 'model.npz'))
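
The recurring change across these scripts is the same: instead of looking up a hard-coded weights URL in a per-model *_PRETRAINED_MODEL_ARCHIVE_MAP, each script now builds the URL from the bare model identifier with hf_bucket_url and downloads it through get_from_cache. A minimal standalone sketch of that shared flow, written against the transformers 2.10.x API pinned in requirements.txt; the model id "bert-base-uncased" and the output directory are illustrative only, not part of the patch:

    from pathlib import Path
    import shutil

    from transformers.file_utils import get_from_cache, hf_bucket_url

    # Resolve the weights URL from the model identifier; use_cdn=True selects
    # the CDN-backed location for the (large) binary weights file.
    weights_url = hf_bucket_url("bert-base-uncased", filename="pytorch_model.bin", use_cdn=True)

    # Download into the local transformers cache (a no-op if already cached),
    # then copy the cached file into the rust-bert model directory.
    temp_weights = get_from_cache(weights_url)
    target_path = Path.home() / 'rustbert' / 'bert'
    target_path.mkdir(parents=True, exist_ok=True)
    shutil.copy(temp_weights, str(target_path / 'model.bin'))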