fairseq/tests/utils.py

798 lines
24 KiB
Python
Raw Normal View History

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import json
import os
import random
import shutil
import string
import sys
import typing as tp
from io import StringIO
import torch
import torch.nn.functional as F
import fairseq.distributed.utils as distributed_utils
from fairseq import options, utils
from fairseq.data import Dictionary
from fairseq.data.language_pair_dataset import collate
from fairseq.dataclass.utils import convert_namespace_to_omegaconf
from fairseq.models import (
FairseqEncoder,
FairseqEncoderDecoderModel,
FairseqIncrementalDecoder,
)
from fairseq.models.fairseq_encoder import EncoderOut
from fairseq.tasks import LegacyFairseqTask
from fairseq_cli import generate, interactive, preprocess, train, validate
def dummy_dictionary(vocab_size, prefix="token_"):
d = Dictionary()
for i in range(vocab_size):
token = prefix + str(i)
d.add_symbol(token)
2018-04-24 21:46:54 +03:00
d.finalize(padding_factor=1) # don't add extra padding symbols
return d
def dummy_dataloader(
samples,
padding_idx=1,
eos_idx=2,
batch_size=None,
):
if batch_size is None:
batch_size = len(samples)
# add any missing data to samples
for i, sample in enumerate(samples):
if "id" not in sample:
sample["id"] = i
# create dataloader
dataset = TestDataset(samples)
dataloader = torch.utils.data.DataLoader(
dataset,
batch_size=batch_size,
collate_fn=(lambda samples: collate(samples, padding_idx, eos_idx)),
)
return iter(dataloader)
def sequence_generator_setup():
# construct dummy dictionary
d = dummy_dictionary(vocab_size=2)
eos = d.eos()
w1 = 4
w2 = 5
# construct source data
src_tokens = torch.LongTensor([[w1, w2, eos], [w1, w2, eos]])
src_lengths = torch.LongTensor([2, 2])
args = argparse.Namespace()
unk = 0.0
args.beam_probs = [
# step 0:
torch.FloatTensor(
[
# eos w1 w2
# sentence 1:
[0.0, unk, 0.9, 0.1], # beam 1
[0.0, unk, 0.9, 0.1], # beam 2
# sentence 2:
[0.0, unk, 0.7, 0.3],
[0.0, unk, 0.7, 0.3],
]
),
# step 1:
torch.FloatTensor(
[
# eos w1 w2 prefix
# sentence 1:
[1.0, unk, 0.0, 0.0], # w1: 0.9 (emit: w1 <eos>: 0.9*1.0)
[0.0, unk, 0.9, 0.1], # w2: 0.1
# sentence 2:
[0.25, unk, 0.35, 0.4], # w1: 0.7 (don't emit: w1 <eos>: 0.7*0.25)
[0.00, unk, 0.10, 0.9], # w2: 0.3
]
),
# step 2:
torch.FloatTensor(
[
# eos w1 w2 prefix
# sentence 1:
[0.0, unk, 0.1, 0.9], # w2 w1: 0.1*0.9
[
0.6,
unk,
0.2,
0.2,
], # w2 w2: 0.1*0.1 (emit: w2 w2 <eos>: 0.1*0.1*0.6)
# sentence 2:
[
0.60,
unk,
0.4,
0.00,
], # w1 w2: 0.7*0.4 (emit: w1 w2 <eos>: 0.7*0.4*0.6)
[0.01, unk, 0.0, 0.99], # w2 w2: 0.3*0.9
]
),
# step 3:
torch.FloatTensor(
[
# eos w1 w2 prefix
# sentence 1:
[
1.0,
unk,
0.0,
0.0,
], # w2 w1 w2: 0.1*0.9*0.9 (emit: w2 w1 w2 <eos>: 0.1*0.9*0.9*1.0)
[
1.0,
unk,
0.0,
0.0,
], # w2 w1 w1: 0.1*0.9*0.1 (emit: w2 w1 w1 <eos>: 0.1*0.9*0.1*1.0)
# sentence 2:
[
0.1,
unk,
0.5,
0.4,
], # w2 w2 w2: 0.3*0.9*0.99 (emit: w2 w2 w2 <eos>: 0.3*0.9*0.99*0.1)
[
1.0,
unk,
0.0,
0.0,
], # w1 w2 w1: 0.7*0.4*0.4 (emit: w1 w2 w1 <eos>: 0.7*0.4*0.4*1.0)
]
),
]
task = TestTranslationTask.setup_task(args, d, d)
model = task.build_model(args)
tgt_dict = task.target_dictionary
return tgt_dict, w1, w2, src_tokens, src_lengths, model
def create_dummy_data(
data_dir, num_examples=100, maxlen=20, alignment=False, languages=None
):
def _create_dummy_data(dir, filename):
data = torch.rand(num_examples * maxlen)
data = 97 + torch.floor(26 * data).int()
with open(os.path.join(dir, filename), "w") as h:
offset = 0
for _ in range(num_examples):
ex_len = random.randint(1, maxlen)
ex_str = " ".join(map(chr, data[offset : offset + ex_len]))
print(ex_str, file=h)
offset += ex_len
def _create_dummy_alignment_data(filename_src, filename_tgt, filename):
with open(os.path.join(data_dir, filename_src), "r") as src_f, open(
os.path.join(data_dir, filename_tgt), "r"
) as tgt_f, open(os.path.join(data_dir, filename), "w") as h:
for src, tgt in zip(src_f, tgt_f):
src_len = len(src.split())
tgt_len = len(tgt.split())
avg_len = (src_len + tgt_len) // 2
num_alignments = random.randint(avg_len // 2, 2 * avg_len)
src_indices = torch.floor(torch.rand(num_alignments) * src_len).int()
tgt_indices = torch.floor(torch.rand(num_alignments) * tgt_len).int()
ex_str = " ".join(
[
"{}-{}".format(src, tgt)
for src, tgt in zip(src_indices, tgt_indices)
]
)
print(ex_str, file=h)
files_to_write = [
"train.in",
"train.out",
"valid.in",
"valid.out",
"test.in",
"test.out",
]
if languages is None: # En only dummy dataset
for f in files_to_write:
_create_dummy_data(data_dir, f)
else:
for lang in languages:
lang_dir = os.path.join(data_dir, lang)
os.makedirs(lang_dir, exist_ok=True)
for f in files_to_write:
_create_dummy_data(lang_dir, f)
if alignment:
_create_dummy_alignment_data("train.in", "train.out", "train.align")
_create_dummy_alignment_data("valid.in", "valid.out", "valid.align")
_create_dummy_alignment_data("test.in", "test.out", "test.align")
def preprocess_lm_data(data_dir, languages=None):
preprocess_parser = options.get_preprocessing_parser()
if languages is None:
preprocess_args = preprocess_parser.parse_args(
[
"--only-source",
"--trainpref",
os.path.join(data_dir, "train.out"),
"--validpref",
os.path.join(data_dir, "valid.out"),
"--testpref",
os.path.join(data_dir, "test.out"),
"--destdir",
data_dir,
]
)
preprocess.main(preprocess_args)
else:
for lang in languages:
lang_dir = os.path.join(data_dir, lang)
assert os.path.exists(lang_dir)
preprocess_args = preprocess_parser.parse_args(
[
"--only-source",
"--trainpref",
os.path.join(lang_dir, "train.out"),
"--validpref",
os.path.join(lang_dir, "valid.out"),
"--testpref",
os.path.join(lang_dir, "test.out"),
"--destdir",
lang_dir,
]
)
preprocess.main(preprocess_args)
shutil.copyfile(
os.path.join(data_dir, languages[0], "dict.txt"),
os.path.join(data_dir, "dict.txt"),
)
def preprocess_translation_data(data_dir, extra_flags=None):
preprocess_parser = options.get_preprocessing_parser()
preprocess_args = preprocess_parser.parse_args(
[
"--source-lang",
"in",
"--target-lang",
"out",
"--trainpref",
os.path.join(data_dir, "train"),
"--validpref",
os.path.join(data_dir, "valid"),
"--testpref",
os.path.join(data_dir, "test"),
"--thresholdtgt",
"0",
"--thresholdsrc",
"0",
"--destdir",
data_dir,
]
+ (extra_flags or []),
)
preprocess.main(preprocess_args)
Transformer with integrated pointer-generator network (#2529) Summary: This pull request implements a variant of the Transformer model that uses an attention distribution for pointing to input words. The attention distribution over the input words is interpolated with the normal output distribution over the vocabulary words, as in [See et al. (2017)](https://arxiv.org/abs/1704.04368). This allows the model to generate words that appear in the input, even if they don't appear in the vocabulary, helping especially with small vocabularies. The mechanism for copying out-of-vocabulary words from the input has been implemented differently to See et al. In their [implementation](https://github.com/abisee/pointer-generator) they convey the word identities through the model in order to be able to produce out-of-vocabulary words. We wanted to minimize changes to the Fairseq code base and took a different approach, which I'll describe below. The entire implementation is contained in one file (plus there's one new test). Copying out-of-vocabulary words is possible by pre-processing the input and post-processing the output. The user may add special words to the end of the vocabulary that can be used in place of `<unk>` tokens to identify different input positions (e.g. `<unk-0>`, `<unk-1>`, `<unk-2>`, ...). The number of these special words is given to the model with the `--source-position-markers` argument—the model simply maps all of these to the same word embedding as `<unk>`. With a simple post-processing the user may retrieve word at position N in the original text and use it in place of `<unk-N>`. I didn't find a good place to document this usage of this model, so let me know if you think I should improve documentation somewhere. This feature has not yet been discussed via a GitHub issue, but I'll open a new issue for discussion. Pull Request resolved: https://github.com/pytorch/fairseq/pull/2529 Reviewed By: ngoyal2707 Differential Revision: D23398430 Pulled By: myleott fbshipit-source-id: f2f26c8ce8802ae6cf95515637660348ff3fc457
2020-09-25 18:28:10 +03:00
def preprocess_summarization_data(data_dir, extra_flags=None):
preprocess_parser = options.get_preprocessing_parser()
preprocess_args = preprocess_parser.parse_args(
[
"--source-lang",
"in",
"--target-lang",
"out",
"--trainpref",
os.path.join(data_dir, "train"),
"--validpref",
os.path.join(data_dir, "valid"),
"--testpref",
os.path.join(data_dir, "test"),
"--thresholdtgt",
"0",
"--thresholdsrc",
"0",
"--joined-dictionary",
"--destdir",
data_dir,
]
+ (extra_flags or []),
Transformer with integrated pointer-generator network (#2529) Summary: This pull request implements a variant of the Transformer model that uses an attention distribution for pointing to input words. The attention distribution over the input words is interpolated with the normal output distribution over the vocabulary words, as in [See et al. (2017)](https://arxiv.org/abs/1704.04368). This allows the model to generate words that appear in the input, even if they don't appear in the vocabulary, helping especially with small vocabularies. The mechanism for copying out-of-vocabulary words from the input has been implemented differently to See et al. In their [implementation](https://github.com/abisee/pointer-generator) they convey the word identities through the model in order to be able to produce out-of-vocabulary words. We wanted to minimize changes to the Fairseq code base and took a different approach, which I'll describe below. The entire implementation is contained in one file (plus there's one new test). Copying out-of-vocabulary words is possible by pre-processing the input and post-processing the output. The user may add special words to the end of the vocabulary that can be used in place of `<unk>` tokens to identify different input positions (e.g. `<unk-0>`, `<unk-1>`, `<unk-2>`, ...). The number of these special words is given to the model with the `--source-position-markers` argument—the model simply maps all of these to the same word embedding as `<unk>`. With a simple post-processing the user may retrieve word at position N in the original text and use it in place of `<unk-N>`. I didn't find a good place to document this usage of this model, so let me know if you think I should improve documentation somewhere. This feature has not yet been discussed via a GitHub issue, but I'll open a new issue for discussion. Pull Request resolved: https://github.com/pytorch/fairseq/pull/2529 Reviewed By: ngoyal2707 Differential Revision: D23398430 Pulled By: myleott fbshipit-source-id: f2f26c8ce8802ae6cf95515637660348ff3fc457
2020-09-25 18:28:10 +03:00
)
preprocess.main(preprocess_args)
def create_laser_data_and_config_json(data_dir):
src_langs = ["de", "fr", "ru", "tr", "zh"]
tgt_langs = ["en", "es"]
config_json = {}
config_train_json = []
src_vocab = None
tgt_vocab = None
for src_lang in src_langs:
for tgt_lang in tgt_langs:
langpair_folder = f"{src_lang}-{tgt_lang}"
langpair_path = os.path.join(data_dir, langpair_folder)
os.mkdir(langpair_path)
create_dummy_data(langpair_path)
preprocess_translation_data(langpair_path, ["--dataset-impl", "cached"])
src_vocab = os.path.join(langpair_path, "dict.in.txt")
tgt_vocab = os.path.join(langpair_path, "dict.out.txt")
config_train_json.append(
{
"id": 0 if tgt_lang == "en" else 1,
"src": os.path.join(langpair_path, "train.in-out.in"),
"tgt": os.path.join(langpair_path, "train.in-out.out"),
}
)
config_json["src_vocab"] = src_vocab
config_json["tgt_vocab"] = tgt_vocab
config_json["train"] = config_train_json
with open(os.path.join(data_dir, "laserconfig.json"), "w") as config_file:
json.dump(config_json, config_file)
return config_file
def train_translation_model(
data_dir,
arch,
extra_flags=None,
task="translation",
run_validation=False,
lang_flags=None,
extra_valid_flags=None,
world_size=1,
):
if lang_flags is None:
lang_flags = [
"--source-lang",
"in",
"--target-lang",
"out",
]
train_parser = options.get_training_parser()
train_args = options.parse_args_and_arch(
train_parser,
[
"--task",
task,
data_dir,
"--save-dir",
data_dir,
"--arch",
arch,
"--optimizer",
"nag",
"--lr",
"0.05",
"--max-tokens",
"500",
"--max-epoch",
"1",
"--no-progress-bar",
"--distributed-world-size",
str(world_size),
"--num-workers",
"0",
]
+ lang_flags
+ (extra_flags or []),
)
cfg = convert_namespace_to_omegaconf(train_args)
distributed_utils.call_main(cfg, train.main)
if run_validation:
# test validation
validate_parser = options.get_validation_parser()
validate_args = options.parse_args_and_arch(
validate_parser,
[
"--task",
task,
data_dir,
"--path",
os.path.join(data_dir, "checkpoint_last.pt"),
"--valid-subset",
"valid",
"--max-tokens",
"500",
"--no-progress-bar",
"--num-workers",
"0",
]
+ lang_flags
+ (extra_valid_flags or []),
)
validate.main(validate_args)
def generate_main(data_dir, extra_flags=None, path=None):
if extra_flags is None:
extra_flags = [
"--print-alignment",
]
if path is None:
path = os.path.join(data_dir, "checkpoint_last.pt")
generate_parser = options.get_generation_parser()
generate_args = options.parse_args_and_arch(
generate_parser,
[
data_dir,
"--path",
path,
"--beam",
"3",
"--batch-size",
"64",
"--max-len-b",
"5",
"--gen-subset",
"valid",
"--no-progress-bar",
"--num-workers",
"0",
]
+ (extra_flags or []),
)
# evaluate model in batch mode
generate.main(generate_args)
# evaluate model interactively
generate_args.buffer_size = 0
generate_args.input = "-"
generate_args.batch_size = None
orig_stdin = sys.stdin
sys.stdin = StringIO("h e l l o\n")
interactive.main(generate_args)
sys.stdin = orig_stdin
class TestDataset(torch.utils.data.Dataset):
def __init__(self, data):
super().__init__()
self.data = data
self.sizes = None
def __getitem__(self, index):
return self.data[index]
def __len__(self):
return len(self.data)
class TestTranslationTask(LegacyFairseqTask):
def __init__(self, args, src_dict, tgt_dict, model):
super().__init__(args)
self.src_dict = src_dict
self.tgt_dict = tgt_dict
self.model = model
@classmethod
def setup_task(cls, args, src_dict=None, tgt_dict=None, model=None):
return cls(args, src_dict, tgt_dict, model)
def build_model(self, args, from_checkpoint=False):
return TestModel.build_model(args, self)
@property
def source_dictionary(self):
return self.src_dict
@property
def target_dictionary(self):
return self.tgt_dict
class TestModel(FairseqEncoderDecoderModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
@classmethod
def build_model(cls, args, task):
encoder = TestEncoder(args, task.source_dictionary)
decoder = TestIncrementalDecoder(args, task.target_dictionary)
return cls(encoder, decoder)
class TestEncoder(FairseqEncoder):
def __init__(self, args, dictionary):
super().__init__(dictionary)
self.args = args
def forward(self, src_tokens, src_lengths=None, **kwargs):
return EncoderOut(
encoder_out=src_tokens,
encoder_padding_mask=None,
encoder_embedding=None,
encoder_states=None,
src_tokens=None,
src_lengths=None,
)
def reorder_encoder_out(self, encoder_out, new_order):
return EncoderOut(
encoder_out=encoder_out.encoder_out.index_select(0, new_order),
encoder_padding_mask=None,
encoder_embedding=None,
encoder_states=None,
src_tokens=None,
src_lengths=None,
)
class TestIncrementalDecoder(FairseqIncrementalDecoder):
def __init__(self, args, dictionary):
super().__init__(dictionary)
assert hasattr(args, "beam_probs") or hasattr(args, "probs")
args.max_decoder_positions = getattr(args, "max_decoder_positions", 100)
self.args = args
def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
bbsz = prev_output_tokens.size(0)
vocab = len(self.dictionary)
src_len = encoder_out.encoder_out.size(1)
tgt_len = prev_output_tokens.size(1)
# determine number of steps
if incremental_state is not None:
# cache step number
step = utils.get_incremental_state(self, incremental_state, "step")
if step is None:
step = 0
utils.set_incremental_state(self, incremental_state, "step", step + 1)
steps = [step]
else:
steps = list(range(tgt_len))
# define output in terms of raw probs
if hasattr(self.args, "probs"):
assert (
self.args.probs.dim() == 3
), "expected probs to have size bsz*steps*vocab"
probs = self.args.probs.index_select(1, torch.LongTensor(steps))
else:
probs = torch.FloatTensor(bbsz, len(steps), vocab).zero_()
for i, step in enumerate(steps):
# args.beam_probs gives the probability for every vocab element,
# starting with eos, then unknown, and then the rest of the vocab
if step < len(self.args.beam_probs):
probs[:, i, self.dictionary.eos() :] = self.args.beam_probs[step]
else:
probs[:, i, self.dictionary.eos()] = 1.0
# random attention
attn = torch.rand(bbsz, tgt_len, src_len)
dev = prev_output_tokens.device
return probs.to(dev), {"attn": [attn.to(dev)]}
Conv lm implementation This implements convolutional language model from https://arxiv.org/pdf/1612.08083.pdf There are 3 modes for constructing batches: - token block: fill each sample with a specified number of tokens without regard for sentence delimiters - this is what was used for training in the paper - complete: fill each sample with a specified number of tokens but make sure it contains only complete sentences (i.e. if next sentence goes over token block limit, move it to the next sample) - this was used for evaluation in the paper - eos: one sentence per sample (skip blank lines) some results: GCNN-13 - GBW - 37.46 GCNN-14B - GBW - 33.88 GCNN-8 - Wiki103 - 43.76 GCNN-14 - Wiki103 - 35.66 train: python train.py /private/home/abaevski/data/wiki103 --save-dir /tmp --fp16 --max-epoch 35 --save-interval 1 --save-interval-updates 1000 --keep-interval-updates 25 --arch fconv_lm --optimizer nag --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 --decoder-embed-dim 280 --decoder-layers '[(850, 6)] * 3 + [(850,1)] + [(850,5)] * 4 + [(850,1)] + [(850,4)] * 3 + [(1024,4)] + [(2048, 4)]' --clip-norm 0.1 --dropout 0.2 --weight-decay 5e-06 --criterion cross_entropy --max-tokens 1024 --max-target-positions 1024 --seed 1 --log-format json --log-interval 500 eval: python eval_lm.py ~abaevski/data/wiki103 --path '/checkpoint02/abaevski/2018-04-27/lm_wiki.fp16.mxup300000.fconv.adam.lrs=reduce_lr_on_plateau.emb280.layers(850,6)*3+(850,1)+(850,5)*4+(850,1)+(850,4)*3+(1024,1)+(2048,4).lr0.0005.clp0.1.drp0.3.wd0.0.crt=cross_entropy.mxtk2048.smptk256.seed1.ngpu8/checkpoint_last.pt'
2018-05-25 16:43:37 +03:00
def get_normalized_probs(self, net_output, log_probs, _):
# the decoder returns probabilities directly
probs = net_output[0]
if log_probs:
return probs.log()
else:
return probs
def max_positions(self):
return self.args.max_decoder_positions
class TestReshapingEncoder(FairseqEncoder):
def __init__(self, args, dictionary):
super().__init__(dictionary)
self.args = args
def forward(self, src_tokens, src_lengths=None, **kwargs):
b_sz, t_sz = src_tokens.shape
padding_needed = t_sz % 2
x = src_tokens
if padding_needed > 0:
padding_needed = 2 - padding_needed
x = F.pad(x, (0, padding_needed))
return EncoderOut(
encoder_out=x.view(b_sz, -1, 2),
encoder_padding_mask=None,
encoder_embedding=None,
encoder_states=None,
src_tokens=None,
src_lengths=None,
)
def reorder_encoder_out(self, encoder_out, new_order):
return EncoderOut(
encoder_out=encoder_out.encoder_out.index_select(0, new_order),
encoder_padding_mask=None,
encoder_embedding=None,
encoder_states=None,
src_tokens=None,
src_lengths=None,
)
class TestReshapingModel(FairseqEncoderDecoderModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
@classmethod
def build_model(cls, args, task):
encoder = TestReshapingEncoder(args, task.source_dictionary)
decoder = TestIncrementalDecoder(args, task.target_dictionary)
return cls(encoder, decoder)
class TestAdditionalInputEncoder(FairseqEncoder):
def __init__(self, args, dictionary):
super().__init__(dictionary)
self.args = args
def forward(self, src_tokens, src_lengths=None, **kwargs):
assert "fancy_other_input" in kwargs
assert kwargs["fancy_other_input"] is not None
return EncoderOut(
encoder_out=src_tokens,
encoder_padding_mask=None,
encoder_embedding=None,
encoder_states=None,
src_tokens=None,
src_lengths=None,
)
def reorder_encoder_out(self, encoder_out, new_order):
return EncoderOut(
encoder_out=encoder_out.encoder_out.index_select(0, new_order),
encoder_padding_mask=None,
encoder_embedding=None,
encoder_states=None,
src_tokens=None,
src_lengths=None,
)
class TestAdditionalInputModel(FairseqEncoderDecoderModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
@classmethod
def build_model(cls, args, task):
encoder = TestAdditionalInputEncoder(args, task.source_dictionary)
decoder = TestIncrementalDecoder(args, task.target_dictionary)
return cls(encoder, decoder)
def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
decoder_out = self.decoder(
prev_output_tokens, encoder_out=encoder_out, **kwargs
)
return decoder_out
def train_language_model(
data_dir,
arch,
extra_flags=None,
run_validation=False,
extra_valid_flags=None,
task="language_modeling",
world_size=1,
):
train_parser = options.get_training_parser()
train_args = options.parse_args_and_arch(
train_parser,
[
"--task",
task,
data_dir,
"--arch",
arch,
"--optimizer",
"adam",
"--lr",
"0.0001",
"--max-tokens",
"500",
"--tokens-per-sample",
"500",
"--save-dir",
data_dir,
"--max-epoch",
"1",
"--no-progress-bar",
"--distributed-world-size",
str(world_size),
"--ddp-backend",
"no_c10d",
"--num-workers",
"0",
]
+ (extra_flags or []),
)
cfg = convert_namespace_to_omegaconf(train_args)
distributed_utils.call_main(cfg, train.main)
if run_validation:
# test validation
validate_parser = options.get_validation_parser()
validate_args = options.parse_args_and_arch(
validate_parser,
[
"--task",
task,
data_dir,
"--path",
os.path.join(data_dir, "checkpoint_last.pt"),
"--valid-subset",
"valid",
"--max-tokens",
"500",
"--no-progress-bar",
"--num-workers",
"0",
]
+ (extra_valid_flags or []),
)
validate.main(validate_args)
def sizes(data):
return [len(sentence) for sentence in data]
POPULATION = string.ascii_letters + string.digits
def make_sentence() -> tp.List[str]:
length = random.randint(10, 50)
return random.choices(
population=POPULATION, k=length, weights=range(1, len(POPULATION) + 1)
)
def make_data(length=1000, out_file=None) -> tp.List[tp.List[str]]:
data = (
[make_sentence() for _ in range(0, length)]
# add all the symbols at least once
+ [list(string.ascii_letters), list(string.digits)]
)
if out_file is not None:
with open(out_file, "w", encoding="utf-8") as out:
for s in data:
print(" ".join(s), file=out)
return data
add masked_lm test (#4344) Summary: # Before submitting - [X] Was this discussed/approved via a Github issue? (no need for typos, doc improvements) - [X] Did you read the [contributor guideline](https://github.com/pytorch/fairseq/blob/main/CONTRIBUTING.md)? - [X] Did you make sure to update the docs? - [X] Did you write any new necessary tests? ## What does this PR do? Fixes https://github.com/pytorch/fairseq/issues/4300 ## PR review Anyone in the community is free to review the PR once the tests have passed. If we didn't discuss your PR in Github issues there's a high chance it will not be merged. ## Did you have fun? Big time! Note: I had to update `black` because of [this known issue](https://github.com/psf/black/issues/2964): ``` black....................................................................Failed - hook id: black - exit code: 1 Traceback (most recent call last): File "/Users/azzhipa/.cache/pre-commit/repoxt83whf2/py_env-python3.8/bin/black", line 8, in <module> sys.exit(patched_main()) File "/Users/azzhipa/.cache/pre-commit/repoxt83whf2/py_env-python3.8/lib/python3.8/site-packages/black/__init__.py", line 1423, in patched_main patch_click() File "/Users/azzhipa/.cache/pre-commit/repoxt83whf2/py_env-python3.8/lib/python3.8/site-packages/black/__init__.py", line 1409, in patch_click from click import _unicodefun ImportError: cannot import name '_unicodefun' from 'click' (/Users/azzhipa/.cache/pre-commit/repoxt83whf2/py_env-python3.8/lib/python3.8/site-packages/click/__init__.py) ``` Pull Request resolved: https://github.com/pytorch/fairseq/pull/4344 Reviewed By: zhengwy888 Differential Revision: D35691648 Pulled By: dianaml0 fbshipit-source-id: 4bdf408bc9d9cca76c9c08e138cf85b1d00d14d4
2022-04-19 00:47:00 +03:00
def build_vocab(data: tp.List[tp.List[str]]) -> Dictionary:
d = Dictionary()
for s in data:
for token in s:
d.add_symbol(token)
d.finalize()
return d