# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
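
# Integration tests for the fairseq command-line tools: each case builds a
# tiny random dataset, then runs preprocessing, training, generation and/or
# LM evaluation end to end through the same entry points as the CLI.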

import contextlib
import json
import logging
import os
import random
import sys
import tempfile
import unittest
from io import StringIO
from typing import Dict, List

import torch

from fairseq import options
from fairseq_cli import eval_lm, train
from tests.utils import (
    create_dummy_data,
    create_laser_data_and_config_json,
    generate_main,
    preprocess_lm_data,
    preprocess_summarization_data,
    preprocess_translation_data,
    train_language_model,
    train_translation_model,
)


try:
    import transformers  # noqa

    has_hf_transformers = True
except ImportError:
    has_hf_transformers = False


class TestTranslation(unittest.TestCase):
    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_fconv(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fconv") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(data_dir, "fconv_iwslt_de_en")
                generate_main(data_dir)

    def test_raw(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fconv_raw") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir, ["--dataset-impl", "raw"])
                train_translation_model(
                    data_dir, "fconv_iwslt_de_en", ["--dataset-impl", "raw"]
                )
                generate_main(data_dir, ["--dataset-impl", "raw"])

    def test_update_freq(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_update_freq") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir, "fconv_iwslt_de_en", ["--update-freq", "3"]
                )
                generate_main(data_dir)

    def test_max_positions(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_max_positions") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                with self.assertRaises(Exception) as context:
                    train_translation_model(
                        data_dir,
                        "fconv_iwslt_de_en",
                        ["--max-target-positions", "5"],
                    )
                self.assertTrue(
                    "skip this example with --skip-invalid-size-inputs-valid-test"
                    in str(context.exception)
                )
                train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    [
                        "--max-target-positions",
                        "5",
                        "--skip-invalid-size-inputs-valid-test",
                    ],
                )
                with self.assertRaises(Exception) as context:
                    generate_main(data_dir)
                generate_main(data_dir, ["--skip-invalid-size-inputs-valid-test"])

    def test_generation(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_sampling") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(data_dir, "fconv_iwslt_de_en")
                generate_main(
                    data_dir,
                    [
                        "--sampling",
                        "--temperature",
                        "2",
                        "--beam",
                        "2",
                        "--nbest",
                        "2",
                    ],
                )
                generate_main(
                    data_dir,
                    [
                        "--sampling",
                        "--sampling-topk",
                        "3",
                        "--beam",
                        "2",
                        "--nbest",
                        "2",
                    ],
                )
                generate_main(
                    data_dir,
                    [
                        "--sampling",
                        "--sampling-topp",
                        "0.2",
                        "--beam",
                        "2",
                        "--nbest",
                        "2",
                    ],
                )
                generate_main(
                    data_dir,
                    [
                        "--diversity-rate",
                        "0.5",
                        "--beam",
                        "6",
                    ],
                )
                with self.assertRaises(ValueError):
                    generate_main(
                        data_dir,
                        [
                            "--diverse-beam-groups",
                            "4",
                            "--match-source-len",
                        ],
                    )
                generate_main(data_dir, ["--prefix-size", "2"])
                generate_main(data_dir, ["--retain-dropout"])

    def test_eval_bleu(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_eval_bleu") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    [
                        "--eval-bleu",
                        "--eval-bleu-print-samples",
                        "--eval-bleu-remove-bpe",
                        "--eval-bleu-detok",
                        "space",
                        "--eval-bleu-args",
                        '{"beam": 4, "min_len": 10}',
                    ],
                )

    def test_lstm(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_lstm") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "lstm_wiseman_iwslt_de_en",
                    [
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                        "--decoder-out-embed-dim",
                        "8",
                    ],
                )
                generate_main(data_dir)

    def test_lstm_bidirectional(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_lstm_bidirectional") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "lstm",
                    [
                        "--encoder-layers",
                        "2",
                        "--encoder-bidirectional",
                        "--encoder-hidden-size",
                        "16",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                        "--decoder-out-embed-dim",
                        "8",
                        "--decoder-layers",
                        "2",
                    ],
                )
                generate_main(data_dir)

    def test_transformer(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_transformer") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "transformer_iwslt_de_en",
                    [
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                    ],
                    run_validation=True,
                )
                generate_main(data_dir)

    def test_multilingual_transformer(self):
        # test with all combinations of encoder/decoder lang tokens
        encoder_langtok_flags = [
            [],
            ["--encoder-langtok", "src"],
            ["--encoder-langtok", "tgt"],
        ]
        decoder_langtok_flags = [[], ["--decoder-langtok"]]
        with contextlib.redirect_stdout(StringIO()):
            for i in range(len(encoder_langtok_flags)):
                for j in range(len(decoder_langtok_flags)):
                    enc_ltok_flag = encoder_langtok_flags[i]
                    dec_ltok_flag = decoder_langtok_flags[j]
                    with tempfile.TemporaryDirectory(
                        f"test_multilingual_transformer_{i}_{j}"
                    ) as data_dir:
                        create_dummy_data(data_dir)
                        preprocess_translation_data(data_dir)
                        train_translation_model(
                            data_dir,
                            arch="multilingual_transformer",
                            task="multilingual_translation",
                            extra_flags=[
                                "--encoder-layers",
                                "2",
                                "--decoder-layers",
                                "2",
                                "--encoder-embed-dim",
                                "8",
                                "--decoder-embed-dim",
                                "8",
                            ]
                            + enc_ltok_flag
                            + dec_ltok_flag,
                            lang_flags=["--lang-pairs", "in-out,out-in"],
                            run_validation=True,
                            extra_valid_flags=enc_ltok_flag + dec_ltok_flag,
                        )
                        generate_main(
                            data_dir,
                            extra_flags=[
                                "--task",
                                "multilingual_translation",
                                "--lang-pairs",
                                "in-out,out-in",
                                "--source-lang",
                                "in",
                                "--target-lang",
                                "out",
                            ]
                            + enc_ltok_flag
                            + dec_ltok_flag,
                        )

    @unittest.skipIf(
        sys.platform.lower() == "darwin", "skip latent depth test on macOS"
    )
    def test_multilingual_translation_latent_depth(self):
        # test with latent depth in encoder, decoder, or both
        encoder_latent_layer = [[], ["--encoder-latent-layer"]]
        decoder_latent_layer = [[], ["--decoder-latent-layer"]]
        with contextlib.redirect_stdout(StringIO()):
            for i in range(len(encoder_latent_layer)):
                for j in range(len(decoder_latent_layer)):
                    if i == 0 and j == 0:
                        continue
                    enc_ll_flag = encoder_latent_layer[i]
                    dec_ll_flag = decoder_latent_layer[j]
                    with tempfile.TemporaryDirectory(
                        f"test_multilingual_translation_latent_depth_{i}_{j}"
                    ) as data_dir:
                        create_dummy_data(data_dir)
                        preprocess_translation_data(
                            data_dir, extra_flags=["--joined-dictionary"]
                        )
                        train_translation_model(
                            data_dir,
                            arch="latent_multilingual_transformer",
                            task="multilingual_translation_latent_depth",
                            extra_flags=[
                                "--user-dir",
                                "examples/latent_depth/latent_depth_src",
                                "--encoder-layers",
                                "2",
                                "--decoder-layers",
                                "2",
                                "--encoder-embed-dim",
                                "8",
                                "--decoder-embed-dim",
                                "8",
                                "--share-encoders",
                                "--share-decoders",
                                "--sparsity-weight",
                                "0.1",
                            ]
                            + enc_ll_flag
                            + dec_ll_flag,
                            lang_flags=["--lang-pairs", "in-out,out-in"],
                            run_validation=True,
                            extra_valid_flags=[
                                "--user-dir",
                                "examples/latent_depth/latent_depth_src",
                            ]
                            + enc_ll_flag
                            + dec_ll_flag,
                        )
                        generate_main(
                            data_dir,
                            extra_flags=[
                                "--user-dir",
                                "examples/latent_depth/latent_depth_src",
                                "--task",
                                "multilingual_translation_latent_depth",
                                "--lang-pairs",
                                "in-out,out-in",
                                "--source-lang",
                                "in",
                                "--target-lang",
                                "out",
                            ]
                            + enc_ll_flag
                            + dec_ll_flag,
                        )

    def test_translation_multi_simple_epoch(self):
        # test with all combinations of encoder/decoder lang tokens
        encoder_langtok_flags = [
            [],
            ["--encoder-langtok", "src"],
            ["--encoder-langtok", "tgt"],
        ]
        decoder_langtok_flags = [[], ["--decoder-langtok"]]
        with contextlib.redirect_stdout(StringIO()):
            for i in range(len(encoder_langtok_flags)):
                for j in range(len(decoder_langtok_flags)):
                    enc_ltok_flag = encoder_langtok_flags[i]
                    dec_ltok_flag = decoder_langtok_flags[j]
                    with tempfile.TemporaryDirectory(
                        f"test_translation_multi_simple_epoch_{i}_{j}"
                    ) as data_dir:
                        create_dummy_data(data_dir)
                        preprocess_translation_data(
                            data_dir, extra_flags=["--joined-dictionary"]
                        )
                        train_translation_model(
                            data_dir,
                            arch="transformer",
                            task="translation_multi_simple_epoch",
                            extra_flags=[
                                "--encoder-layers",
                                "2",
                                "--decoder-layers",
                                "2",
                                "--encoder-embed-dim",
                                "8",
                                "--decoder-embed-dim",
                                "8",
                                "--sampling-method",
                                "temperature",
                                "--sampling-temperature",
                                "1.5",
                                "--virtual-epoch-size",
                                "1000",
                            ]
                            + enc_ltok_flag
                            + dec_ltok_flag,
                            lang_flags=["--lang-pairs", "in-out,out-in"],
                            run_validation=True,
                            extra_valid_flags=enc_ltok_flag + dec_ltok_flag,
                        )
                        generate_main(
                            data_dir,
                            extra_flags=[
                                "--task",
                                "translation_multi_simple_epoch",
                                "--lang-pairs",
                                "in-out,out-in",
                                "--source-lang",
                                "in",
                                "--target-lang",
                                "out",
                            ]
                            + enc_ltok_flag
                            + dec_ltok_flag,
                        )

    def test_translation_multi_simple_epoch_no_vepoch(self):
        # test with all combinations of encoder/decoder lang tokens
        with contextlib.redirect_stdout(StringIO()):
            enc_ltok_flag = ["--encoder-langtok", "src"]
            dec_ltok_flag = ["--decoder-langtok"]
            with tempfile.TemporaryDirectory(
                "test_translation_multi_simple_epoch_dict"
            ) as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir, extra_flags=[])
                train_translation_model(
                    data_dir,
                    arch="transformer",
                    task="translation_multi_simple_epoch",
                    extra_flags=[
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                        "--sampling-method",
                        "temperature",
                        "--sampling-temperature",
                        "1.5",
                    ]
                    + enc_ltok_flag
                    + dec_ltok_flag,
                    lang_flags=["--lang-pairs", "in-out"],
                    run_validation=True,
                    extra_valid_flags=enc_ltok_flag + dec_ltok_flag,
                )
                generate_main(
                    data_dir,
                    extra_flags=[
                        "--task",
                        "translation_multi_simple_epoch",
                        "--lang-pairs",
                        "in-out",
                        "--source-lang",
                        "in",
                        "--target-lang",
                        "out",
                    ]
                    + enc_ltok_flag
                    + dec_ltok_flag,
                )

    def test_translation_multi_simple_epoch_dicts(self):
        # test with all combinations of encoder/decoder lang tokens
        with contextlib.redirect_stdout(StringIO()):
            enc_ltok_flag = ["--encoder-langtok", "src"]
            dec_ltok_flag = ["--decoder-langtok"]
            with tempfile.TemporaryDirectory(
                "test_translation_multi_simple_epoch_dict"
            ) as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir, extra_flags=[])
                train_translation_model(
                    data_dir,
                    arch="transformer",
                    task="translation_multi_simple_epoch",
                    extra_flags=[
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                        "--sampling-method",
                        "temperature",
                        "--sampling-temperature",
                        "1.5",
                        "--virtual-epoch-size",
                        "1000",
                    ]
                    + enc_ltok_flag
                    + dec_ltok_flag,
                    lang_flags=["--lang-pairs", "in-out"],
                    run_validation=True,
                    extra_valid_flags=enc_ltok_flag + dec_ltok_flag,
                )
                generate_main(
                    data_dir,
                    extra_flags=[
                        "--task",
                        "translation_multi_simple_epoch",
                        "--lang-pairs",
                        "in-out",
                        "--source-lang",
                        "in",
                        "--target-lang",
                        "out",
                    ]
                    + enc_ltok_flag
                    + dec_ltok_flag,
                )

    def test_translation_multi_simple_epoch_src_tgt_dict_spec(self):
        # test the specification of explicit --src-dict and --tgt-dict
        with contextlib.redirect_stdout(StringIO()):
            enc_ltok_flag = ["--encoder-langtok", "src"]
            dec_ltok_flag = ["--decoder-langtok"]
            with tempfile.TemporaryDirectory(
                "test_translation_multi_simple_epoch_dict"
            ) as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir, extra_flags=[])
                train_translation_model(
                    data_dir,
                    arch="transformer",
                    task="translation_multi_simple_epoch",
                    extra_flags=[
                        "--source-dict",
                        f"{data_dir}/dict.in.txt",
                        "--target-dict",
                        f"{data_dir}/dict.out.txt",
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                        "--sampling-method",
                        "temperature",
                        "--sampling-temperature",
                        "1.5",
                        "--virtual-epoch-size",
                        "1000",
                    ]
                    + enc_ltok_flag
                    + dec_ltok_flag,
                    lang_flags=["--lang-pairs", "in-out"],
                    run_validation=True,
                    extra_valid_flags=enc_ltok_flag + dec_ltok_flag,
                )
                generate_main(
                    data_dir,
                    extra_flags=[
                        "--task",
                        "translation_multi_simple_epoch",
                        "--lang-pairs",
                        "in-out",
                        "--source-lang",
                        "in",
                        "--target-lang",
                        "out",
                    ]
                    + enc_ltok_flag
                    + dec_ltok_flag,
                )

    def test_transformer_cross_self_attention(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_transformer_cross_self_attention"
            ) as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "transformer_iwslt_de_en",
                    [
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                        "--no-cross-attention",
                        "--cross-self-attention",
                    ],
                    run_validation=True,
                )
                generate_main(data_dir, extra_flags=[])

    def test_transformer_pointer_generator(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_transformer_pointer_generator"
            ) as data_dir:
                create_dummy_data(data_dir)
                preprocess_summarization_data(data_dir)
                train_translation_model(
                    data_dir,
                    "transformer_pointer_generator",
                    extra_flags=[
                        "--user-dir",
                        "examples/pointer_generator/pointer_generator_src",
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                        "--alignment-layer",
                        "-1",
                        "--alignment-heads",
                        "1",
                        "--source-position-markers",
                        "0",
                    ],
                    run_validation=True,
                    extra_valid_flags=[
                        "--user-dir",
                        "examples/pointer_generator/pointer_generator_src",
                    ],
                )
                generate_main(
                    data_dir,
                    extra_flags=[
                        "--user-dir",
                        "examples/pointer_generator/pointer_generator_src",
                    ],
                )

    def test_lightconv(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_lightconv") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "lightconv_iwslt_de_en",
                    [
                        "--encoder-conv-type",
                        "lightweight",
                        "--decoder-conv-type",
                        "lightweight",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                    ],
                )
                generate_main(data_dir)

    def test_dynamicconv(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_dynamicconv") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "lightconv_iwslt_de_en",
                    [
                        "--encoder-conv-type",
                        "dynamic",
                        "--decoder-conv-type",
                        "dynamic",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                    ],
                )
                generate_main(data_dir)

    def test_cmlm_transformer(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_cmlm_transformer") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir, ["--joined-dictionary"])
                train_translation_model(
                    data_dir,
                    "cmlm_transformer",
                    [
                        "--apply-bert-init",
                        "--criterion",
                        "nat_loss",
                        "--noise",
                        "full_mask",
                        "--pred-length-offset",
                        "--length-loss-factor",
                        "0.1",
                    ],
                    task="translation_lev",
                )
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "translation_lev",
                        "--iter-decode-max-iter",
                        "9",
                        "--iter-decode-eos-penalty",
                        "0",
                        "--print-step",
                    ],
                )

    def test_nonautoregressive_transformer(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_nonautoregressive_transformer"
            ) as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir, ["--joined-dictionary"])
                train_translation_model(
                    data_dir,
                    "nonautoregressive_transformer",
                    [
                        "--apply-bert-init",
                        "--src-embedding-copy",
                        "--criterion",
                        "nat_loss",
                        "--noise",
                        "full_mask",
                        "--pred-length-offset",
                        "--length-loss-factor",
                        "0.1",
                    ],
                    task="translation_lev",
                )
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "translation_lev",
                        "--iter-decode-max-iter",
                        "0",
                        "--iter-decode-eos-penalty",
                        "0",
                        "--print-step",
                    ],
                )

    # def test_nat_crf_transformer(self):
    #     with contextlib.redirect_stdout(StringIO()):
    #         with tempfile.TemporaryDirectory('test_nat_crf_transformer') as data_dir:
    #             create_dummy_data(data_dir)
    #             preprocess_translation_data(data_dir, ['--joined-dictionary'])
    #             train_translation_model(data_dir, 'nacrf_transformer', [
    #                 '--apply-bert-init', '--criterion',
    #                 'nat_loss', '--noise', 'full_mask', '--pred-length-offset',
    #                 '--length-loss-factor', '0.1',
    #                 '--word-ins-loss-factor', '0.5',
    #                 '--crf-lowrank-approx', '1',
    #                 '--crf-beam-approx', '1'
    #             ], task='translation_lev')
    #             generate_main(data_dir, [
    #                 '--task', 'translation_lev',
    #                 '--iter-decode-max-iter', '0',
    #                 '--iter-decode-eos-penalty', '0',
    #                 '--print-step',
    #             ])

    def test_iterative_nonautoregressive_transformer(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_iterative_nonautoregressive_transformer"
            ) as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir, ["--joined-dictionary"])
                train_translation_model(
                    data_dir,
                    "iterative_nonautoregressive_transformer",
                    [
                        "--apply-bert-init",
                        "--src-embedding-copy",
                        "--criterion",
                        "nat_loss",
                        "--noise",
                        "full_mask",
                        "--stochastic-approx",
                        "--dae-ratio",
                        "0.5",
                        "--train-step",
                        "3",
                    ],
                    task="translation_lev",
                )
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "translation_lev",
                        "--iter-decode-max-iter",
                        "9",
                        "--iter-decode-eos-penalty",
                        "0",
                        "--print-step",
                    ],
                )

    def test_insertion_transformer(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_insertion_transformer") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir, ["--joined-dictionary"])
                train_translation_model(
                    data_dir,
                    "insertion_transformer",
                    [
                        "--apply-bert-init",
                        "--criterion",
                        "nat_loss",
                        "--noise",
                        "random_mask",
                    ],
                    task="translation_lev",
                )
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "translation_lev",
                        "--iter-decode-max-iter",
                        "9",
                        "--iter-decode-eos-penalty",
                        "0",
                        "--print-step",
                    ],
                )

    def test_mixture_of_experts(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_moe") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "transformer_iwslt_de_en",
                    [
                        "--task",
                        "translation_moe",
                        "--user-dir",
                        "examples/translation_moe/translation_moe_src",
                        "--method",
                        "hMoElp",
                        "--mean-pool-gating-network",
                        "--num-experts",
                        "3",
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                    ],
                )
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "translation_moe",
                        "--user-dir",
                        "examples/translation_moe/translation_moe_src",
                        "--method",
                        "hMoElp",
                        "--mean-pool-gating-network",
                        "--num-experts",
                        "3",
                        "--gen-expert",
                        "0",
                    ],
                )

    def test_alignment(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_alignment") as data_dir:
                create_dummy_data(data_dir, alignment=True)
                preprocess_translation_data(data_dir, ["--align-suffix", "align"])
                train_translation_model(
                    data_dir,
                    "transformer_align",
                    [
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                        "--load-alignments",
                        "--alignment-layer",
                        "1",
                        "--criterion",
                        "label_smoothed_cross_entropy_with_alignment",
                    ],
                    run_validation=True,
                )
                generate_main(data_dir)

    def test_laser_lstm(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_laser_lstm") as data_dir:
                laser_config_file = create_laser_data_and_config_json(data_dir)
                train_translation_model(
                    laser_config_file.name,
                    "laser_lstm",
                    [
                        "--user-dir",
                        "examples/laser/laser_src",
                        "--weighting-alpha",
                        "0.3",
                        "--encoder-bidirectional",
                        "--encoder-hidden-size",
                        "512",
                        "--encoder-layers",
                        "5",
                        "--decoder-layers",
                        "1",
                        "--encoder-embed-dim",
                        "320",
                        "--decoder-embed-dim",
                        "320",
                        "--decoder-lang-embed-dim",
                        "32",
                        "--save-dir",
                        data_dir,
                        "--disable-validation",
                    ],
                    task="laser",
                    lang_flags=[],
                )

    def test_laser_transformer(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_laser_transformer") as data_dir:
                laser_config_file = create_laser_data_and_config_json(data_dir)
                train_translation_model(
                    laser_config_file.name,
                    "laser_transformer",
                    [
                        "--user-dir",
                        "examples/laser/laser_src",
                        "--weighting-alpha",
                        "0.3",
                        "--encoder-embed-dim",
                        "320",
                        "--decoder-embed-dim",
                        "320",
                        "--decoder-lang-embed-dim",
                        "32",
                        "--save-dir",
                        data_dir,
                        "--disable-validation",
                    ],
                    task="laser",
                    lang_flags=[],
                )

    def test_alignment_full_context(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_alignment") as data_dir:
                create_dummy_data(data_dir, alignment=True)
                preprocess_translation_data(data_dir, ["--align-suffix", "align"])
                train_translation_model(
                    data_dir,
                    "transformer_align",
                    [
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                        "--load-alignments",
                        "--alignment-layer",
                        "1",
                        "--criterion",
                        "label_smoothed_cross_entropy_with_alignment",
                        "--full-context-alignment",
                    ],
                    run_validation=True,
                )
                generate_main(data_dir)

    def test_transformer_layerdrop(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_transformer_layerdrop") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "transformer_iwslt_de_en",
                    [
                        "--encoder-layers",
                        "3",
                        "--decoder-layers",
                        "3",
                        "--encoder-embed-dim",
                        "8",
                        "--decoder-embed-dim",
                        "8",
                        "--encoder-layerdrop",
                        "0.01",
                        "--decoder-layerdrop",
                        "0.01",
                    ],
                )
                generate_main(data_dir)
                generate_main(
                    data_dir,
                    [
                        "--model-overrides",
                        "{'encoder_layers_to_keep':'0,2','decoder_layers_to_keep':'1'}",
                    ],
                )


class TestStories(unittest.TestCase):
    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_fconv_self_att_wp(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fconv_self_att_wp") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                config = [
                    "--encoder-layers",
                    "[(128, 3)] * 2",
                    "--decoder-layers",
                    "[(128, 3)] * 2",
                    "--decoder-attention",
                    "True",
                    "--encoder-attention",
                    "False",
                    "--gated-attention",
                    "True",
                    "--self-attention",
                    "True",
                    "--project-input",
                    "True",
                    "--encoder-embed-dim",
                    "8",
                    "--decoder-embed-dim",
                    "8",
                    "--decoder-out-embed-dim",
                    "8",
                    "--multihead-self-attention-nheads",
                    "2",
                ]
                train_translation_model(data_dir, "fconv_self_att_wp", config)
                generate_main(data_dir)

                # fusion model
                os.rename(
                    os.path.join(data_dir, "checkpoint_last.pt"),
                    os.path.join(data_dir, "pretrained.pt"),
                )
                config.extend(
                    [
                        "--pretrained",
                        "True",
                        "--pretrained-checkpoint",
                        os.path.join(data_dir, "pretrained.pt"),
                        "--save-dir",
                        os.path.join(data_dir, "fusion_model"),
                    ]
                )
                train_translation_model(data_dir, "fconv_self_att_wp", config)


class TestLanguageModeling(unittest.TestCase):
    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_fconv_lm(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fconv_lm") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                train_language_model(
                    data_dir,
                    "fconv_lm",
                    [
                        "--decoder-layers",
                        "[(850, 3)] * 2 + [(1024,4)]",
                        "--decoder-embed-dim",
                        "280",
                        "--optimizer",
                        "nag",
                        "--lr",
                        "0.1",
                    ],
                )
                eval_lm_main(data_dir)
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "language_modeling",
                        "--sample-break-mode",
                        "eos",
                        "--tokens-per-sample",
                        "500",
                    ],
                )

    def test_transformer_lm(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_transformer_lm") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                train_language_model(
                    data_dir,
                    "transformer_lm",
                    ["--add-bos-token", "--nval", "1"],
                    run_validation=True,
                )
                eval_lm_main(data_dir)
                eval_lm_main(data_dir, extra_flags=["--context-window", "25"])
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "language_modeling",
                        "--sample-break-mode",
                        "eos",
                        "--tokens-per-sample",
                        "500",
                    ],
                )

    def test_normformer_lm(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_transformer_lm") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                train_language_model(
                    data_dir,
                    "transformer_lm",
                    [
                        "--add-bos-token",
                        "--nval",
                        "1",
                        "--scale-fc",
                        "--scale-heads",
                        "--scale-attn",
                    ],
                    run_validation=True,
                )
                eval_lm_main(data_dir)
                eval_lm_main(data_dir, extra_flags=["--context-window", "25"])
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "language_modeling",
                        "--sample-break-mode",
                        "eos",
                        "--tokens-per-sample",
                        "500",
                    ],
                )

    def test_transformer_lm_with_adaptive_softmax(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_transformer_lm_with_adaptive_softmax"
            ) as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                train_language_model(
                    data_dir,
                    "transformer_lm",
                    [
                        "--add-bos-token",
                        "--criterion",
                        "adaptive_loss",
                        "--adaptive-softmax-cutoff",
                        "5,10,15",
                    ],
                    run_validation=True,
                )
                eval_lm_main(data_dir)
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "language_modeling",
                        "--sample-break-mode",
                        "eos",
                        "--tokens-per-sample",
                        "500",
                    ],
                )

    def test_lightconv_lm(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_lightconv_lm") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                train_language_model(
                    data_dir,
                    "lightconv_lm",
                    ["--add-bos-token"],
                    run_validation=True,
                )
                eval_lm_main(data_dir)
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "language_modeling",
                        "--sample-break-mode",
                        "eos",
                        "--tokens-per-sample",
                        "500",
                    ],
                )

    def test_lstm_lm(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_lstm_lm") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                train_language_model(
                    data_dir,
                    "lstm_lm",
                    ["--add-bos-token"],
                    run_validation=True,
                )
                eval_lm_main(data_dir)
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "language_modeling",
                        "--sample-break-mode",
                        "eos",
                        "--tokens-per-sample",
                        "500",
                    ],
                )

    def test_lstm_lm_residuals(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_lstm_lm_residuals") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                train_language_model(
                    data_dir,
                    "lstm_lm",
                    ["--add-bos-token", "--residuals"],
                    run_validation=True,
                )
                eval_lm_main(data_dir)
                generate_main(
                    data_dir,
                    [
                        "--task",
                        "language_modeling",
                        "--sample-break-mode",
                        "eos",
                        "--tokens-per-sample",
                        "500",
                    ],
                )

    @unittest.skipIf(not has_hf_transformers, "skip test if transformers is missing")
    def test_transformer_xl_bptt_lm(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_transformer_xl_bptt_lm") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                task_flags = [
                    "--user-dir",
                    "examples/truncated_bptt",
                    "--task",
                    "truncated_bptt_lm",
                    "--batch-size",
                    "2",
                    "--tokens-per-sample",
                    "50",
                ]
                train_language_model(
                    data_dir=data_dir,
                    arch="transformer_xl",
                    extra_flags=task_flags
                    + [
                        "--n-layer",
                        "2",
                    ],
                    task="truncated_bptt_lm",
                    run_validation=True,
                    extra_valid_flags=task_flags,
                )
                eval_lm_main(data_dir, extra_flags=task_flags)
                # Train with activation offloading
                train_language_model(
                    data_dir=data_dir,
                    arch="transformer_xl",
                    extra_flags=task_flags
                    + [
                        "--n-layer",
                        "2",
                        "--offload-activations",
                    ],
                    task="truncated_bptt_lm",
                    run_validation=True,
                    extra_valid_flags=task_flags,
                )


class TestMaskedLanguageModel(unittest.TestCase):
    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_legacy_masked_lm(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_legacy_mlm") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                train_legacy_masked_language_model(data_dir, "masked_lm")

    def test_roberta_masked_lm(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_roberta_mlm") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                train_masked_lm(
                    data_dir, "roberta_base", extra_flags=["--encoder-layers", "2"]
                )

    def test_roberta_sentence_prediction(self):
        num_classes = 3
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_roberta_head") as data_dir:
                create_dummy_roberta_head_data(data_dir, num_classes=num_classes)
                preprocess_lm_data(os.path.join(data_dir, "input0"))
                preprocess_lm_data(os.path.join(data_dir, "label"))
                train_roberta_head(data_dir, "roberta_base", num_classes=num_classes)

    def test_roberta_regression_single(self):
        num_classes = 1
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_roberta_regression_single"
            ) as data_dir:
                create_dummy_roberta_head_data(
                    data_dir, num_classes=num_classes, regression=True
                )
                preprocess_lm_data(os.path.join(data_dir, "input0"))
                train_roberta_head(
                    data_dir,
                    "roberta_base",
                    num_classes=num_classes,
                    extra_flags=["--regression-target"],
                )

    def test_roberta_regression_multiple(self):
        num_classes = 3
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_roberta_regression_multiple"
            ) as data_dir:
                create_dummy_roberta_head_data(
                    data_dir, num_classes=num_classes, regression=True
                )
                preprocess_lm_data(os.path.join(data_dir, "input0"))
                train_roberta_head(
                    data_dir,
                    "roberta_base",
                    num_classes=num_classes,
                    extra_flags=["--regression-target"],
                )

    def test_linformer_roberta_masked_lm(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_linformer_roberta_mlm") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                train_masked_lm(
                    data_dir,
                    "linformer_roberta_base",
                    extra_flags=[
                        "--user-dir",
                        "examples/linformer/linformer_src",
                        "--encoder-layers",
                        "2",
                    ],
                )

    def test_linformer_roberta_sentence_prediction(self):
        num_classes = 3
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_linformer_roberta_head") as data_dir:
                create_dummy_roberta_head_data(data_dir, num_classes=num_classes)
                preprocess_lm_data(os.path.join(data_dir, "input0"))
                preprocess_lm_data(os.path.join(data_dir, "label"))
                train_roberta_head(
                    data_dir,
                    "linformer_roberta_base",
                    num_classes=num_classes,
                    extra_flags=["--user-dir", "examples/linformer/linformer_src"],
                )

    def test_linformer_roberta_regression_single(self):
        num_classes = 1
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_linformer_roberta_regression_single"
            ) as data_dir:
                create_dummy_roberta_head_data(
                    data_dir, num_classes=num_classes, regression=True
                )
                preprocess_lm_data(os.path.join(data_dir, "input0"))
                train_roberta_head(
                    data_dir,
                    "linformer_roberta_base",
                    num_classes=num_classes,
                    extra_flags=[
                        "--regression-target",
                        "--user-dir",
                        "examples/linformer/linformer_src",
                    ],
                )

    def test_linformer_roberta_regression_multiple(self):
        num_classes = 3
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_linformer_roberta_regression_multiple"
            ) as data_dir:
                create_dummy_roberta_head_data(
                    data_dir, num_classes=num_classes, regression=True
                )
                preprocess_lm_data(os.path.join(data_dir, "input0"))
                train_roberta_head(
                    data_dir,
                    "linformer_roberta_base",
                    num_classes=num_classes,
                    extra_flags=[
                        "--regression-target",
                        "--user-dir",
                        "examples/linformer/linformer_src",
                    ],
                )

    def _test_pretrained_masked_lm_for_translation(self, learned_pos_emb, encoder_only):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_mlm") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                train_legacy_masked_language_model(
                    data_dir,
                    arch="masked_lm",
                    extra_args=("--encoder-learned-pos",) if learned_pos_emb else (),
                )
                with tempfile.TemporaryDirectory(
                    "test_mlm_translation"
                ) as translation_dir:
                    create_dummy_data(translation_dir)
                    preprocess_translation_data(
                        translation_dir, extra_flags=["--joined-dictionary"]
                    )
                    # Train transformer with data_dir/checkpoint_last.pt
                    train_translation_model(
                        translation_dir,
                        arch="transformer_from_pretrained_xlm",
                        extra_flags=[
                            "--decoder-layers",
                            "1",
                            "--decoder-embed-dim",
                            "32",
                            "--decoder-attention-heads",
                            "1",
                            "--decoder-ffn-embed-dim",
                            "32",
                            "--encoder-layers",
                            "1",
                            "--encoder-embed-dim",
                            "32",
                            "--encoder-attention-heads",
                            "1",
                            "--encoder-ffn-embed-dim",
                            "32",
                            "--pretrained-xlm-checkpoint",
                            "{}/checkpoint_last.pt".format(data_dir),
                            "--activation-fn",
                            "gelu",
                            "--max-source-positions",
                            "500",
                            "--max-target-positions",
                            "500",
                        ]
                        + (
                            ["--encoder-learned-pos", "--decoder-learned-pos"]
                            if learned_pos_emb
                            else []
                        )
                        + (["--init-encoder-only"] if encoder_only else []),
                        task="translation_from_pretrained_xlm",
                    )

    def test_pretrained_masked_lm_for_translation_learned_pos_emb(self):
        self._test_pretrained_masked_lm_for_translation(True, False)

    def test_pretrained_masked_lm_for_translation_sinusoidal_pos_emb(self):
        self._test_pretrained_masked_lm_for_translation(False, False)

    def test_pretrained_masked_lm_for_translation_encoder_only(self):
        self._test_pretrained_masked_lm_for_translation(True, True)

    def test_r4f_roberta(self):
        num_classes = 3
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_r4f_roberta_head") as data_dir:
                create_dummy_roberta_head_data(data_dir, num_classes=num_classes)
                preprocess_lm_data(os.path.join(data_dir, "input0"))
                preprocess_lm_data(os.path.join(data_dir, "label"))
                train_roberta_head(
                    data_dir,
                    "roberta_base",
                    num_classes=num_classes,
                    extra_flags=[
                        "--user-dir",
                        "examples/rxf/rxf_src",
                        "--criterion",
                        "sentence_prediction_r3f",
                        "--spectral-norm-classification-head",
                    ],
                )
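

# Trains the legacy cross-lingual masked LM task for one epoch on the dummy
# data in `data_dir`, mirroring a fairseq-train invocation.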
def train_legacy_masked_language_model(data_dir, arch, extra_args=()):
    train_parser = options.get_training_parser()
    # TODO: langs should be in and out right?
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            "cross_lingual_lm",
            data_dir,
            "--arch",
            arch,
            # Optimizer args
            "--optimizer",
            "adam",
            "--lr-scheduler",
            "reduce_lr_on_plateau",
            "--lr-shrink",
            "0.5",
            "--lr",
            "0.0001",
            "--stop-min-lr",
            "1e-09",
            # dropout, attention args
            "--dropout",
            "0.1",
            "--attention-dropout",
            "0.1",
            # MLM args
            "--criterion",
            "legacy_masked_lm_loss",
            "--masked-lm-only",
            "--monolingual-langs",
            "in,out",
            "--num-segment",
            "5",
            # Transformer args: use a small transformer model for fast training
            "--encoder-layers",
            "1",
            "--encoder-embed-dim",
            "32",
            "--encoder-attention-heads",
            "1",
            "--encoder-ffn-embed-dim",
            "32",
            # Other training args
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--dataset-impl",
            "raw",
            "--num-workers",
            "0",
        ]
        + list(extra_args),
    )
    train.main(train_args)


class TestOptimizers(unittest.TestCase):
    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_optimizers(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_optimizers") as data_dir:
                # Use just a bit of data and tiny model to keep this test runtime reasonable
                create_dummy_data(data_dir, num_examples=10, maxlen=5)
                preprocess_translation_data(data_dir)
                optimizers = ["adafactor", "adam", "nag", "adagrad", "sgd", "adadelta"]
                last_checkpoint = os.path.join(data_dir, "checkpoint_last.pt")
                for optimizer in optimizers:
                    if os.path.exists(last_checkpoint):
                        os.remove(last_checkpoint)
                    train_translation_model(
                        data_dir,
                        "lstm",
                        [
                            "--required-batch-size-multiple",
                            "1",
                            "--encoder-layers",
                            "1",
                            "--encoder-hidden-size",
                            "32",
                            "--decoder-layers",
                            "1",
                            "--optimizer",
                            optimizer,
                        ],
                    )
                    generate_main(data_dir)
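

# Returns the stats dict from the most recent JSON log line emitted by the
# named logger (e.g. "train" or "valid") among the captured records.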
def read_last_log_entry(
    logs: List[logging.LogRecord], logger_name: str
) -> Dict[str, float]:
    for x in reversed(logs):
        if x.name == logger_name:
            return json.loads(x.message)
    raise ValueError(f"No entries from {logger_name} found in captured logs")
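

# Activation checkpointing/offloading trade memory for recomputation and
# should be numerically transparent, so these tests compare train and valid
# losses against a baseline run of the same tiny transformer.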
class TestActivationCheckpointing(unittest.TestCase):
    base_flags = [
        "--encoder-layers",
        "2",
        "--decoder-layers",
        "2",
        "--encoder-embed-dim",
        "8",
        "--decoder-embed-dim",
        "8",
        "--restore-file",
        "x.pt",
        "--log-format",
        "json",
        "--log-interval",
        "1",
        "--max-update",
        "2",
    ]

    def _train(self, data_dir, extra_flags):
        with self.assertLogs() as logs:
            train_translation_model(
                data_dir,
                "transformer_iwslt_de_en",
                self.base_flags + extra_flags,
                run_validation=True,
                extra_valid_flags=["--log-format", "json"],
            )
        return logs.records

    def test_activation_offloading_does_not_change_metrics(self):
        """Neither --checkpoint-activations nor --offload-activations should change loss"""
        with tempfile.TemporaryDirectory("test_transformer_with_act_cpt") as data_dir:

            with self.assertLogs():
                create_dummy_data(data_dir, num_examples=20)
                preprocess_translation_data(data_dir)
            offload_logs = self._train(data_dir, ["--offload-activations"])
            baseline_logs = self._train(data_dir, [])

            assert len(baseline_logs) == len(offload_logs)

            baseline_valid_stats = read_last_log_entry(baseline_logs, "valid")
            offload_valid_stats = read_last_log_entry(offload_logs, "valid")
            baseline_train_stats = read_last_log_entry(baseline_logs, "train")
            offload_train_stats = read_last_log_entry(offload_logs, "train")

            assert (
                baseline_train_stats["train_loss"] == offload_train_stats["train_loss"]
            )
            assert (
                baseline_valid_stats["valid_loss"] == offload_valid_stats["valid_loss"]
            )

    def test_activation_checkpointing_does_not_change_metrics(self):
        """--checkpoint-activations should not change loss"""

        with tempfile.TemporaryDirectory("test_transformer_with_act_cpt") as data_dir:
            with self.assertLogs():
                create_dummy_data(data_dir, num_examples=20)
                preprocess_translation_data(data_dir)
            ckpt_logs = self._train(data_dir, ["--checkpoint-activations"])
            baseline_logs = self._train(data_dir, [])
            assert len(baseline_logs) == len(ckpt_logs)

            baseline_train_stats = read_last_log_entry(baseline_logs, "train")
            ckpt_train_stats = read_last_log_entry(ckpt_logs, "train")
            assert baseline_train_stats["train_loss"] == ckpt_train_stats["train_loss"]

            baseline_valid_stats = read_last_log_entry(baseline_logs, "valid")
            ckpt_valid_stats = read_last_log_entry(ckpt_logs, "valid")
            assert baseline_valid_stats["valid_loss"] == ckpt_valid_stats["valid_loss"]
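

# Writes random sentence-prediction data in the layout the RoBERTa head
# tests expect: token inputs under input0/ and labels under label/, with
# real-valued labels when regression=True.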
def create_dummy_roberta_head_data(
    data_dir, num_examples=100, maxlen=10, num_classes=2, regression=False
):
    input_dir = "input0"

    def _create_dummy_data(filename):
        random_data = torch.rand(num_examples * maxlen)
        input_data = 97 + torch.floor(26 * random_data).int()
        if regression:
            output_data = torch.rand((num_examples, num_classes))
        else:
            output_data = 1 + torch.floor(num_classes * torch.rand(num_examples)).int()
        with open(os.path.join(data_dir, input_dir, filename + ".out"), "w") as f_in:
            label_filename = filename + ".label" if regression else filename + ".out"
            with open(os.path.join(data_dir, "label", label_filename), "w") as f_out:
                offset = 0
                for i in range(num_examples):
                    # write example input
                    ex_len = random.randint(1, maxlen)
                    ex_str = " ".join(map(chr, input_data[offset : offset + ex_len]))
                    print(ex_str, file=f_in)
                    # write example label
                    if regression:
                        class_str = " ".join(map(str, output_data[i].numpy()))
                        print(class_str, file=f_out)
                    else:
                        class_str = "class{}".format(output_data[i])
                        print(class_str, file=f_out)
                    offset += ex_len

    os.mkdir(os.path.join(data_dir, input_dir))
    os.mkdir(os.path.join(data_dir, "label"))
    _create_dummy_data("train")
    _create_dummy_data("valid")
    _create_dummy_data("test")
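

# One-epoch masked LM training run over the preprocessed dummy data.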
def train_masked_lm(data_dir, arch, extra_flags=None):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            "masked_lm",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "masked_lm",
            "--batch-size",
            "500",
            "--save-dir",
            data_dir,
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
        ]
        + (extra_flags or []),
    )
    train.main(train_args)
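

# One-epoch sentence-prediction training run with a classification (or
# regression) head on top of the given architecture.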
def train_roberta_head(data_dir, arch, num_classes=2, extra_flags=None):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            "sentence_prediction",
            data_dir,
            "--arch",
            arch,
            "--encoder-layers",
            "2",
            "--num-classes",
            str(num_classes),
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "sentence_prediction",
            "--max-tokens",
            "500",
            "--max-positions",
            "500",
            "--batch-size",
            "500",
            "--save-dir",
            data_dir,
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
        ]
        + (extra_flags or []),
    )
    train.main(train_args)
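

# Scores the data with the last saved checkpoint via the fairseq-eval-lm
# entry point.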
def eval_lm_main(data_dir, extra_flags=None):
    eval_lm_parser = options.get_eval_lm_parser()
    eval_lm_args = options.parse_args_and_arch(
        eval_lm_parser,
        [
            data_dir,
            "--path",
            os.path.join(data_dir, "checkpoint_last.pt"),
            "--no-progress-bar",
            "--num-workers",
            "0",
        ]
        + (extra_flags or []),
    )
    eval_lm.main(eval_lm_args)


if __name__ == "__main__":
    unittest.main()