From cfbf0dddbc2f06b4d2975655a3959d13e5ba6667 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Thu, 21 Jan 2021 07:32:08 -0800
Subject: [PATCH] Small changes to make tests more reliable (#1572)

Summary:
After this, `python setup.py test` should be more reliable (including when
multiple GPUs are present)

Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/1572

Reviewed By: alexeib

Differential Revision: D25984113

Pulled By: myleott

fbshipit-source-id: 7fef27ae90c079c07f592ed9fb350ccf8b56d23d
---
 README.md                               |  4 ++--
 fairseq/data/data_utils_fast.pyx        |  3 +++
 fairseq/distributed_utils.py            |  5 +----
 fairseq/models/roberta/hub_interface.py |  2 +-
 fairseq/ngram_repeat_block.py           |  2 +-
 fairseq/sequence_generator.py           | 12 ++++++----
 setup.py                                | 29 +++++++++++++------------
 tests/distributed/utils.py              |  1 +
 tests/test_binaries.py                  |  5 +++--
 tests/test_bmuf.py                      | 17 ++++++---------
 10 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/README.md b/README.md
index c22abba8c..5fedac7ee 100644
--- a/README.md
+++ b/README.md
@@ -136,8 +136,8 @@ pip install --editable ./
 # on MacOS:
 # CFLAGS="-stdlib=libc++" pip install --editable ./
 
-# to install the latest stable release (0.10.1)
-# pip install fairseq==0.10.1
+# to install the latest stable release (0.10.x)
+# pip install fairseq
 ```
 
 * **For faster training** install NVIDIA's [apex](https://github.com/NVIDIA/apex) library:
diff --git a/fairseq/data/data_utils_fast.pyx b/fairseq/data/data_utils_fast.pyx
index d197d3f00..c61f31d6b 100644
--- a/fairseq/data/data_utils_fast.pyx
+++ b/fairseq/data/data_utils_fast.pyx
@@ -24,6 +24,9 @@ cpdef list batch_by_size_vec(
     int64_t max_sentences,
     int32_t bsz_mult,
 ):
+    if indices.shape[0] == 0:
+        return []
+
     assert max_tokens <= 0 or np.max(num_tokens_vec) <= max_tokens, (
         f"Sentences lengths should not exceed max_tokens={max_tokens}"
     )
diff --git a/fairseq/distributed_utils.py b/fairseq/distributed_utils.py
index 79deba5b9..37822362d 100644
--- a/fairseq/distributed_utils.py
+++ b/fairseq/distributed_utils.py
@@ -301,10 +301,6 @@ def distributed_main(i, main, cfg: FairseqConfig, kwargs):
 
     main(cfg, **kwargs)
 
-    # make sure checkpoints finish saving
-    if torch.distributed.is_initialized():
-        torch.distributed.barrier()
-
 
 def call_main(cfg: FairseqConfig, main, **kwargs):
     if cfg.distributed_training.distributed_init_method is None:
@@ -323,6 +319,7 @@ def call_main(cfg: FairseqConfig, main, **kwargs):
                 torch.cuda.device_count(),
                 cfg.distributed_training.distributed_world_size,
             ),
+            join=True,
         )
     else:
         distributed_main(cfg.distributed_training.device_id, main, cfg, kwargs)
diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py
index 0c723f06d..c9af434bd 100644
--- a/fairseq/models/roberta/hub_interface.py
+++ b/fairseq/models/roberta/hub_interface.py
@@ -173,7 +173,7 @@ class RobertaHubInterface(nn.Module):
             add_if_not_exist=False,
         )
 
-        masked_index = (tokens == self.task.mask_idx).nonzero()
+        masked_index = (tokens == self.task.mask_idx).nonzero(as_tuple=False)
         if tokens.dim() == 1:
             tokens = tokens.unsqueeze(0)
 
diff --git a/fairseq/ngram_repeat_block.py b/fairseq/ngram_repeat_block.py
index 856c9e64f..ed2d74463 100644
--- a/fairseq/ngram_repeat_block.py
+++ b/fairseq/ngram_repeat_block.py
@@ -19,7 +19,7 @@ except ImportError:
 
 def is_cuda_extension_usable() -> bool:
     """Check whether ngram_repeat_block_cuda is built properly"""
-    if not EXTENSION_BUILT:
+    if not EXTENSION_BUILT or not torch.cuda.is_available():
         return False
     bsz = 2
     tokens = torch.tensor([[4, 4, 3, 2], [1, 2, 3, 4]], dtype=torch.long, device="cuda")
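Note: the hub_interface.py hunk above switches to nonzero(as_tuple=False) because
recent PyTorch releases emit a deprecation warning when Tensor.nonzero() is called
without the as_tuple argument. A minimal standalone sketch of the behavior being
pinned down (the tensor values here are illustrative, not from fairseq):

    import torch

    tokens = torch.tensor([5, 3, 5])
    mask_idx = 5

    # Calling .nonzero() with no argument warns on recent PyTorch; the
    # explicit form below is equivalent and silent.
    masked_index = (tokens == mask_idx).nonzero(as_tuple=False)
    print(masked_index)  # tensor([[0], [2]]) -- an (N, ndim) index tensor
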
diff --git a/fairseq/sequence_generator.py b/fairseq/sequence_generator.py
index b0249888c..117c6116f 100644
--- a/fairseq/sequence_generator.py
+++ b/fairseq/sequence_generator.py
@@ -86,8 +86,10 @@ class SequenceGenerator(nn.Module):
         self.temperature = temperature
         self.match_source_len = match_source_len
 
-        self.no_repeat_ngram_size = no_repeat_ngram_size
-        self.repeat_ngram_blocker = NGramRepeatBlock(no_repeat_ngram_size)
+        if no_repeat_ngram_size > 0:
+            self.repeat_ngram_blocker = NGramRepeatBlock(no_repeat_ngram_size)
+        else:
+            self.repeat_ngram_blocker = None
 
         assert temperature > 0, "--temperature must be greater than 0"
 
@@ -373,8 +375,10 @@ class SequenceGenerator(nn.Module):
             if self.should_set_src_lengths:
                 self.search.set_src_lengths(src_lengths)
 
-            if self.no_repeat_ngram_size > 0:
-                lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, beam_size, step)
+            if self.repeat_ngram_blocker is not None:
+                lprobs = self.repeat_ngram_blocker(
+                    tokens, lprobs, bsz, beam_size, step
+                )
 
             # Shape: (batch, cand_size)
             cand_scores, cand_indices, cand_beams = self.search.step(
diff --git a/setup.py b/setup.py
index 08fe0dccc..d1a976104 100644
--- a/setup.py
+++ b/setup.py
@@ -242,18 +242,19 @@ def get_files(path, relative_to="fairseq"):
     return all_files
 
 
-try:
-    # symlink examples into fairseq package so package_data accepts them
-    fairseq_examples = os.path.join("fairseq", "examples")
-    if "build_ext" not in sys.argv[1:] and not os.path.exists(fairseq_examples):
-        os.symlink(os.path.join("..", "examples"), fairseq_examples)
+if __name__ == "__main__":
+    try:
+        # symlink examples into fairseq package so package_data accepts them
+        fairseq_examples = os.path.join("fairseq", "examples")
+        if "build_ext" not in sys.argv[1:] and not os.path.exists(fairseq_examples):
+            os.symlink(os.path.join("..", "examples"), fairseq_examples)
 
-    package_data = {
-        "fairseq": (
-            get_files(fairseq_examples) + get_files(os.path.join("fairseq", "config"))
-        )
-    }
-    do_setup(package_data)
-finally:
-    if "build_ext" not in sys.argv[1:] and os.path.exists(fairseq_examples):
-        os.unlink(fairseq_examples)
+        package_data = {
+            "fairseq": (
+                get_files(fairseq_examples) + get_files(os.path.join("fairseq", "config"))
+            )
+        }
+        do_setup(package_data)
+    finally:
+        if "build_ext" not in sys.argv[1:] and os.path.exists(fairseq_examples):
+            os.unlink(fairseq_examples)
diff --git a/tests/distributed/utils.py b/tests/distributed/utils.py
index d2b3ddb1f..c8040392a 100644
--- a/tests/distributed/utils.py
+++ b/tests/distributed/utils.py
@@ -17,6 +17,7 @@ def spawn_and_init(fn, world_size, args=None):
             fn=functools.partial(init_and_run, fn, args),
             args=(world_size, tmp_file.name,),
             nprocs=world_size,
+            join=True,
         )
 
 
diff --git a/tests/test_binaries.py b/tests/test_binaries.py
index ddfc1c4db..967b1bf7b 100644
--- a/tests/test_binaries.py
+++ b/tests/test_binaries.py
@@ -1623,8 +1623,9 @@ class TestActivationCheckpointing(unittest.TestCase):
 
         with tempfile.TemporaryDirectory("test_transformer_with_act_cpt") as data_dir:
-            create_dummy_data(data_dir, num_examples=20)
-            preprocess_translation_data(data_dir)
+            with self.assertLogs():
+                create_dummy_data(data_dir, num_examples=20)
+                preprocess_translation_data(data_dir)
             ckpt_logs = _train(["--checkpoint-activations"])
             baseline_logs = _train([])
             assert len(baseline_logs) == len(ckpt_logs)
 
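Note: the sequence_generator.py hunks above replace a per-step integer check with
an optional component: the blocker is constructed once when no_repeat_ngram_size
is positive and stored as None otherwise, so the decoding hot loop only pays for
an identity test. A toy sketch of the idiom (class and method names here are
illustrative, not fairseq's):

    from typing import Optional


    class NGramBlocker:
        def __init__(self, ngram_size: int):
            self.ngram_size = ngram_size

        def __call__(self, scores):
            return scores  # placeholder for the actual masking logic


    class Generator:
        def __init__(self, no_repeat_ngram_size: int = 0):
            # Build the helper only if it will be used; otherwise store None.
            self.blocker: Optional[NGramBlocker] = (
                NGramBlocker(no_repeat_ngram_size)
                if no_repeat_ngram_size > 0
                else None
            )

        def step(self, scores):
            if self.blocker is not None:
                scores = self.blocker(scores)
            return scores
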
diff --git a/tests/test_bmuf.py b/tests/test_bmuf.py
index e7aa6da1c..785da37bc 100644
--- a/tests/test_bmuf.py
+++ b/tests/test_bmuf.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import argparse
+import functools
 import random
 import unittest
 from multiprocessing import Manager
@@ -141,16 +142,12 @@ class TestBMUF(unittest.TestCase):
     def bmuf_process(self, cfg, args, iterations):
         processes = []
         results = Manager().dict()
-        ctx = torch.multiprocessing.get_context("spawn")
-        for rank in range(args.distributed_world_size):
-            p = ctx.Process(
-                target=single_gpu_training, args=(cfg, args, rank, iterations, results)
-            )
-            p.start()
-            processes.append(p)
-
-        for p in processes:
-            p.join()
+        torch.multiprocessing.spawn(
+            fn=functools.partial(single_gpu_training, cfg, args),
+            args=(iterations, results),
+            nprocs=args.distributed_world_size,
+            join=True,
+        )
         return results
 
     def test_bmuf_sync(self):
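Note: the test_bmuf.py hunk relies on two torch.multiprocessing.spawn conventions:
spawn invokes fn with the process rank as its first positional argument (followed
by the contents of the args= tuple), and join=True, which is also the documented
default, blocks until every worker exits and re-raises worker exceptions in the
parent, which is what surfaces failures to the test runner. A minimal runnable
sketch of the same pattern (the worker below is illustrative, standing in for
single_gpu_training):

    import functools
    from multiprocessing import Manager

    import torch


    def worker(cfg, args, rank, iterations, results):
        # functools.partial pre-binds (cfg, args); spawn then supplies `rank`,
        # and (iterations, results) arrive from the `args=` tuple.
        results[rank] = iterations


    if __name__ == "__main__":
        results = Manager().dict()
        torch.multiprocessing.spawn(
            fn=functools.partial(worker, "dummy_cfg", "dummy_args"),
            args=(10, results),
            nprocs=2,
            join=True,
        )
        print(dict(results))  # {0: 10, 1: 10}
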