Misc fixes (#2786)

Summary:
- Rename type -> key in fairseq/tasks/sentence_prediction.py (fixes https://github.com/pytorch/fairseq/issues/2746)
- Update preprocessing docs (fixes https://github.com/pytorch/fairseq/issues/2565)
- Turn off logging in test_fp16_optimizer.TestGradientScaling
- Documentation updates
- Remove some unused code
- Fix noisychannel example (fixes https://github.com/pytorch/fairseq/issues/2213)

Pull Request resolved: https://github.com/pytorch/fairseq/pull/2786

Reviewed By: shruti-bh

Differential Revision: D24515146

Pulled By: myleott

fbshipit-source-id: 86b0f5516c57610fdca801c60e58158ef052fc3a
Myle Ott authored on 2020-10-27 11:24:58 -07:00, committed by Facebook GitHub Bot
parent 01be083e46
commit 1bc83c703a
13 changed files with 37 additions and 28 deletions

View File

@ -170,13 +170,14 @@ The easiest way to launch jobs is with the `torch.distributed.launch
For example, to train a large English-German Transformer model on 2 nodes each
with 8 GPUs (in total 16 GPUs), run the following command on each node,
replacing ``node_rank=0`` with ``node_rank=1`` on the second node:
replacing ``node_rank=0`` with ``node_rank=1`` on the second node and making
sure to update ``--master_addr`` to the IP address of the first node:
.. code-block:: console
> python -m torch.distributed.launch --nproc_per_node=8 \
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" \
--master_port=1234 \
--master_port=12345 \
$(which fairseq-train) data-bin/wmt16_en_de_bpe32k \
--arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
@ -184,7 +185,15 @@ replacing ``node_rank=0`` with ``node_rank=1`` on the second node:
--lr 0.0005 --min-lr 1e-09 \
--dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--max-tokens 3584 \
--fp16 --distributed-no-spawn
--fp16
On SLURM clusters, fairseq will automatically detect the number of nodes and
GPUs, but a port number must be provided:
.. code-block:: console
> salloc --gpus=16 --nodes 2 (...)
> srun fairseq-train --distributed-port 12345 (...).
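A rough sketch of the kind of detection this relies on (illustrative only, not fairseq's actual implementation; SLURM_PROCID, SLURM_NTASKS and SLURM_LOCALID are standard SLURM environment variables, and the helper name is made up):

import os

def slurm_distributed_setup(port):
    # SLURM already describes the job layout through environment variables,
    # so only the rendezvous port needs to be given on the command line
    rank = int(os.environ["SLURM_PROCID"])        # global rank of this task
    world_size = int(os.environ["SLURM_NTASKS"])  # total number of tasks (one per GPU)
    local_rank = int(os.environ["SLURM_LOCALID"]) # rank of this task on its own node
    return rank, world_size, local_rank, port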
Sharding very large datasets
----------------------------

View File

@ -11,7 +11,7 @@ from fairseq import options
from fairseq.data import dictionary
from fairseq.scoring import bleu
from . import (
from examples.noisychannel import (
rerank_generate,
rerank_options,
rerank_score_bw,

View File

@ -15,7 +15,7 @@ from contextlib import redirect_stdout
from fairseq import options
from fairseq_cli import generate, preprocess
from . import rerank_options, rerank_utils
from examples.noisychannel import rerank_options, rerank_utils
def gen_and_reprocess_nbest(args):

View File

@ -9,7 +9,7 @@ from contextlib import redirect_stdout
from fairseq import options
from fairseq_cli import generate
from . import rerank_options, rerank_utils
from examples.noisychannel import rerank_options, rerank_utils
def score_bw(args):

View File

@ -7,7 +7,7 @@ import os
from fairseq import options
from . import rerank_options, rerank_utils
from examples.noisychannel import rerank_options, rerank_utils
def score_lm(args):

View File

@ -9,7 +9,7 @@ import random
import numpy as np
from fairseq import options
from . import rerank, rerank_options
from examples.noisychannel import rerank, rerank_options
def random_search(args):
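All five of these files get the same one-line change. A minimal sketch (hypothetical module names, not fairseq code) of why the relative form breaks when one of the example scripts is executed directly:

# Layout for illustration: pkg/__init__.py, pkg/helpers.py, and this file pkg/tool.py.
#
# from . import helpers     # resolves only when run as part of the package,
#                            # e.g. `python -m pkg.tool`; run as a plain script it
#                            # raises "attempted relative import with no known parent package"
from pkg import helpers      # works regardless of how the file is invoked, as long
                             # as the directory containing `pkg` is on sys.path
                             # (for fairseq, an editable install provides that)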

View File

@ -276,7 +276,6 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples))
- [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md)
- [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md)
- [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md)
- Finetuning on SQuAD: coming soon
## Pretraining using your own data

View File

@ -400,7 +400,7 @@ class DatasetConfig(FairseqDataclass):
batch_size_valid: Optional[int] = field(
default=None,
metadata={
"help": "batch size of the validation batch" " (defaults to --batch-size)",
"help": "batch size of the validation batch (defaults to --batch-size)",
"argparse_alias": "--max-sentences-valid",
},
)

View File

@ -393,6 +393,9 @@ class RobertaEncoder(FairseqEncoder):
def __init__(self, args, dictionary):
super().__init__(dictionary)
# set any missing default values
base_architecture(args)
self.args = args
if args.encoder_layers_to_keep:
@ -417,7 +420,6 @@ class RobertaEncoder(FairseqEncoder):
q_noise=args.quant_noise_pq,
qn_block_size=args.quant_noise_pq_block_size,
)
args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)
self.lm_head = RobertaLMHead(
embed_dim=args.encoder_embed_dim,
@ -495,6 +497,7 @@ def base_architecture(args):
args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None)
args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)
args.spectral_norm_classification_head = getattr(
args, "spectral_norm_classification_head", False
)
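The underlying idiom, shown with two of the fields that appear above (the helper name fill_defaults is made up and the default values are illustrative; base_architecture does the same thing for the full RoBERTa config): each field is set via getattr(args, name, default), so values the caller already provided are kept and anything missing is filled in.

import argparse

def fill_defaults(args):
    # keep caller-provided values, add defaults for anything missing
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
    args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)

args = argparse.Namespace(encoder_embed_dim=512)  # only one field supplied
fill_defaults(args)
print(args.encoder_embed_dim, args.untie_weights_roberta)  # -> 512 False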

View File

@ -144,7 +144,6 @@ class TransformerEncoderLayer(nn.Module):
residual = x
if self.normalize_before:
x = self.final_layer_norm(x)
x = self.activation_fn(self.fc1(x))
x = self.activation_dropout_module(x)
x = self.fc2(x)
@ -413,11 +412,3 @@ class TransformerDecoderLayer(nn.Module):
def make_generation_fast_(self, need_attn: bool = False, **kwargs):
self.need_attn = need_attn
def Linear(in_features, out_features, bias=True):
m = nn.Linear(in_features, out_features, bias)
nn.init.xavier_uniform_(m.weight)
if bias:
nn.init.constant_(m.bias, 0.0)
return m

View File

@ -249,11 +249,13 @@ def add_preprocess_args(parser):
group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
help="target language")
group.add_argument("--trainpref", metavar="FP", default=None,
help="train file prefix")
help="train file prefix (also used to build dictionaries)")
group.add_argument("--validpref", metavar="FP", default=None,
help="comma separated, valid file prefixes")
help="comma separated, valid file prefixes "
"(words missing from train set are replaced with <unk>)")
group.add_argument("--testpref", metavar="FP", default=None,
help="comma separated, test file prefixes")
help="comma separated, test file prefixes "
"(words missing from train set are replaced with <unk>)")
group.add_argument("--align-suffix", metavar="FP", default=None,
help="alignment file suffix")
group.add_argument("--destdir", metavar="DIR", default="data-bin",

View File

@ -135,11 +135,11 @@ class SentencePredictionTask(LegacyFairseqTask):
def load_dataset(self, split, combine=False, **kwargs):
"""Load a given dataset split (e.g., train, valid, test)."""
def get_path(type, split):
return os.path.join(self.args.data, type, split)
def get_path(key, split):
return os.path.join(self.args.data, key, split)
def make_dataset(type, dictionary):
split_path = get_path(type, split)
def make_dataset(key, dictionary):
split_path = get_path(key, split)
dataset = data_utils.load_indexed_dataset(
split_path,
@ -151,7 +151,7 @@ class SentencePredictionTask(LegacyFairseqTask):
input0 = make_dataset("input0", self.source_dictionary)
assert input0 is not None, "could not find dataset: {}".format(
get_path(type, split)
get_path("input0", split)
)
input1 = make_dataset("input1", self.source_dictionary)
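For context on the rename: inside a helper whose parameter is named type, the name shadows the Python builtin only locally, so a call site outside the helper that writes get_path(type, split) silently passes the builtin class instead of a dataset key. A simplified standalone sketch, not the full task code:

import os

def get_path(key, split):                     # parameter was previously named `type`
    return os.path.join("data-bin", key, split)

# Old failure mode: `get_path(type, split)` outside the helper passed the builtin
# `type`, and os.path.join then raised a TypeError instead of yielding a readable
# "could not find dataset" message.
print(get_path("input0", "train"))            # -> data-bin/input0/train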

View File

@ -5,6 +5,7 @@
import argparse
import copy
import logging
import unittest
import torch
@ -46,6 +47,10 @@ class TestGradientScaling(unittest.TestCase):
},
}
)
logging.disable(logging.CRITICAL)
def tearDown(self):
logging.disable(logging.NOTSET)
def run_iter(self, model, params, optimizer):
optimizer.zero_grad()
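The setUp/tearDown pair above is the standard way to silence log output for the duration of each test; the same pattern in isolation (a generic sketch, independent of fairseq):

import logging
import unittest

class QuietLoggingTest(unittest.TestCase):
    def setUp(self):
        # drop every record at CRITICAL severity and below while the test runs
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        # restore normal logging so later tests (and their output) are unaffected
        logging.disable(logging.NOTSET)

    def test_noisy_code(self):
        logging.getLogger(__name__).warning("this message is suppressed")
        self.assertTrue(True)

if __name__ == "__main__":
    unittest.main()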