Misc fixes (#2786)

Summary:
- Rename type -> key in fairseq/tasks/sentence_prediction.py (fixes https://github.com/pytorch/fairseq/issues/2746)
- Update preprocessing docs (fixes https://github.com/pytorch/fairseq/issues/2565)
- Turn off logging in test_fp16_optimizer.TestGradientScaling
- Documentation updates
- Remove some unused code
- Fix noisychannel example (fixes https://github.com/pytorch/fairseq/issues/2213)

Pull Request resolved: https://github.com/pytorch/fairseq/pull/2786

Reviewed By: shruti-bh

Differential Revision: D24515146

Pulled By: myleott

fbshipit-source-id: 86b0f5516c57610fdca801c60e58158ef052fc3a
Myle Ott authored on 2020-10-27 11:24:58 -07:00, committed by Facebook GitHub Bot
parent 01be083e46
commit 1bc83c703a
13 changed files with 37 additions and 28 deletions

View File

@ -170,13 +170,14 @@ The easiest way to launch jobs is with the `torch.distributed.launch
For example, to train a large English-German Transformer model on 2 nodes each
with 8 GPUs (in total 16 GPUs), run the following command on each node,
replacing ``node_rank=0`` with ``node_rank=1`` on the second node:
replacing ``node_rank=0`` with ``node_rank=1`` on the second node and making
sure to update ``--master_addr`` to the IP address of the first node:
.. code-block:: console
> python -m torch.distributed.launch --nproc_per_node=8 \
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" \
--master_port=1234 \
--master_port=12345 \
$(which fairseq-train) data-bin/wmt16_en_de_bpe32k \
--arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
@ -184,7 +185,15 @@ replacing ``node_rank=0`` with ``node_rank=1`` on the second node:
--lr 0.0005 --min-lr 1e-09 \
--dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--max-tokens 3584 \
--fp16 --distributed-no-spawn
--fp16
On SLURM clusters, fairseq will automatically detect the number of nodes and
GPUs, but a port number must be provided:
.. code-block:: console
> salloc --gpus=16 --nodes 2 (...)
> srun fairseq-train --distributed-port 12345 (...).
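A rough sketch of the kind of detection this relies on (illustrative only, not fairseq's actual implementation; SLURM_PROCID, SLURM_NTASKS and SLURM_LOCALID are standard SLURM environment variables, and the helper name is made up):

import os

def slurm_distributed_setup(port):
    # SLURM already describes the job layout through environment variables,
    # so only the rendezvous port needs to be given on the command line
    rank = int(os.environ["SLURM_PROCID"])        # global rank of this task
    world_size = int(os.environ["SLURM_NTASKS"])  # total number of tasks (one per GPU)
    local_rank = int(os.environ["SLURM_LOCALID"]) # rank of this task on its own node
    return rank, world_size, local_rank, port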
Sharding very large datasets
----------------------------

View File

@ -11,7 +11,7 @@ from fairseq import options
from fairseq.data import dictionary
from fairseq.scoring import bleu
from . import (
from examples.noisychannel import (
rerank_generate,
rerank_options,
rerank_score_bw,

View File

@ -15,7 +15,7 @@ from contextlib import redirect_stdout
from fairseq import options
from fairseq_cli import generate, preprocess
from . import rerank_options, rerank_utils
from examples.noisychannel import rerank_options, rerank_utils
def gen_and_reprocess_nbest(args):

View File

@ -9,7 +9,7 @@ from contextlib import redirect_stdout
from fairseq import options
from fairseq_cli import generate
from . import rerank_options, rerank_utils
from examples.noisychannel import rerank_options, rerank_utils
def score_bw(args):

View File

@ -7,7 +7,7 @@ import os
from fairseq import options
from . import rerank_options, rerank_utils
from examples.noisychannel import rerank_options, rerank_utils
def score_lm(args):

View File

@ -9,7 +9,7 @@ import random
import numpy as np
from fairseq import options
from . import rerank, rerank_options
from examples.noisychannel import rerank, rerank_options
def random_search(args):
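All five of these files get the same one-line change. A minimal sketch (hypothetical module names, not fairseq code) of why the relative form breaks when one of the example scripts is executed directly:

# Layout for illustration: pkg/__init__.py, pkg/helpers.py, and this file pkg/tool.py.
#
# from . import helpers     # resolves only when run as part of the package,
#                            # e.g. `python -m pkg.tool`; run as a plain script it
#                            # raises "attempted relative import with no known parent package"
from pkg import helpers      # works regardless of how the file is invoked, as long
                             # as the directory containing `pkg` is on sys.path
                             # (for fairseq, an editable install provides that)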

View File

@ -276,7 +276,6 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples))
- [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md)
- [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md)
- [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md)
- Finetuning on SQuAD: coming soon
## Pretraining using your own data

View File

@ -400,7 +400,7 @@ class DatasetConfig(FairseqDataclass):
batch_size_valid: Optional[int] = field(
default=None,
metadata={
"help": "batch size of the validation batch" " (defaults to --batch-size)",
"help": "batch size of the validation batch (defaults to --batch-size)",
"argparse_alias": "--max-sentences-valid",
},
)

View File

@ -393,6 +393,9 @@ class RobertaEncoder(FairseqEncoder):
def __init__(self, args, dictionary):
super().__init__(dictionary)
# set any missing default values
base_architecture(args)
self.args = args
if args.encoder_layers_to_keep:
@ -417,7 +420,6 @@ class RobertaEncoder(FairseqEncoder):
q_noise=args.quant_noise_pq,
qn_block_size=args.quant_noise_pq_block_size,
)
args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)
self.lm_head = RobertaLMHead(
embed_dim=args.encoder_embed_dim,
@ -495,6 +497,7 @@ def base_architecture(args):
args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None)
args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)
args.spectral_norm_classification_head = getattr(
args, "spectral_norm_classification_head", False
)
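The underlying idiom, shown with two of the fields that appear above (the helper name fill_defaults is made up and the default values are illustrative; base_architecture does the same thing for the full RoBERTa config): each field is set via getattr(args, name, default), so values the caller already provided are kept and anything missing is filled in.

import argparse

def fill_defaults(args):
    # keep caller-provided values, add defaults for anything missing
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
    args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)

args = argparse.Namespace(encoder_embed_dim=512)  # only one field supplied
fill_defaults(args)
print(args.encoder_embed_dim, args.untie_weights_roberta)  # -> 512 False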

View File

@ -144,7 +144,6 @@ class TransformerEncoderLayer(nn.Module):
residual = x
if self.normalize_before:
x = self.final_layer_norm(x)
x = self.activation_fn(self.fc1(x))
x = self.activation_dropout_module(x)
x = self.fc2(x)
@ -413,11 +412,3 @@ class TransformerDecoderLayer(nn.Module):
def make_generation_fast_(self, need_attn: bool = False, **kwargs):
self.need_attn = need_attn
def Linear(in_features, out_features, bias=True):
m = nn.Linear(in_features, out_features, bias)
nn.init.xavier_uniform_(m.weight)
if bias:
nn.init.constant_(m.bias, 0.0)
return m

View File

@ -249,11 +249,13 @@ def add_preprocess_args(parser):
group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
help="target language")
group.add_argument("--trainpref", metavar="FP", default=None,
help="train file prefix")
help="train file prefix (also used to build dictionaries)")
group.add_argument("--validpref", metavar="FP", default=None,
help="comma separated, valid file prefixes")
help="comma separated, valid file prefixes "
"(words missing from train set are replaced with <unk>)")
group.add_argument("--testpref", metavar="FP", default=None,
help="comma separated, test file prefixes")
help="comma separated, test file prefixes "
"(words missing from train set are replaced with <unk>)")
group.add_argument("--align-suffix", metavar="FP", default=None,
help="alignment file suffix")
group.add_argument("--destdir", metavar="DIR", default="data-bin",

View File

@ -135,11 +135,11 @@ class SentencePredictionTask(LegacyFairseqTask):
def load_dataset(self, split, combine=False, **kwargs):
"""Load a given dataset split (e.g., train, valid, test)."""
def get_path(type, split):
return os.path.join(self.args.data, type, split)
def get_path(key, split):
return os.path.join(self.args.data, key, split)
def make_dataset(type, dictionary):
split_path = get_path(type, split)
def make_dataset(key, dictionary):
split_path = get_path(key, split)
dataset = data_utils.load_indexed_dataset(
split_path,
@ -151,7 +151,7 @@ class SentencePredictionTask(LegacyFairseqTask):
input0 = make_dataset("input0", self.source_dictionary)
assert input0 is not None, "could not find dataset: {}".format(
get_path(type, split)
get_path("input0", split)
)
input1 = make_dataset("input1", self.source_dictionary)
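For context on the rename: inside a helper whose parameter is named type, the name shadows the Python builtin only locally, so a call site outside the helper that writes get_path(type, split) silently passes the builtin class instead of a dataset key. A simplified standalone sketch, not the full task code:

import os

def get_path(key, split):                     # parameter was previously named `type`
    return os.path.join("data-bin", key, split)

# Old failure mode: `get_path(type, split)` outside the helper passed the builtin
# `type`, and os.path.join then raised a TypeError instead of yielding a readable
# "could not find dataset" message.
print(get_path("input0", "train"))            # -> data-bin/input0/train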

View File

@ -5,6 +5,7 @@
import argparse
import copy
import logging
import unittest
import torch
@ -46,6 +47,10 @@ class TestGradientScaling(unittest.TestCase):
},
}
)
logging.disable(logging.CRITICAL)
def tearDown(self):
logging.disable(logging.NOTSET)
def run_iter(self, model, params, optimizer):
optimizer.zero_grad()
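The setUp/tearDown pair above is the standard way to silence log output for the duration of each test; the same pattern in isolation (a generic sketch, independent of fairseq):

import logging
import unittest

class QuietLoggingTest(unittest.TestCase):
    def setUp(self):
        # drop every record at CRITICAL severity and below while the test runs
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        # restore normal logging so later tests (and their output) are unaffected
        logging.disable(logging.NOTSET)

    def test_noisy_code(self):
        logging.getLogger(__name__).warning("this message is suppressed")
        self.assertTrue(True)

if __name__ == "__main__":
    unittest.main()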