From f2563c21e15e0991b87eb49398234e2f4e809d88 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Wed, 8 May 2019 08:57:56 -0700
Subject: [PATCH] Cleanup LM + Flake8

Summary:
Pull Request resolved: https://github.com/pytorch/fairseq/pull/720

Differential Revision: D15259091

Pulled By: myleott

fbshipit-source-id: 06a35996c06ccddb49fdc9e01e348ff3c9da334e
---
 fairseq/data/__init__.py                      |   1 +
 fairseq/data/data_utils.py                    |   3 +-
 fairseq/data/dictionary.py                    |   2 +
 fairseq/data/iterators.py                     |   2 -
 fairseq/data/language_pair_dataset.py         |   2 -
 fairseq/data/masked_lm_dataset.py             |   2 +-
 fairseq/models/__init__.py                    |  32 ++-
 fairseq/models/composite_encoder.py           |   2 +-
 fairseq/models/distributed_fairseq_model.py   |   5 +-
 fairseq/models/fairseq_incremental_decoder.py |   4 +-
 fairseq/models/fairseq_model.py               |   3 +-
 fairseq/models/fconv.py                       | 106 +-------
 fairseq/models/fconv_lm.py                    | 106 ++++++++
 fairseq/models/fconv_self_att.py              |  20 +-
 fairseq/models/lightconv.py                   | 174 +------------
 fairseq/models/lightconv_lm.py                | 173 +++++++++++++
 fairseq/models/lstm.py                        |  11 +-
 fairseq/models/masked_lm.py                   |   7 +-
 fairseq/models/multilingual_transformer.py    |  12 +-
 fairseq/models/transformer.py                 | 187 +-------------
 .../models/transformer_from_pretrained_xlm.py |   3 +-
 fairseq/models/transformer_lm.py              | 241 ++++++++++++++++++
 fairseq/modules/dynamic_convolution.py        |   4 +-
 fairseq/modules/highway.py                    |   3 +-
 fairseq/modules/lightweight_convolution.py    |   6 +-
 .../transformer_sentence_encoder_layer.py     |  10 +-
 fairseq/optim/__init__.py                     |   7 +
 fairseq/tasks/cross_lingual_lm.py             |   1 -
 fairseq/tasks/language_modeling.py            |   2 +-
 fairseq/tasks/multilingual_translation.py     |   2 -
 fairseq/tasks/translation.py                  |   7 +-
 fairseq/tasks/translation_moe.py              |   4 +-
 interactive.py                                |   3 +-
 train.py                                      |   3 +-
 34 files changed, 649 insertions(+), 501 deletions(-)
 create mode 100644 fairseq/models/fconv_lm.py
 create mode 100644 fairseq/models/lightconv_lm.py
 create mode 100644 fairseq/models/transformer_lm.py

diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py
index c05e03517..e0d433677 100644
--- a/fairseq/data/__init__.py
+++ b/fairseq/data/__init__.py
@@ -47,4 +47,5 @@ __all__ = [
     'TokenBlockDataset',
     'TransformEosDataset',
     'TransformEosLangPairDataset',
+    'TruncatedDictionary',
 ]
diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py
index 5866f6396..ecc4fe123 100644
--- a/fairseq/data/data_utils.py
+++ b/fairseq/data/data_utils.py
@@ -10,6 +10,7 @@ import os
 import numpy as np
 from collections import Iterable
 
+
 def infer_language_pair(path):
     """Infer language pair from filename: <split>.<lang1>-<lang2>.(...).idx"""
     src, dst = None, None
@@ -182,7 +183,7 @@ def batch_by_size(
 
 def process_bpe_symbol(sentence: str, bpe_symbol: str):
     if bpe_symbol == 'sentencepiece':
-        sentence = sentence.replace(' ','').replace('\u2581', ' ').strip()
+        sentence = sentence.replace(' ', '').replace('\u2581', ' ').strip()
     elif bpe_symbol is not None:
         sentence = (sentence + ' ').replace(bpe_symbol, '').rstrip()
     return sentence
diff --git a/fairseq/data/dictionary.py b/fairseq/data/dictionary.py
index 19fa644aa..254013f9a 100644
--- a/fairseq/data/dictionary.py
+++ b/fairseq/data/dictionary.py
@@ -18,6 +18,7 @@ from fairseq.data import data_utils
 
 class Dictionary(object):
     """A mapping from symbols to consecutive integers"""
+
     def __init__(self, pad='<pad>', eos='</s>', unk='<unk>', bos='<s>'):
         self.unk_word, self.pad_word, self.eos_word = unk, pad, eos
         self.symbols = []
@@ -282,6 +283,7 @@ class Dictionary(object):
         else:
             merge_result(Dictionary._add_file_to_dictionary_single_worker(filename, tokenize, dict.eos_word))
 
+
 class TruncatedDictionary(object):
 
     def __init__(self, wrapped_dict, length):
diff --git a/fairseq/data/iterators.py b/fairseq/data/iterators.py
index 060dc6eb7..24c896e46 100644
--- a/fairseq/data/iterators.py
+++ b/fairseq/data/iterators.py
@@ -7,8 +7,6 @@
 
 import itertools
 import math
-import queue
-import threading
 
 import numpy as np
 import torch
diff --git a/fairseq/data/language_pair_dataset.py b/fairseq/data/language_pair_dataset.py
index f8afed895..351ba1e8f 100644
--- a/fairseq/data/language_pair_dataset.py
+++ b/fairseq/data/language_pair_dataset.py
@@ -8,8 +8,6 @@
 import numpy as np
 import torch
 
-from fairseq import utils
-
 from . import data_utils, FairseqDataset
diff --git a/fairseq/data/masked_lm_dataset.py b/fairseq/data/masked_lm_dataset.py
index 865e0b17e..c2ec965b3 100644
--- a/fairseq/data/masked_lm_dataset.py
+++ b/fairseq/data/masked_lm_dataset.py
@@ -10,7 +10,7 @@ import math
 
 import numpy as np
 import torch
 
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Tuple
 
 from . import FairseqDataset, data_utils
diff --git a/fairseq/models/__init__.py b/fairseq/models/__init__.py
index 812ef5cd5..abe7f3025 100644
--- a/fairseq/models/__init__.py
+++ b/fairseq/models/__init__.py
@@ -9,19 +9,33 @@ import argparse
 import importlib
 import os
 
-from .fairseq_decoder import FairseqDecoder  # noqa: F401
-from .fairseq_encoder import FairseqEncoder  # noqa: F401
-from .fairseq_incremental_decoder import FairseqIncrementalDecoder  # noqa: F401
+from .fairseq_decoder import FairseqDecoder
+from .fairseq_encoder import FairseqEncoder
+from .fairseq_incremental_decoder import FairseqIncrementalDecoder
 from .fairseq_model import (
     BaseFairseqModel,
-    FairseqModel,  # noqa: F401
-    FairseqMultiModel,  # noqa: F401
-    FairseqLanguageModel,  # noqa: F401
-    FairseqEncoderModel,  # noqa: F401
+    FairseqModel,
+    FairseqMultiModel,
+    FairseqLanguageModel,
+    FairseqEncoderModel,
 )
 
-from .composite_encoder import CompositeEncoder  # noqa: F401
-from .distributed_fairseq_model import DistributedFairseqModel  # noqa: F401
+from .composite_encoder import CompositeEncoder
+from .distributed_fairseq_model import DistributedFairseqModel
+
+
+__all__ = [
+    'BaseFairseqModel',
+    'CompositeEncoder',
+    'DistributedFairseqModel',
+    'FairseqDecoder',
+    'FairseqEncoder',
+    'FairseqEncoderModel',
+    'FairseqIncrementalDecoder',
+    'FairseqLanguageModel',
+    'FairseqModel',
+    'FairseqMultiModel',
+]
 
 
 MODEL_REGISTRY = {}
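A quick illustration of the registry pattern that the new fconv_lm.py, lightconv_lm.py and transformer_lm.py modules below rely on (this sketch is not part of the patch; 'my_toy_lm' and MyToyLanguageModel are invented names):

from fairseq.models import (
    FairseqLanguageModel,
    register_model,
    register_model_architecture,
)


@register_model('my_toy_lm')  # binds the '--arch'/'--task' machinery to this class
class MyToyLanguageModel(FairseqLanguageModel):

    @classmethod
    def build_model(cls, args, task):
        raise NotImplementedError  # a real model constructs and returns its decoder here


@register_model_architecture('my_toy_lm', 'my_toy_lm_big')
def my_toy_lm_big(args):
    # a named preset only fills in defaults; explicit CLI flags still win
    args.decoder_layers = getattr(args, 'decoder_layers', 24)

Because registration happens at import time, moving a registered class to a different module (as this patch does) changes nothing for users of the command-line tools.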
diff --git a/fairseq/models/composite_encoder.py b/fairseq/models/composite_encoder.py
index 0e7d941f6..d6859c7cb 100644
--- a/fairseq/models/composite_encoder.py
+++ b/fairseq/models/composite_encoder.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree. An additional grant of patent rights
 # can be found in the PATENTS file in the same directory.
 
-from . import FairseqEncoder
+from fairseq.models import FairseqEncoder
 
 
 class CompositeEncoder(FairseqEncoder):
diff --git a/fairseq/models/distributed_fairseq_model.py b/fairseq/models/distributed_fairseq_model.py
index 360d468b2..25759d391 100644
--- a/fairseq/models/distributed_fairseq_model.py
+++ b/fairseq/models/distributed_fairseq_model.py
@@ -6,14 +6,11 @@
 # can be found in the PATENTS file in the same directory.
 
 import inspect
-import socket
 
 from torch.nn import parallel
 
-from fairseq import distributed_utils
 from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel
-
-from . import BaseFairseqModel
+from fairseq.models import BaseFairseqModel
 
 
 def DistributedFairseqModel(args, model):
diff --git a/fairseq/models/fairseq_incremental_decoder.py b/fairseq/models/fairseq_incremental_decoder.py
index ffec5c149..cb59486da 100644
--- a/fairseq/models/fairseq_incremental_decoder.py
+++ b/fairseq/models/fairseq_incremental_decoder.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree. An additional grant of patent rights
 # can be found in the PATENTS file in the same directory.
 
-from . import FairseqDecoder
+from fairseq.models import FairseqDecoder
 
 
 class FairseqIncrementalDecoder(FairseqDecoder):
@@ -25,7 +25,7 @@ class FairseqIncrementalDecoder(FairseqDecoder):
     The :class:`FairseqIncrementalDecoder` interface also defines the
     :func:`reorder_incremental_state` method, which is used during beam search
     to select and reorder the incremental state based on the selection of beams.
-    
+
     To learn more about how incremental decoding works, refer to `this blog
     `_.
     """
diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py
index a4bc59b2f..53edb3c4c 100644
--- a/fairseq/models/fairseq_model.py
+++ b/fairseq/models/fairseq_model.py
@@ -4,14 +4,15 @@
 # This source code is licensed under the license found in the LICENSE file in
 # the root directory of this source tree. An additional grant of patent rights
 # can be found in the PATENTS file in the same directory.
+
 from typing import Dict, List, Optional
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from . import FairseqDecoder, FairseqEncoder
 from fairseq.data import Dictionary
+from fairseq.models import FairseqDecoder, FairseqEncoder
 
 
 class BaseFairseqModel(nn.Module):
diff --git a/fairseq/models/fconv.py b/fairseq/models/fconv.py
index afe42a67a..50cfd0050 100644
--- a/fairseq/models/fconv.py
+++ b/fairseq/models/fconv.py
@@ -10,17 +10,19 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from fairseq import options, utils
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqModel,
+    register_model,
+    register_model_architecture,
+)
 from fairseq.modules import (
     AdaptiveSoftmax, BeamableMM, GradMultiply, LearnedPositionalEmbedding,
     LinearizedConvolution,
 )
-from . import (
-    FairseqEncoder, FairseqIncrementalDecoder, FairseqModel,
-    FairseqLanguageModel, register_model, register_model_architecture,
-)
-
 
 @register_model('fconv')
 class FConvModel(FairseqModel):
@@ -111,58 +113,6 @@ class FConvModel(FairseqModel):
         return FConvModel(encoder, decoder)
 
 
-@register_model('fconv_lm')
-class FConvLanguageModel(FairseqLanguageModel):
-    def __init__(self, decoder):
-        super().__init__(decoder)
-
-    @staticmethod
-    def add_args(parser):
-        """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', type=float, metavar='D',
-                            help='dropout probability')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension')
-        parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
-                            help='decoder layers [(dim, kernel_size), ...]')
-        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
-                            help='decoder output embedding dimension')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive softmax cutoff points. '
-                                 'Must be used with adaptive_loss criterion')
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
-                            help='sets adaptive softmax dropout for the tail projections')
-        parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
-                            help='decoder attention [True, ...]')
-
-    @classmethod
-    def build_model(cls, args, task):
-        """Build a new model instance."""
-        # make sure all arguments are present in older models
-        base_lm_architecture(args)
-
-        if hasattr(args, 'max_target_positions') and not hasattr(args, 'tokens_per_sample'):
-            args.tokens_per_sample = args.max_target_positions
-
-        decoder = FConvDecoder(
-            dictionary=task.target_dictionary,
-            embed_dim=args.decoder_embed_dim,
-            convolutions=eval(args.decoder_layers),
-            out_embed_dim=args.decoder_embed_dim,
-            attention=eval(args.decoder_attention),
-            dropout=args.dropout,
-            max_positions=args.tokens_per_sample,
-            share_embed=False,
-            positional_embeddings=False,
-            adaptive_softmax_cutoff=(
-                options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
-                if args.criterion == 'adaptive_loss' else None
-            ),
-            adaptive_softmax_dropout=args.adaptive_softmax_dropout,
-        )
-        return FConvLanguageModel(decoder)
-
-
 class FConvEncoder(FairseqEncoder):
     """
     Convolutional encoder consisting of `len(convolutions)` layers.
@@ -643,46 +593,6 @@ def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
     return nn.utils.weight_norm(m, dim=2)
 
 
-@register_model_architecture('fconv_lm', 'fconv_lm')
-def base_lm_architecture(args):
-    args.dropout = getattr(args, 'dropout', 0.1)
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
-    args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
-    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
-    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
-
-
-@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_wikitext103')
-def fconv_lm_dauphin_wikitext103(args):
-    layers = '[(850, 6)] * 3'
-    layers += ' + [(850, 1)] * 1'
-    layers += ' + [(850, 5)] * 4'
-    layers += ' + [(850, 1)] * 1'
-    layers += ' + [(850, 4)] * 3'
-    layers += ' + [(1024, 4)] * 1'
-    layers += ' + [(2048, 4)] * 1'
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 280)
-    args.decoder_layers = getattr(args, 'decoder_layers', layers)
-    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,20000,200000')
-    base_lm_architecture(args)
-
-
-@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_gbw')
-def fconv_lm_dauphin_gbw(args):
-    layers = '[(512, 5)]'
-    layers += ' + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3'
-    layers += ' + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3'
-    layers += ' + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6'
-    layers += ' + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]'
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
-    args.decoder_layers = getattr(args, 'decoder_layers', layers)
-    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,50000,200000')
-    base_lm_architecture(args)
-
-
 @register_model_architecture('fconv', 'fconv')
 def base_architecture(args):
     args.dropout = getattr(args, 'dropout', 0.1)
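All of the *_architecture() functions in this patch use the same back-compat idiom: getattr(args, name, default) only fills a value in when the flag is absent, so a named preset never overrides what the user passed explicitly. A minimal illustration (not part of the patch):

import argparse

args = argparse.Namespace(dropout=0.3)        # user already set --dropout
args.dropout = getattr(args, 'dropout', 0.1)  # stays 0.3
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)  # filled in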
diff --git a/fairseq/models/fconv_lm.py b/fairseq/models/fconv_lm.py
new file mode 100644
index 000000000..ef53bf8bc
--- /dev/null
+++ b/fairseq/models/fconv_lm.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+from fairseq import options
+from fairseq.models import (
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.fconv import FConvDecoder
+
+
+@register_model('fconv_lm')
+class FConvLanguageModel(FairseqLanguageModel):
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
+                            help='decoder layers [(dim, kernel_size), ...]')
+        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
+                            help='decoder output embedding dimension')
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion')
+        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+                            help='sets adaptive softmax dropout for the tail projections')
+        parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
+                            help='decoder attention [True, ...]')
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        # make sure all arguments are present in older models
+        base_lm_architecture(args)
+
+        if hasattr(args, 'max_target_positions') and not hasattr(args, 'tokens_per_sample'):
+            args.tokens_per_sample = args.max_target_positions
+
+        decoder = FConvDecoder(
+            dictionary=task.target_dictionary,
+            embed_dim=args.decoder_embed_dim,
+            convolutions=eval(args.decoder_layers),
+            out_embed_dim=args.decoder_embed_dim,
+            attention=eval(args.decoder_attention),
+            dropout=args.dropout,
+            max_positions=args.tokens_per_sample,
+            share_embed=False,
+            positional_embeddings=False,
+            adaptive_softmax_cutoff=(
+                options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
+                if args.criterion == 'adaptive_loss' else None
+            ),
+            adaptive_softmax_dropout=args.adaptive_softmax_dropout,
+        )
+        return FConvLanguageModel(decoder)
+
+
+@register_model_architecture('fconv_lm', 'fconv_lm')
+def base_lm_architecture(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
+    args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
+    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
+    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
+    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
+
+
+@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_wikitext103')
+def fconv_lm_dauphin_wikitext103(args):
+    layers = '[(850, 6)] * 3'
+    layers += ' + [(850, 1)] * 1'
+    layers += ' + [(850, 5)] * 4'
+    layers += ' + [(850, 1)] * 1'
+    layers += ' + [(850, 4)] * 3'
+    layers += ' + [(1024, 4)] * 1'
+    layers += ' + [(2048, 4)] * 1'
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 280)
+    args.decoder_layers = getattr(args, 'decoder_layers', layers)
+    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
+    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,20000,200000')
+    base_lm_architecture(args)
+
+
+@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_gbw')
+def fconv_lm_dauphin_gbw(args):
+    layers = '[(512, 5)]'
+    layers += ' + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3'
+    layers += ' + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3'
+    layers += ' + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6'
+    layers += ' + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]'
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
+    args.decoder_layers = getattr(args, 'decoder_layers', layers)
+    args.decoder_attention = getattr(args, 'decoder_attention', 'False')
+    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,50000,200000')
+    base_lm_architecture(args)
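For fconv_lm the '--decoder-layers' flag is a Python expression rather than a plain list; FConvLanguageModel.build_model runs it through eval() to obtain (out_channels, kernel_size) tuples. Illustrative, using the wikitext103 preset above:

layers = '[(850, 6)] * 3 + [(850, 1)] * 1'
print(eval(layers))  # [(850, 6), (850, 6), (850, 6), (850, 1)]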
diff --git a/fairseq/models/fconv_self_att.py b/fairseq/models/fconv_self_att.py
index 602706e0e..64431151d 100644
--- a/fairseq/models/fconv_self_att.py
+++ b/fairseq/models/fconv_self_att.py
@@ -12,14 +12,20 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from fairseq import checkpoint_utils
-from fairseq.modules import (
-    DownsampledMultiHeadAttention, GradMultiply, LayerNorm,
-    LearnedPositionalEmbedding, LinearizedConvolution,
+from fairseq.models import (
+    CompositeEncoder,
+    FairseqDecoder,
+    FairseqEncoder,
+    FairseqModel,
+    register_model,
+    register_model_architecture,
 )
-
-from . import (
-    FairseqEncoder, CompositeEncoder, FairseqDecoder, FairseqModel,
-    register_model, register_model_architecture,
+from fairseq.modules import (
+    DownsampledMultiHeadAttention,
+    GradMultiply,
+    LayerNorm,
+    LearnedPositionalEmbedding,
+    LinearizedConvolution,
 )
diff --git a/fairseq/models/lightconv.py b/fairseq/models/lightconv.py
index 0205f15ff..1303608bc 100644
--- a/fairseq/models/lightconv.py
+++ b/fairseq/models/lightconv.py
@@ -12,15 +12,21 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from fairseq import options, utils
-from fairseq.modules import (
-    AdaptiveInput, AdaptiveSoftmax, CharacterTokenEmbedder, LayerNorm,
-    LearnedPositionalEmbedding, MultiheadAttention, SinusoidalPositionalEmbedding,
-    DynamicConv1dTBC, LightweightConv1dTBC,
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqModel,
+    register_model,
+    register_model_architecture,
 )
-
-from . import (
-    FairseqIncrementalDecoder, FairseqEncoder, FairseqLanguageModel,
-    FairseqModel, register_model, register_model_architecture,
+from fairseq.modules import (
+    AdaptiveSoftmax,
+    DynamicConv1dTBC,
+    LayerNorm,
+    LearnedPositionalEmbedding,
+    LightweightConv1dTBC,
+    MultiheadAttention,
+    SinusoidalPositionalEmbedding,
 )
@@ -171,117 +177,6 @@ class LightConvModel(FairseqModel):
         return LightConvModel(encoder, decoder)
 
 
-@register_model('lightconv_lm')
-class LightConvLanguageModel(FairseqLanguageModel):
-    def __init__(self, decoder):
-        super().__init__(decoder)
-
-    @staticmethod
-    def add_args(parser):
-        """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
-                            help='dropout probability')
-        parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
-                            help='dropout probability for attention weights')
-        parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
-                            help='dropout probability after ReLU in FFN')
-        parser.add_argument('--input-dropout', type=float, metavar='D',
-                            help='dropout probability of the inputs')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension')
-        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
-                            help='decoder output dimension')
-        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
-                            help='decoder input dimension')
-        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension for FFN')
-        parser.add_argument('--decoder-layers', type=int, metavar='N',
-                            help='num decoder layers')
-        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
-                            help='num decoder attention heads or LightConv/DynamicConv heads')
-        parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
-                            help='apply layernorm before each decoder block')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive softmax cutoff points. '
-                                 'Must be used with adaptive_loss criterion')
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
-                            help='sets adaptive softmax dropout for the tail projections')
-        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
-                            help='adaptive input factor')
-        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
-                            help='if set, disables positional embeddings (outside self attention)')
-        parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
-                            help='share decoder input and output embeddings')
-        parser.add_argument('--character-embeddings', default=False, action='store_true',
-                            help='if set, uses character embedding convolutions to produce token embeddings')
-        parser.add_argument('--character-filters', type=str, metavar='LIST',
-                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
-                            help='size of character embeddings')
-        parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
-                            help='size of character embeddings')
-        parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
-                            help='number of highway layers for character token embeddder')
-        parser.add_argument('--adaptive-input', default=False, action='store_true',
-                            help='if set, uses adaptive input')
-        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
-                            help='adaptive input factor')
-        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive input cutoff points.')
-        parser.add_argument('--tie-adaptive-weights', action='store_true',
-                            help='if set, ties the weights of adaptive softmax and adaptive input')
-        parser.add_argument('--tie-adaptive-proj', action='store_true',
-                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
-        parser.add_argument('--decoder-learned-pos', action='store_true',
-                            help='use learned positional embeddings in the decoder')
-
-        """LightConv and DynamicConv arguments"""
-        parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
-                            help='list of kernel size (default: "[3,7,15,31,31,31]")')
-        parser.add_argument('--decoder-glu', type=options.eval_bool,
-                            help='glu after in proj')
-        parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
-                            choices=['dynamic', 'lightweight'],
-                            help='type of convolution')
-        parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
-        parser.add_argument('--weight-dropout', type=float, metavar='D',
-                            help='dropout probability for conv weights')
-
-    @classmethod
-    def build_model(cls, args, task):
-        """Build a new model instance."""
-
-        # make sure all arguments are present in older models
-        base_lm_architecture(args)
-
-        if not hasattr(args, 'max_source_positions'):
-            args.max_source_positions = args.tokens_per_sample
-        if not hasattr(args, 'max_target_positions'):
-            args.max_target_positions = args.tokens_per_sample
-
-        if args.character_embeddings:
-            embed_tokens = CharacterTokenEmbedder(task.dictionary, eval(args.character_filters),
-                                                  args.character_embedding_dim,
-                                                  args.decoder_embed_dim,
-                                                  args.char_embedder_highway_layers,
-                                                  )
-        elif args.adaptive_input:
-            embed_tokens = AdaptiveInput(len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
-                                         args.adaptive_input_factor, args.decoder_embed_dim,
-                                         options.eval_str_list(args.adaptive_input_cutoff, type=int))
-        else:
-            embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
-
-        if args.tie_adaptive_weights:
-            assert args.adaptive_input
-            assert args.adaptive_input_factor == args.adaptive_softmax_factor
-            assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
-                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
-            assert args.decoder_input_dim == args.decoder_output_dim
-
-        decoder = LightConvDecoder(args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False)
-        return LightConvLanguageModel(decoder)
-
-
 class LightConvEncoder(FairseqEncoder):
     """
     LightConv encoder consisting of *args.encoder_layers* layers. Each layer
@@ -786,47 +681,6 @@ def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, learned=False):
     return m
 
 
-@register_model_architecture('lightconv_lm', 'lightconv_lm')
-def base_lm_architecture(args):
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
-    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
-    args.decoder_layers = getattr(args, 'decoder_layers', 6)
-    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
-    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
-    args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
-    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
-
-    args.character_embeddings = getattr(args, 'character_embeddings', False)
-
-    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
-    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
-
-    # The model training is not stable without this
-    args.decoder_normalize_before = True
-
-    args.adaptive_input = getattr(args, 'adaptive_input', False)
-    args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
-    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
-
-    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
-    args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
-
-    args.decoder_kernel_size_list = getattr(args, 'decoder_kernel_size_list', [3, 7, 15, 31, 31, 31])
-    if len(args.decoder_kernel_size_list) == 1:
-        args.decoder_kernel_size_list = args.decoder_kernel_size_list * args.decoder_layers
-
-
-@register_model_architecture('lightconv_lm', 'lightconv_lm_gbw')
-def lightconv_lm_gbw(args):
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
-    args.dropout = getattr(args, 'dropout', 0.1)
-    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
-    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
-    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
-    base_lm_architecture(args)
-
-
 @register_model_architecture('lightconv', 'lightconv')
 def base_architecture(args):
     args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
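Several of the flags above use callables from fairseq.options as their argparse type, so structured values arrive parsed straight off the command line. A minimal illustration (not part of the patch):

from fairseq import options

options.eval_str_list('[3,7,15]', int)  # -> [3, 7, 15]
options.eval_bool('True')               # -> True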
diff --git a/fairseq/models/lightconv_lm.py b/fairseq/models/lightconv_lm.py
new file mode 100644
index 000000000..801730498
--- /dev/null
+++ b/fairseq/models/lightconv_lm.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+from fairseq import options
+from fairseq.models import (
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.lightconv import (
+    Embedding,
+    LightConvDecoder,
+)
+from fairseq.modules import (
+    AdaptiveInput,
+    CharacterTokenEmbedder,
+)
+
+
+@register_model('lightconv_lm')
+class LightConvLanguageModel(FairseqLanguageModel):
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
+                            help='dropout probability for attention weights')
+        parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
+                            help='dropout probability after ReLU in FFN')
+        parser.add_argument('--input-dropout', type=float, metavar='D',
+                            help='dropout probability of the inputs')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+                            help='decoder output dimension')
+        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
+                            help='decoder input dimension')
+        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension for FFN')
+        parser.add_argument('--decoder-layers', type=int, metavar='N',
+                            help='num decoder layers')
+        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+                            help='num decoder attention heads or LightConv/DynamicConv heads')
+        parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
+                            help='apply layernorm before each decoder block')
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion')
+        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+                            help='sets adaptive softmax dropout for the tail projections')
+        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
+                            help='adaptive softmax factor')
+        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
+                            help='if set, disables positional embeddings (outside self attention)')
+        parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
+                            help='share decoder input and output embeddings')
+        parser.add_argument('--character-embeddings', default=False, action='store_true',
+                            help='if set, uses character embedding convolutions to produce token embeddings')
+        parser.add_argument('--character-filters', type=str, metavar='LIST',
+                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
+                            help='size of character embeddings')
+        parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
+                            help='size of character embeddings')
+        parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
+                            help='number of highway layers for character token embedder')
+        parser.add_argument('--adaptive-input', default=False, action='store_true',
+                            help='if set, uses adaptive input')
+        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
+                            help='adaptive input factor')
+        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive input cutoff points.')
+        parser.add_argument('--tie-adaptive-weights', action='store_true',
+                            help='if set, ties the weights of adaptive softmax and adaptive input')
+        parser.add_argument('--tie-adaptive-proj', action='store_true',
+                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
+        parser.add_argument('--decoder-learned-pos', action='store_true',
+                            help='use learned positional embeddings in the decoder')
+
+        """LightConv and DynamicConv arguments"""
+        parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
+                            help='list of kernel size (default: "[3,7,15,31,31,31]")')
+        parser.add_argument('--decoder-glu', type=options.eval_bool,
+                            help='glu after in proj')
+        parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
+                            choices=['dynamic', 'lightweight'],
+                            help='type of convolution')
+        parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
+        parser.add_argument('--weight-dropout', type=float, metavar='D',
+                            help='dropout probability for conv weights')
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        # make sure all arguments are present in older models
+        base_lm_architecture(args)
+
+        if not hasattr(args, 'max_source_positions'):
+            args.max_source_positions = args.tokens_per_sample
+        if not hasattr(args, 'max_target_positions'):
+            args.max_target_positions = args.tokens_per_sample
+
+        if args.character_embeddings:
+            embed_tokens = CharacterTokenEmbedder(task.dictionary, eval(args.character_filters),
+                                                  args.character_embedding_dim,
+                                                  args.decoder_embed_dim,
+                                                  args.char_embedder_highway_layers,
+                                                  )
+        elif args.adaptive_input:
+            embed_tokens = AdaptiveInput(len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
+                                         args.adaptive_input_factor, args.decoder_embed_dim,
+                                         options.eval_str_list(args.adaptive_input_cutoff, type=int))
+        else:
+            embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
+
+        if args.tie_adaptive_weights:
+            assert args.adaptive_input
+            assert args.adaptive_input_factor == args.adaptive_softmax_factor
+            assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
+                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
+            assert args.decoder_input_dim == args.decoder_output_dim
+
+        decoder = LightConvDecoder(args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False)
+        return LightConvLanguageModel(decoder)
+
+
+@register_model_architecture('lightconv_lm', 'lightconv_lm')
+def base_lm_architecture(args):
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
+    args.decoder_layers = getattr(args, 'decoder_layers', 6)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
+    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
+    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
+    args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
+    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
+
+    args.character_embeddings = getattr(args, 'character_embeddings', False)
+
+    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
+    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
+
+    # The model training is not stable without this
+    args.decoder_normalize_before = True
+
+    args.adaptive_input = getattr(args, 'adaptive_input', False)
+    args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
+    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
+
+    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
+    args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
+
+    args.decoder_kernel_size_list = getattr(args, 'decoder_kernel_size_list', [3, 7, 15, 31, 31, 31])
+    if len(args.decoder_kernel_size_list) == 1:
+        args.decoder_kernel_size_list = args.decoder_kernel_size_list * args.decoder_layers
+
+
+@register_model_architecture('lightconv_lm', 'lightconv_lm_gbw')
+def lightconv_lm_gbw(args):
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
+    base_lm_architecture(args)
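The tie_adaptive_weights assertions in build_model above guard a structural requirement: AdaptiveInput and AdaptiveSoftmax must slice the vocabulary into identical frequency bands before their per-band embedding tables can be shared. A rough sketch of the band shapes, with hypothetical sizes chosen for illustration only:

vocab_size, embed_dim, factor = 50000, 512, 4
cutoff = [10000, 25000]  # must be identical on the input and softmax side
# band 0: ids [0, 10000)      -> embeddings of dim 512
# band 1: ids [10000, 25000)  -> dim 512 // 4 = 128
# band 2: ids [25000, 50000)  -> dim 512 // 16 = 32
# tying shares each band's embedding matrix between input and output softmax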
diff --git a/fairseq/models/lstm.py b/fairseq/models/lstm.py
index d13fe064f..26dc1953e 100644
--- a/fairseq/models/lstm.py
+++ b/fairseq/models/lstm.py
@@ -10,11 +10,14 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from fairseq import options, utils
-from fairseq.modules import AdaptiveSoftmax
-from . import (
-    FairseqEncoder, FairseqIncrementalDecoder, FairseqModel, register_model,
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqModel,
+    register_model,
     register_model_architecture,
 )
+from fairseq.modules import AdaptiveSoftmax
 
 
 @register_model('lstm')
@@ -299,7 +302,7 @@ class AttentionLayer(nn.Module):
 
         # sum weighted sources
         x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)
-        x = F.tanh(self.output_proj(torch.cat((x, input), dim=1)))
+        x = torch.tanh(self.output_proj(torch.cat((x, input), dim=1)))
 
         return x, attn_scores
diff --git a/fairseq/models/masked_lm.py b/fairseq/models/masked_lm.py
index 7ab6cc8b9..fb5920f05 100644
--- a/fairseq/models/masked_lm.py
+++ b/fairseq/models/masked_lm.py
@@ -9,8 +9,11 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from . import (
-    BaseFairseqModel, FairseqEncoder, register_model, register_model_architecture,
+from fairseq.models import (
+    BaseFairseqModel,
+    FairseqEncoder,
+    register_model,
+    register_model_architecture,
 )
 from fairseq.modules import (
     SinusoidalPositionalEmbedding,
diff --git a/fairseq/models/multilingual_transformer.py b/fairseq/models/multilingual_transformer.py
index dd61ba919..6e637006a 100644
--- a/fairseq/models/multilingual_transformer.py
+++ b/fairseq/models/multilingual_transformer.py
@@ -8,17 +8,19 @@
 from collections import OrderedDict
 
 from fairseq import utils
-from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
-
-from . import FairseqMultiModel, register_model, register_model_architecture
-
-from .transformer import (
+from fairseq.models import (
+    FairseqMultiModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer import (
     base_architecture,
     Embedding,
     TransformerModel,
     TransformerEncoder,
     TransformerDecoder,
 )
+from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
 
 
 @register_model('multilingual_transformer')
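The lstm.py hunk above replaces F.tanh with torch.tanh: the functional alias is deprecated in PyTorch 1.x and emits a UserWarning, while the result is numerically identical. A quick check (illustrative, not part of the patch):

import torch
import torch.nn.functional as F

x = torch.randn(4)
assert torch.equal(torch.tanh(x), F.tanh(x))  # same values; F.tanh just warns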
diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py
index 11cc7e051..8e9391d8e 100644
--- a/fairseq/models/transformer.py
+++ b/fairseq/models/transformer.py
@@ -12,14 +12,19 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from fairseq import options, utils
-from fairseq.modules import (
-    AdaptiveInput, AdaptiveSoftmax, CharacterTokenEmbedder, LayerNorm,
-    MultiheadAttention, PositionalEmbedding, SinusoidalPositionalEmbedding,
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqIncrementalDecoder,
+    FairseqModel,
+    register_model,
+    register_model_architecture,
 )
-
-from . import (
-    FairseqIncrementalDecoder, FairseqEncoder, FairseqLanguageModel,
-    FairseqModel, register_model, register_model_architecture,
+from fairseq.modules import (
+    AdaptiveSoftmax,
+    LayerNorm,
+    MultiheadAttention,
+    PositionalEmbedding,
+    SinusoidalPositionalEmbedding,
 )
@@ -149,113 +154,6 @@ class TransformerModel(FairseqModel):
         return TransformerModel(encoder, decoder)
 
 
-@register_model('transformer_lm')
-class TransformerLanguageModel(FairseqLanguageModel):
-    def __init__(self, decoder):
-        super().__init__(decoder)
-
-    @staticmethod
-    def add_args(parser):
-        """Add model-specific arguments to the parser."""
-        # fmt: off
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
-                            help='dropout probability')
-        parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
-                            help='dropout probability for attention weights')
-        parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
-                            help='dropout probability after ReLU in FFN')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension')
-        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
-                            help='decoder output dimension')
-        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
-                            help='decoder input dimension')
-        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension for FFN')
-        parser.add_argument('--decoder-layers', type=int, metavar='N',
-                            help='num decoder layers')
-        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
-                            help='num decoder attention heads')
-        parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
-                            help='apply layernorm before each decoder block')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive softmax cutoff points. '
-                                 'Must be used with adaptive_loss criterion')
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
-                            help='sets adaptive softmax dropout for the tail projections')
-        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
-                            help='adaptive input factor')
-        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
-                            help='if set, disables positional embeddings (outside self attention)')
-        parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
-                            help='share decoder input and output embeddings')
-        parser.add_argument('--character-embeddings', default=False, action='store_true',
-                            help='if set, uses character embedding convolutions to produce token embeddings')
-        parser.add_argument('--character-filters', type=str, metavar='LIST',
-                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
-                            help='size of character embeddings')
-        parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
-                            help='size of character embeddings')
-        parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
-                            help='number of highway layers for character token embeddder')
-        parser.add_argument('--adaptive-input', action='store_true',
-                            help='if set, uses adaptive input')
-        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
-                            help='adaptive input factor')
-        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive input cutoff points.')
-        parser.add_argument('--tie-adaptive-weights', action='store_true',
-                            help='if set, ties the weights of adaptive softmax and adaptive input')
-        parser.add_argument('--tie-adaptive-proj', action='store_true',
-                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
-        parser.add_argument('--decoder-learned-pos', action='store_true',
-                            help='use learned positional embeddings in the decoder')
-        # fmt: on
-
-    @classmethod
-    def build_model(cls, args, task):
-        """Build a new model instance."""
-
-        # make sure all arguments are present in older models
-        base_lm_architecture(args)
-
-        if hasattr(args, 'no_tie_adaptive_proj') and args.no_tie_adaptive_proj is False:
-            # backward compatibility
-            args.tie_adaptive_proj = True
-
-        if not hasattr(args, 'max_source_positions'):
-            args.max_source_positions = args.tokens_per_sample
-        if not hasattr(args, 'max_target_positions'):
-            args.max_target_positions = args.tokens_per_sample
-
-        if args.character_embeddings:
-            embed_tokens = CharacterTokenEmbedder(
-                task.dictionary, eval(args.character_filters),
-                args.character_embedding_dim, args.decoder_embed_dim,
-                args.char_embedder_highway_layers,
-            )
-        elif args.adaptive_input:
-            embed_tokens = AdaptiveInput(
-                len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
-                args.adaptive_input_factor, args.decoder_embed_dim,
-                options.eval_str_list(args.adaptive_input_cutoff, type=int),
-            )
-        else:
-            embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
-
-        if args.tie_adaptive_weights:
-            assert args.adaptive_input
-            assert args.adaptive_input_factor == args.adaptive_softmax_factor
-            assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
-                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
-            assert args.decoder_input_dim == args.decoder_output_dim
-
-        decoder = TransformerDecoder(
-            args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False,
-        )
-        return TransformerLanguageModel(decoder)
-
-
 class TransformerEncoder(FairseqEncoder):
     """
     Transformer encoder consisting of *args.encoder_layers* layers. Each layer
@@ -804,67 +702,6 @@ def Linear(in_features, out_features, bias=True):
     return m
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm')
-def base_lm_architecture(args):
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
-    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
-    args.decoder_layers = getattr(args, 'decoder_layers', 6)
-    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
-    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
-    args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
-    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
-    args.activation_fn = getattr(args, 'activation_fn', 'relu')
-
-    args.add_bos_token = getattr(args, 'add_bos_token', False)
-    args.character_embeddings = getattr(args, 'character_embeddings', False)
-
-    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
-    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
-
-    # The model training is not stable without this
-    args.decoder_normalize_before = True
-
-    args.adaptive_input = getattr(args, 'adaptive_input', False)
-    args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
-    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
-
-    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
-    args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
-
-
-@register_model_architecture('transformer_lm', 'transformer_lm_big')
-def transformer_lm_big(args):
-    args.decoder_layers = getattr(args, 'decoder_layers', 12)
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
-    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
-    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
-    base_lm_architecture(args)
-
-
-@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
-def transformer_lm_wiki103(args):
-    args.decoder_layers = getattr(args, 'decoder_layers', 16)
-    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
-    args.dropout = getattr(args, 'dropout', 0.3)
-    args.adaptive_input = getattr(args, 'adaptive_input', True)
-    args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', True)
-    args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', '20000,60000')
-    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '20000,60000')
-    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0.2)
-    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
-    args.activation_dropout = getattr(args, 'activation_dropout', 0.1)
-    transformer_lm_big(args)
-
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
-def transformer_lm_gbw(args):
-    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
-    args.dropout = getattr(args, 'dropout', 0.1)
-    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
-    transformer_lm_big(args)
-
-
 @register_model_architecture('transformer', 'transformer')
 def base_architecture(args):
     args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
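Because the language models are bound to '--arch' through the registry rather than through their module path, moving them out of transformer.py is invisible to callers. An illustrative lookup, assuming fairseq's ARCH_MODEL_REGISTRY (the dict that register_model_architecture populates):

from fairseq.models import ARCH_MODEL_REGISTRY

model_cls = ARCH_MODEL_REGISTRY['transformer_lm_big']
print(model_cls.__name__)  # TransformerLanguageModel, now defined in transformer_lm.py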
--git a/fairseq/models/transformer_from_pretrained_xlm.py b/fairseq/models/transformer_from_pretrained_xlm.py index f151c3db2..7b85b7f59 100644 --- a/fairseq/models/transformer_from_pretrained_xlm.py +++ b/fairseq/models/transformer_from_pretrained_xlm.py @@ -10,6 +10,7 @@ from typing import Any, Dict from fairseq import checkpoint_utils from fairseq.data.masked_lm_dictionary import MaskedLMDictionary +from fairseq.models import register_model, register_model_architecture from fairseq.models.transformer import ( TransformerDecoder, TransformerEncoder, @@ -17,8 +18,6 @@ from fairseq.models.transformer import ( base_architecture as transformer_base_architecture, ) -from . import register_model, register_model_architecture - @register_model("transformer_from_pretrained_xlm") class TransformerFromPretrainedXLMModel(TransformerModel): diff --git a/fairseq/models/transformer_lm.py b/fairseq/models/transformer_lm.py new file mode 100644 index 000000000..2264b8f1a --- /dev/null +++ b/fairseq/models/transformer_lm.py @@ -0,0 +1,241 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from fairseq import options +from fairseq.models import ( + FairseqLanguageModel, + register_model, + register_model_architecture, +) +from fairseq.models.transformer import ( + Embedding, + TransformerDecoder, +) +from fairseq.modules import ( + AdaptiveInput, + CharacterTokenEmbedder, +) + + +@register_model('transformer_lm') +class TransformerLanguageModel(FairseqLanguageModel): + def __init__(self, decoder): + super().__init__(decoder) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + # fmt: off + parser.add_argument('--dropout', default=0.1, type=float, metavar='D', + help='dropout probability') + parser.add_argument('--attention-dropout', default=0., type=float, metavar='D', + help='dropout probability for attention weights') + parser.add_argument('--relu-dropout', default=0., type=float, metavar='D', + help='dropout probability after ReLU in FFN') + parser.add_argument('--decoder-embed-dim', type=int, metavar='N', + help='decoder embedding dimension') + parser.add_argument('--decoder-output-dim', type=int, metavar='N', + help='decoder output dimension') + parser.add_argument('--decoder-input-dim', type=int, metavar='N', + help='decoder input dimension') + parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', + help='decoder embedding dimension for FFN') + parser.add_argument('--decoder-layers', type=int, metavar='N', + help='num decoder layers') + parser.add_argument('--decoder-attention-heads', type=int, metavar='N', + help='num decoder attention heads') + parser.add_argument('--decoder-normalize-before', default=False, action='store_true', + help='apply layernorm before each decoder block') + parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', + help='comma separated list of adaptive softmax cutoff points. 
' + 'Must be used with adaptive_loss criterion') + parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', + help='sets adaptive softmax dropout for the tail projections') + parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N', + help='adaptive input factor') + parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', + help='if set, disables positional embeddings (outside self attention)') + parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true', + help='share decoder input and output embeddings') + parser.add_argument('--character-embeddings', default=False, action='store_true', + help='if set, uses character embedding convolutions to produce token embeddings') + parser.add_argument('--character-filters', type=str, metavar='LIST', + default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]', + help='size of character embeddings') + parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4, + help='size of character embeddings') + parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2, + help='number of highway layers for character token embeddder') + parser.add_argument('--adaptive-input', action='store_true', + help='if set, uses adaptive input') + parser.add_argument('--adaptive-input-factor', type=float, metavar='N', + help='adaptive input factor') + parser.add_argument('--adaptive-input-cutoff', metavar='EXPR', + help='comma separated list of adaptive input cutoff points.') + parser.add_argument('--tie-adaptive-weights', action='store_true', + help='if set, ties the weights of adaptive softmax and adaptive input') + parser.add_argument('--tie-adaptive-proj', action='store_true', + help='if set, ties the projection weights of adaptive softmax and adaptive input') + parser.add_argument('--decoder-learned-pos', action='store_true', + help='use learned positional embeddings in the decoder') + # fmt: on + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + base_lm_architecture(args) + + if hasattr(args, 'no_tie_adaptive_proj') and args.no_tie_adaptive_proj is False: + # backward compatibility + args.tie_adaptive_proj = True + + if not hasattr(args, 'max_source_positions'): + args.max_source_positions = args.tokens_per_sample + if not hasattr(args, 'max_target_positions'): + args.max_target_positions = args.tokens_per_sample + + if args.character_embeddings: + embed_tokens = CharacterTokenEmbedder( + task.dictionary, eval(args.character_filters), + args.character_embedding_dim, args.decoder_embed_dim, + args.char_embedder_highway_layers, + ) + elif args.adaptive_input: + embed_tokens = AdaptiveInput( + len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim, + args.adaptive_input_factor, args.decoder_embed_dim, + options.eval_str_list(args.adaptive_input_cutoff, type=int), + ) + else: + embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad()) + + if args.tie_adaptive_weights: + assert args.adaptive_input + assert args.adaptive_input_factor == args.adaptive_softmax_factor + assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format( + args.adaptive_softmax_cutoff, args.adaptive_input_cutoff) + assert args.decoder_input_dim == args.decoder_output_dim + + decoder = TransformerDecoder( + args, task.output_dictionary, embed_tokens, 
no_encoder_attn=True, final_norm=False, + ) + return TransformerLanguageModel(decoder) + + +@register_model_architecture('transformer_lm', 'transformer_lm') +def base_lm_architecture(args): + args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512) + args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048) + args.decoder_layers = getattr(args, 'decoder_layers', 6) + args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8) + args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None) + args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0) + args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4) + args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False) + args.activation_fn = getattr(args, 'activation_fn', 'relu') + + args.add_bos_token = getattr(args, 'add_bos_token', False) + args.character_embeddings = getattr(args, 'character_embeddings', False) + + args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim) + args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim) + + # The model training is not stable without this + args.decoder_normalize_before = True + + args.adaptive_input = getattr(args, 'adaptive_input', False) + args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4) + args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None) + + args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False) + args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False) + + +@register_model_architecture('transformer_lm', 'transformer_lm_big') +def transformer_lm_big(args): + args.decoder_layers = getattr(args, 'decoder_layers', 12) + args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024) + args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096) + args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16) + base_lm_architecture(args) + + +@register_model_architecture('transformer_lm', 'transformer_lm_wiki103') +def transformer_lm_wiki103(args): + args.decoder_layers = getattr(args, 'decoder_layers', 16) + args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8) + args.dropout = getattr(args, 'dropout', 0.3) + args.adaptive_input = getattr(args, 'adaptive_input', True) + args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', True) + args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', '20000,60000') + args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '20000,60000') + args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0.2) + args.attention_dropout = getattr(args, 'attention_dropout', 0.1) + args.activation_dropout = getattr(args, 'activation_dropout', 0.1) + transformer_lm_big(args) + + +@register_model_architecture('transformer_lm', 'transformer_lm_gbw') +def transformer_lm_gbw(args): + args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512) + args.dropout = getattr(args, 'dropout', 0.1) + args.attention_dropout = getattr(args, 'attention_dropout', 0.1) + transformer_lm_big(args) + + +@register_model_architecture('transformer_lm', 'transformer_lm_gpt') +def transformer_lm_gpt(args): + args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768) + args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 3072) + args.decoder_layers = getattr(args, 'decoder_layers', 12) + args.decoder_attention_heads = getattr(args, 
+@register_model_architecture('transformer_lm', 'transformer_lm_gpt')
+def transformer_lm_gpt(args):
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 3072)
+    args.decoder_layers = getattr(args, 'decoder_layers', 12)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 12)
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
+    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
+    base_lm_architecture(args)
+
+
+@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_small')
+def transformer_lm_gpt2_small(args):
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
+    args.decoder_layers = getattr(args, 'decoder_layers', 24)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
+    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
+    base_lm_architecture(args)
+
+
+@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_medium')
+def transformer_lm_gpt2_medium(args):
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1280)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 5120)
+    args.decoder_layers = getattr(args, 'decoder_layers', 36)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 20)
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
+    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
+    base_lm_architecture(args)
+
+
+@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_big')
+def transformer_lm_gpt2_big(args):
+    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1600)
+    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 6400)
+    args.decoder_layers = getattr(args, 'decoder_layers', 48)
+    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 25)
+    args.dropout = getattr(args, 'dropout', 0.1)
+    args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
+    args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
+    args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
+    base_lm_architecture(args)
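+
+
+# All GPT presets above differ only in depth and width; they share dropout 0.1,
+# attention dropout 0.1, decoder_final_norm=True and the 'gelu_fast' activation.
+# Cutoff strings such as '20000,60000' are parsed by
+# options.eval_str_list(..., type=int) into [20000, 60000], i.e. a head of the
+# 20k most frequent types followed by two lower-frequency tails.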
diff --git a/fairseq/modules/dynamic_convolution.py b/fairseq/modules/dynamic_convolution.py
index 6faf136a0..990ff80cf 100644
--- a/fairseq/modules/dynamic_convolution.py
+++ b/fairseq/modules/dynamic_convolution.py
@@ -150,7 +150,7 @@ class DynamicConv1dTBC(nn.Module):
         weight = F.dropout(weight, self.weight_dropout, training=self.training, inplace=False)
 
-        output = torch.bmm(x_unfold, weight.unsqueeze(2)) # T*B*H x R x 1
+        output = torch.bmm(x_unfold, weight.unsqueeze(2))  # T*B*H x R x 1
         output = output.view(T, B, C)
         return output
 
@@ -195,7 +195,7 @@ class DynamicConv1dTBC(nn.Module):
         # turn the convolution filters into band matrices
         weight_expanded = weight.new_zeros(B*H, T, T+K-1, requires_grad=False)
         weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight)
-        weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T
+        weight_expanded = weight_expanded.narrow(2, P, T)  # B*H x T x T
         output = torch.bmm(weight_expanded, x)
         output = output.transpose(0, 1).contiguous().view(T, B, C)
diff --git a/fairseq/modules/highway.py b/fairseq/modules/highway.py
index 9728ca17a..2fd3f6ace 100644
--- a/fairseq/modules/highway.py
+++ b/fairseq/modules/highway.py
@@ -6,7 +6,6 @@
 # can be found in the PATENTS file in the same directory.
 
 import torch
-import torch.nn.functional as F
 from torch import nn
 
@@ -50,6 +49,6 @@ class Highway(torch.nn.Module):
             projection = layer(x)
             proj_x, gate = projection.chunk(2, dim=-1)
             proj_x = self.activation(proj_x)
-            gate = F.sigmoid(gate)
+            gate = torch.sigmoid(gate)
             x = gate * x + (gate.new_tensor([1]) - gate) * proj_x
         return x
diff --git a/fairseq/modules/lightweight_convolution.py b/fairseq/modules/lightweight_convolution.py
index a254a1605..644d80e60 100644
--- a/fairseq/modules/lightweight_convolution.py
+++ b/fairseq/modules/lightweight_convolution.py
@@ -5,14 +5,12 @@
 # the root directory of this source tree. An additional grant of patent rights
 # can be found in the PATENTS file in the same directory.
 
-import math
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
 from fairseq import utils
-from .unfold import unfold1d
+from fairseq.modules.unfold import unfold1d
 
 
 class LightweightConv1d(nn.Module):
@@ -182,7 +180,7 @@ class LightweightConv1dTBC(nn.Module):
         weight = weight.view(1, H, K).expand(T*B, H, K).contiguous().view(T*B*H, K, 1)
 
         weight = F.dropout(weight, self.weight_dropout, training=self.training)
-        output = torch.bmm(x_unfold, weight) # T*B*H x R x 1
+        output = torch.bmm(x_unfold, weight)  # T*B*H x R x 1
         output = output.view(T, B, C)
         return output
diff --git a/fairseq/modules/transformer_sentence_encoder_layer.py b/fairseq/modules/transformer_sentence_encoder_layer.py
index 9619b7f6d..4b6ce5863 100644
--- a/fairseq/modules/transformer_sentence_encoder_layer.py
+++ b/fairseq/modules/transformer_sentence_encoder_layer.py
@@ -5,12 +5,16 @@
 # the root directory of this source tree. An additional grant of patent rights
 # can be found in the PATENTS file in the same directory.
 
-import math
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from fairseq.modules import gelu, MultiheadAttention, BertLayerNorm, LayerNorm
+
+from fairseq.modules import (
+    BertLayerNorm,
+    gelu,
+    LayerNorm,
+    MultiheadAttention,
+)
 
 
 class TransformerSentenceEncoderLayer(nn.Module):
diff --git a/fairseq/optim/__init__.py b/fairseq/optim/__init__.py
index 901eea860..db78193b0 100644
--- a/fairseq/optim/__init__.py
+++ b/fairseq/optim/__init__.py
@@ -12,6 +12,13 @@ from .fairseq_optimizer import FairseqOptimizer
 from .fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer
 
 
+__all__ = [
+    'FairseqOptimizer',
+    'FP16Optimizer',
+    'MemoryEfficientFP16Optimizer',
+]
+
+
 OPTIMIZER_REGISTRY = {}
 OPTIMIZER_CLASS_NAMES = set()
diff --git a/fairseq/tasks/cross_lingual_lm.py b/fairseq/tasks/cross_lingual_lm.py
index 20d1b23c0..c731ce25c 100644
--- a/fairseq/tasks/cross_lingual_lm.py
+++ b/fairseq/tasks/cross_lingual_lm.py
@@ -78,7 +78,6 @@ class CrossLingualLMTask(FairseqTask):
             lang2id[lang] = id
         return lang2id
 
-
     @classmethod
     def load_dictionary(cls, filename):
         return MaskedLMDictionary.load(filename)
diff --git a/fairseq/tasks/language_modeling.py b/fairseq/tasks/language_modeling.py
index fd28616b1..bdbc027cd 100644
--- a/fairseq/tasks/language_modeling.py
+++ b/fairseq/tasks/language_modeling.py
@@ -21,7 +21,7 @@ from fairseq.data import (
     TruncatedDictionary,
     indexed_dataset
 )
-from . import FairseqTask, register_task
+from fairseq.tasks import FairseqTask, register_task
 
 
 @register_task('language_modeling')
diff --git a/fairseq/tasks/multilingual_translation.py b/fairseq/tasks/multilingual_translation.py
index 7be9be7e9..717c2d983 100644
--- a/fairseq/tasks/multilingual_translation.py
+++ b/fairseq/tasks/multilingual_translation.py
@@ -13,10 +13,8 @@ import torch
 
 from fairseq import options, utils
 from fairseq.data import (
-    BacktranslationDataset,
     Dictionary,
     LanguagePairDataset,
-    NoisingDataset,
     RoundRobinZipDatasets,
     TransformEosLangPairDataset,
     indexed_dataset,
diff --git a/fairseq/tasks/translation.py b/fairseq/tasks/translation.py
index 47a81687e..0116bf23b 100644
--- a/fairseq/tasks/translation.py
+++ b/fairseq/tasks/translation.py
@@ -12,9 +12,8 @@ from fairseq import options, utils
 from fairseq.data import (
     ConcatDataset,
     data_utils,
-    Dictionary,
+    indexed_dataset,
     LanguagePairDataset,
-    indexed_dataset
 )
 
 from . import FairseqTask, register_task
@@ -26,8 +25,8 @@ class TranslationTask(FairseqTask):
     Translate from one (source) language to another (target) language.
 
     Args:
-        src_dict (Dictionary): dictionary for the source language
-        tgt_dict (Dictionary): dictionary for the target language
+        src_dict (~fairseq.data.Dictionary): dictionary for the source language
+        tgt_dict (~fairseq.data.Dictionary): dictionary for the target language
 
     .. note::
diff --git a/fairseq/tasks/translation_moe.py b/fairseq/tasks/translation_moe.py
index 3a0ea7bf8..320e418c0 100644
--- a/fairseq/tasks/translation_moe.py
+++ b/fairseq/tasks/translation_moe.py
@@ -9,8 +9,8 @@ import contextlib
 
 import torch
 
 from fairseq import modules, utils
-from . import register_task
-from .translation import TranslationTask
+from fairseq.tasks import register_task
+from fairseq.tasks.translation import TranslationTask
 
 
 @contextlib.contextmanager
diff --git a/interactive.py b/interactive.py
index f66a23000..87dd290d7 100644
--- a/interactive.py
+++ b/interactive.py
@@ -11,12 +11,11 @@ Translate raw text with a trained model. Batches data on-the-fly.
 
 from collections import namedtuple
 import fileinput
-import sys
 
 import torch
 
 from fairseq import checkpoint_utils, options, tasks, utils
-from fairseq.sequence_generator import SequenceGenerator
+
 
 Batch = namedtuple('Batch', 'ids src_tokens src_lengths')
 Translation = namedtuple('Translation', 'src_str hypos pos_scores alignments')
diff --git a/train.py b/train.py
index e16fdcc52..af852783b 100644
--- a/train.py
+++ b/train.py
@@ -10,7 +10,6 @@ Train a new model on one or across multiple GPUs.
 """
 
 import collections
-import itertools
 import math
 import os
 import random
@@ -140,7 +139,7 @@ def train(args, trainer, task, epoch_itr):
     """Train the model for one epoch."""
 
     # Update parameters every N batches
     update_freq = args.update_freq[epoch_itr.epoch - 1] \
-            if epoch_itr.epoch <= len(args.update_freq) else args.update_freq[-1]
+        if epoch_itr.epoch <= len(args.update_freq) else args.update_freq[-1]
 
     # Initialize data iterator
     itr = epoch_itr.next_epoch_itr(