Cleanup LM + Flake8

Summary: Pull Request resolved: https://github.com/pytorch/fairseq/pull/720

Differential Revision: D15259091

Pulled By: myleott

fbshipit-source-id: 06a35996c06ccddb49fdc9e01e348ff3c9da334e
Myle Ott, 2019-05-08 08:57:56 -07:00 (committed by Facebook Github Bot)
parent eddcdf08e1
commit f2563c21e1
34 changed files with 649 additions and 501 deletions
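The diff has two themes: the language-model variants (fconv_lm, lightconv_lm, transformer_lm) move out of the shared model files into dedicated modules, and the package is cleaned up to pass flake8: per-import `# noqa: F401` suppressions give way to explicit `__all__` lists, relative imports become absolute, deprecated `F.tanh`/`F.sigmoid` calls become `torch.tanh`/`torch.sigmoid`, and spacing issues are fixed. A minimal sketch of the `__all__` pattern the diff adopts, shown with a stand-in module rather than the real fairseq imports:

```python
# Previously each re-export needed a per-line suppression of flake8's
# F401 ("imported but unused") warning:
#     from .fairseq_decoder import FairseqDecoder  # noqa: F401
# Now the import is plain and the public API is declared once;
# pyflakes treats every name listed in __all__ as used.
from os import path

__all__ = ['path']
```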

View File

@@ -47,4 +47,5 @@ __all__ = [
'TokenBlockDataset',
'TransformEosDataset',
'TransformEosLangPairDataset',
'TruncatedDictionary',
]

View File

@@ -10,6 +10,7 @@ import os
import numpy as np
from collections import Iterable
def infer_language_pair(path):
"""Infer language pair from filename: <split>.<lang1>-<lang2>.(...).idx"""
src, dst = None, None
@@ -182,7 +183,7 @@ def batch_by_size(
def process_bpe_symbol(sentence: str, bpe_symbol: str):
if bpe_symbol == 'sentencepiece':
sentence = sentence.replace(' ','').replace('\u2581', ' ').strip()
sentence = sentence.replace(' ', '').replace('\u2581', ' ').strip()
elif bpe_symbol is not None:
sentence = (sentence + ' ').replace(bpe_symbol, '').rstrip()
return sentence
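The only change to `process_bpe_symbol` is a missing space after a comma (flake8 E231); behavior is identical. For reference, a small sketch of what the two branches do, with illustrative inputs:

```python
# SentencePiece mode: drop ordinary spaces, then turn the U+2581
# word-boundary marker back into spaces.
assert process_bpe_symbol('\u2581Hello \u2581world', 'sentencepiece') == 'Hello world'

# Subword BPE mode: deleting every occurrence of 'symbol + space'
# rejoins the word pieces.
assert process_bpe_symbol('Hel@@ lo wor@@ ld', '@@ ') == 'Hello world'
```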

View File

@@ -18,6 +18,7 @@ from fairseq.data import data_utils
class Dictionary(object):
"""A mapping from symbols to consecutive integers"""
def __init__(self, pad='<pad>', eos='</s>', unk='<unk>', bos='<s>'):
self.unk_word, self.pad_word, self.eos_word = unk, pad, eos
self.symbols = []
@@ -282,6 +283,7 @@ class Dictionary(object):
else:
merge_result(Dictionary._add_file_to_dictionary_single_worker(filename, tokenize, dict.eos_word))
class TruncatedDictionary(object):
def __init__(self, wrapped_dict, length):

View File

@@ -7,8 +7,6 @@
import itertools
import math
import queue
import threading
import numpy as np
import torch

View File

@@ -8,8 +8,6 @@
import numpy as np
import torch
from fairseq import utils
from . import data_utils, FairseqDataset

View File

@@ -10,7 +10,7 @@ import math
import numpy as np
import torch
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Tuple
from . import FairseqDataset, data_utils

View File

@@ -9,19 +9,33 @@ import argparse
import importlib
import os
from .fairseq_decoder import FairseqDecoder # noqa: F401
from .fairseq_encoder import FairseqEncoder # noqa: F401
from .fairseq_incremental_decoder import FairseqIncrementalDecoder # noqa: F401
from .fairseq_decoder import FairseqDecoder
from .fairseq_encoder import FairseqEncoder
from .fairseq_incremental_decoder import FairseqIncrementalDecoder
from .fairseq_model import (
BaseFairseqModel,
FairseqModel, # noqa: F401
FairseqMultiModel, # noqa: F401
FairseqLanguageModel, # noqa: F401
FairseqEncoderModel, # noqa: F401
FairseqModel,
FairseqMultiModel,
FairseqLanguageModel,
FairseqEncoderModel,
)
from .composite_encoder import CompositeEncoder # noqa: F401
from .distributed_fairseq_model import DistributedFairseqModel # noqa: F401
from .composite_encoder import CompositeEncoder
from .distributed_fairseq_model import DistributedFairseqModel
__all__ = [
'BaseFairseqModel',
'CompositeEncoder',
'DistributedFairseqModel',
'FairseqDecoder',
'FairseqEncoder',
'FairseqEncoderModel',
'FairseqIncrementalDecoder',
'FairseqLanguageModel',
'FairseqModel',
'FairseqMultiModel',
]
MODEL_REGISTRY = {}
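`MODEL_REGISTRY` backs the `register_model` decorator used by every model file below. The registration helpers themselves are outside this hunk; a simplified sketch of the mechanism (not the actual fairseq implementation):

```python
MODEL_REGISTRY = {}

def register_model(name):
    """Decorator that maps a model name to its class."""
    def register_model_cls(cls):
        MODEL_REGISTRY[name] = cls
        return cls
    return register_model_cls

@register_model('example')
class ExampleModel:
    pass

assert MODEL_REGISTRY['example'] is ExampleModel
```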

View File

@@ -5,7 +5,7 @@
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from . import FairseqEncoder
from fairseq.models import FairseqEncoder
class CompositeEncoder(FairseqEncoder):

View File

@@ -6,14 +6,11 @@
# can be found in the PATENTS file in the same directory.
import inspect
import socket
from torch.nn import parallel
from fairseq import distributed_utils
from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel
from . import BaseFairseqModel
from fairseq.models import BaseFairseqModel
def DistributedFairseqModel(args, model):

View File

@@ -5,7 +5,7 @@
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from . import FairseqDecoder
from fairseq.models import FairseqDecoder
class FairseqIncrementalDecoder(FairseqDecoder):
@@ -25,7 +25,7 @@ class FairseqIncrementalDecoder(FairseqDecoder):
The :class:`FairseqIncrementalDecoder` interface also defines the
:func:`reorder_incremental_state` method, which is used during beam search
to select and reorder the incremental state based on the selection of beams.
To learn more about how incremental decoding works, refer to `this blog
<http://www.telesens.co/2019/04/21/understanding-incremental-decoding-in-fairseq/>`_.
"""

View File

@@ -4,14 +4,15 @@
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from typing import Dict, List, Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from . import FairseqDecoder, FairseqEncoder
from fairseq.data import Dictionary
from fairseq.models import FairseqDecoder, FairseqEncoder
class BaseFairseqModel(nn.Module):

View File

@@ -10,17 +10,19 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq import utils
from fairseq.models import (
FairseqEncoder,
FairseqIncrementalDecoder,
FairseqModel,
register_model,
register_model_architecture,
)
from fairseq.modules import (
AdaptiveSoftmax, BeamableMM, GradMultiply, LearnedPositionalEmbedding,
LinearizedConvolution,
)
from . import (
FairseqEncoder, FairseqIncrementalDecoder, FairseqModel,
FairseqLanguageModel, register_model, register_model_architecture,
)
@register_model('fconv')
class FConvModel(FairseqModel):
@@ -111,58 +113,6 @@ class FConvModel(FairseqModel):
return FConvModel(encoder, decoder)
@register_model('fconv_lm')
class FConvLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if hasattr(args, 'max_target_positions') and not hasattr(args, 'tokens_per_sample'):
args.tokens_per_sample = args.max_target_positions
decoder = FConvDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
convolutions=eval(args.decoder_layers),
out_embed_dim=args.decoder_embed_dim,
attention=eval(args.decoder_attention),
dropout=args.dropout,
max_positions=args.tokens_per_sample,
share_embed=False,
positional_embeddings=False,
adaptive_softmax_cutoff=(
options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
if args.criterion == 'adaptive_loss' else None
),
adaptive_softmax_dropout=args.adaptive_softmax_dropout,
)
return FConvLanguageModel(decoder)
class FConvEncoder(FairseqEncoder):
"""
Convolutional encoder consisting of `len(convolutions)` layers.
@@ -643,46 +593,6 @@ def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
return nn.utils.weight_norm(m, dim=2)
@register_model_architecture('fconv_lm', 'fconv_lm')
def base_lm_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_wikitext103')
def fconv_lm_dauphin_wikitext103(args):
layers = '[(850, 6)] * 3'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 5)] * 4'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 4)] * 3'
layers += ' + [(1024, 4)] * 1'
layers += ' + [(2048, 4)] * 1'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 280)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,20000,200000')
base_lm_architecture(args)
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_gbw')
def fconv_lm_dauphin_gbw(args):
layers = '[(512, 5)]'
layers += ' + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3'
layers += ' + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,50000,200000')
base_lm_architecture(args)
@register_model_architecture('fconv', 'fconv')
def base_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)

fairseq/models/fconv_lm.py (new file, 106 lines)
View File

@@ -0,0 +1,106 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from fairseq import options
from fairseq.models import (
FairseqLanguageModel,
register_model,
register_model_architecture,
)
from fairseq.models.fconv import FConvDecoder
@register_model('fconv_lm')
class FConvLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if hasattr(args, 'max_target_positions') and not hasattr(args, 'tokens_per_sample'):
args.tokens_per_sample = args.max_target_positions
decoder = FConvDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
convolutions=eval(args.decoder_layers),
out_embed_dim=args.decoder_embed_dim,
attention=eval(args.decoder_attention),
dropout=args.dropout,
max_positions=args.tokens_per_sample,
share_embed=False,
positional_embeddings=False,
adaptive_softmax_cutoff=(
options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
if args.criterion == 'adaptive_loss' else None
),
adaptive_softmax_dropout=args.adaptive_softmax_dropout,
)
return FConvLanguageModel(decoder)
@register_model_architecture('fconv_lm', 'fconv_lm')
def base_lm_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_wikitext103')
def fconv_lm_dauphin_wikitext103(args):
layers = '[(850, 6)] * 3'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 5)] * 4'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 4)] * 3'
layers += ' + [(1024, 4)] * 1'
layers += ' + [(2048, 4)] * 1'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 280)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,20000,200000')
base_lm_architecture(args)
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_gbw')
def fconv_lm_dauphin_gbw(args):
layers = '[(512, 5)]'
layers += ' + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3'
layers += ' + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,50000,200000')
base_lm_architecture(args)
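`decoder_layers` is stored as a Python expression in string form and expanded with `eval` inside `build_model` above. A quick check of what the default spec from `base_lm_architecture` evaluates to:

```python
# '[(1268, 4)] * 13' -> thirteen (out_channels, kernel_size) pairs.
convolutions = eval('[(1268, 4)] * 13')
assert len(convolutions) == 13
assert convolutions[0] == (1268, 4)
```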

View File

@@ -12,14 +12,20 @@ import torch.nn as nn
import torch.nn.functional as F
from fairseq import checkpoint_utils
from fairseq.modules import (
DownsampledMultiHeadAttention, GradMultiply, LayerNorm,
LearnedPositionalEmbedding, LinearizedConvolution,
from fairseq.models import (
CompositeEncoder,
FairseqDecoder,
FairseqEncoder,
FairseqModel,
register_model,
register_model_architecture,
)
from . import (
FairseqEncoder, CompositeEncoder, FairseqDecoder, FairseqModel,
register_model, register_model_architecture,
from fairseq.modules import (
DownsampledMultiHeadAttention,
GradMultiply,
LayerNorm,
LearnedPositionalEmbedding,
LinearizedConvolution,
)

View File

@@ -12,15 +12,21 @@ import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq.modules import (
AdaptiveInput, AdaptiveSoftmax, CharacterTokenEmbedder, LayerNorm,
LearnedPositionalEmbedding, MultiheadAttention, SinusoidalPositionalEmbedding,
DynamicConv1dTBC, LightweightConv1dTBC,
from fairseq.models import (
FairseqEncoder,
FairseqIncrementalDecoder,
FairseqModel,
register_model,
register_model_architecture,
)
from . import (
FairseqIncrementalDecoder, FairseqEncoder, FairseqLanguageModel,
FairseqModel, register_model, register_model_architecture,
from fairseq.modules import (
AdaptiveSoftmax,
DynamicConv1dTBC,
LayerNorm,
LearnedPositionalEmbedding,
LightweightConv1dTBC,
MultiheadAttention,
SinusoidalPositionalEmbedding,
)
@@ -171,117 +177,6 @@ class LightConvModel(FairseqModel):
return LightConvModel(encoder, decoder)
@register_model('lightconv_lm')
class LightConvLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
help='dropout probability after ReLU in FFN')
parser.add_argument('--input-dropout', type=float, metavar='D',
help='dropout probability of the inputs')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
help='decoder output dimension')
parser.add_argument('--decoder-input-dim', type=int, metavar='N',
help='decoder input dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads or LightConv/DynamicConv heads')
parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
help='adaptive input factor')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--character-embeddings', default=False, action='store_true',
help='if set, uses character embedding convolutions to produce token embeddings')
parser.add_argument('--character-filters', type=str, metavar='LIST',
default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
help='size of character embeddings')
parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
help='size of character embeddings')
parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
help='number of highway layers for character token embeddder')
parser.add_argument('--adaptive-input', default=False, action='store_true',
help='if set, uses adaptive input')
parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
help='adaptive input factor')
parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
help='comma separated list of adaptive input cutoff points.')
parser.add_argument('--tie-adaptive-weights', action='store_true',
help='if set, ties the weights of adaptive softmax and adaptive input')
parser.add_argument('--tie-adaptive-proj', action='store_true',
help='if set, ties the projection weights of adaptive softmax and adaptive input')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
"""LightConv and DynamicConv arguments"""
parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
help='list of kernel size (default: "[3,7,15,31,31,31]")')
parser.add_argument('--decoder-glu', type=options.eval_bool,
help='glu after in proj')
parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
choices=['dynamic', 'lightweight'],
help='type of convolution')
parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
parser.add_argument('--weight-dropout', type=float, metavar='D',
help='dropout probability for conv weights')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if not hasattr(args, 'max_source_positions'):
args.max_source_positions = args.tokens_per_sample
if not hasattr(args, 'max_target_positions'):
args.max_target_positions = args.tokens_per_sample
if args.character_embeddings:
embed_tokens = CharacterTokenEmbedder(task.dictionary, eval(args.character_filters),
args.character_embedding_dim,
args.decoder_embed_dim,
args.char_embedder_highway_layers,
)
elif args.adaptive_input:
embed_tokens = AdaptiveInput(len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
args.adaptive_input_factor, args.decoder_embed_dim,
options.eval_str_list(args.adaptive_input_cutoff, type=int))
else:
embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
if args.tie_adaptive_weights:
assert args.adaptive_input
assert args.adaptive_input_factor == args.adaptive_softmax_factor
assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
assert args.decoder_input_dim == args.decoder_output_dim
decoder = LightConvDecoder(args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False)
return LightConvLanguageModel(decoder)
class LightConvEncoder(FairseqEncoder):
"""
LightConv encoder consisting of *args.encoder_layers* layers. Each layer
@@ -786,47 +681,6 @@ def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, learned=Fals
return m
@register_model_architecture('lightconv_lm', 'lightconv_lm')
def base_lm_architecture(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
args.character_embeddings = getattr(args, 'character_embeddings', False)
args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
# The model training is not stable without this
args.decoder_normalize_before = True
args.adaptive_input = getattr(args, 'adaptive_input', False)
args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
args.decoder_kernel_size_list = getattr(args, 'decoder_kernel_size_list', [3, 7, 15, 31, 31, 31])
if len(args.decoder_kernel_size_list) == 1:
args.decoder_kernel_size_list = args.decoder_kernel_size_list * args.decoder_layers
@register_model_architecture('lightconv_lm', 'lightconv_lm_gbw')
def lightconv_lm_gbw(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
base_lm_architecture(args)
@register_model_architecture('lightconv', 'lightconv')
def base_architecture(args):
args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)

View File

fairseq/models/lightconv_lm.py (new file, 173 lines)

@@ -0,0 +1,173 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from fairseq import options
from fairseq.models import (
FairseqLanguageModel,
register_model,
register_model_architecture,
)
from fairseq.models.lightconv import (
Embedding,
LightConvDecoder,
)
from fairseq.modules import (
AdaptiveInput,
CharacterTokenEmbedder,
)
@register_model('lightconv_lm')
class LightConvLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
help='dropout probability after ReLU in FFN')
parser.add_argument('--input-dropout', type=float, metavar='D',
help='dropout probability of the inputs')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
help='decoder output dimension')
parser.add_argument('--decoder-input-dim', type=int, metavar='N',
help='decoder input dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads or LightConv/DynamicConv heads')
parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
help='adaptive input factor')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--character-embeddings', default=False, action='store_true',
help='if set, uses character embedding convolutions to produce token embeddings')
parser.add_argument('--character-filters', type=str, metavar='LIST',
default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
help='size of character embeddings')
parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
help='size of character embeddings')
parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
help='number of highway layers for character token embeddder')
parser.add_argument('--adaptive-input', default=False, action='store_true',
help='if set, uses adaptive input')
parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
help='adaptive input factor')
parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
help='comma separated list of adaptive input cutoff points.')
parser.add_argument('--tie-adaptive-weights', action='store_true',
help='if set, ties the weights of adaptive softmax and adaptive input')
parser.add_argument('--tie-adaptive-proj', action='store_true',
help='if set, ties the projection weights of adaptive softmax and adaptive input')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
"""LightConv and DynamicConv arguments"""
parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
help='list of kernel size (default: "[3,7,15,31,31,31]")')
parser.add_argument('--decoder-glu', type=options.eval_bool,
help='glu after in proj')
parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
choices=['dynamic', 'lightweight'],
help='type of convolution')
parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
parser.add_argument('--weight-dropout', type=float, metavar='D',
help='dropout probability for conv weights')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if not hasattr(args, 'max_source_positions'):
args.max_source_positions = args.tokens_per_sample
if not hasattr(args, 'max_target_positions'):
args.max_target_positions = args.tokens_per_sample
if args.character_embeddings:
embed_tokens = CharacterTokenEmbedder(task.dictionary, eval(args.character_filters),
args.character_embedding_dim,
args.decoder_embed_dim,
args.char_embedder_highway_layers,
)
elif args.adaptive_input:
embed_tokens = AdaptiveInput(len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
args.adaptive_input_factor, args.decoder_embed_dim,
options.eval_str_list(args.adaptive_input_cutoff, type=int))
else:
embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
if args.tie_adaptive_weights:
assert args.adaptive_input
assert args.adaptive_input_factor == args.adaptive_softmax_factor
assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
assert args.decoder_input_dim == args.decoder_output_dim
decoder = LightConvDecoder(args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False)
return LightConvLanguageModel(decoder)
@register_model_architecture('lightconv_lm', 'lightconv_lm')
def base_lm_architecture(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
args.character_embeddings = getattr(args, 'character_embeddings', False)
args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
# The model training is not stable without this
args.decoder_normalize_before = True
args.adaptive_input = getattr(args, 'adaptive_input', False)
args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
args.decoder_kernel_size_list = getattr(args, 'decoder_kernel_size_list', [3, 7, 15, 31, 31, 31])
if len(args.decoder_kernel_size_list) == 1:
args.decoder_kernel_size_list = args.decoder_kernel_size_list * args.decoder_layers
@register_model_architecture('lightconv_lm', 'lightconv_lm_gbw')
def lightconv_lm_gbw(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
base_lm_architecture(args)
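One detail of `base_lm_architecture` above worth noting: a single-element `decoder_kernel_size_list` is broadcast across all decoder layers. The same logic as a standalone sketch:

```python
decoder_layers = 6
decoder_kernel_size_list = [15]           # one kernel size supplied
if len(decoder_kernel_size_list) == 1:    # the check used above
    decoder_kernel_size_list = decoder_kernel_size_list * decoder_layers
assert decoder_kernel_size_list == [15] * 6
```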

View File

@ -10,11 +10,14 @@ import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq.modules import AdaptiveSoftmax
from . import (
FairseqEncoder, FairseqIncrementalDecoder, FairseqModel, register_model,
from fairseq.models import (
FairseqEncoder,
FairseqIncrementalDecoder,
FairseqModel,
register_model,
register_model_architecture,
)
from fairseq.modules import AdaptiveSoftmax
@register_model('lstm')
@@ -299,7 +302,7 @@ class AttentionLayer(nn.Module):
# sum weighted sources
x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)
x = F.tanh(self.output_proj(torch.cat((x, input), dim=1)))
x = torch.tanh(self.output_proj(torch.cat((x, input), dim=1)))
return x, attn_scores
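`F.tanh` (like `F.sigmoid`, fixed the same way further down) was deprecated in PyTorch 1.0 in favor of `torch.tanh`; the swap silences the deprecation warning without changing the math. A quick check, assuming a PyTorch version where `F.tanh` still exists:

```python
import torch
import torch.nn.functional as F

x = torch.randn(4)
# F.tanh forwards to torch.tanh, so outputs match exactly.
assert torch.equal(F.tanh(x), torch.tanh(x))
```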

View File

@@ -9,8 +9,11 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from . import (
BaseFairseqModel, FairseqEncoder, register_model, register_model_architecture,
from fairseq.models import (
BaseFairseqModel,
FairseqEncoder,
register_model,
register_model_architecture,
)
from fairseq.modules import (
SinusoidalPositionalEmbedding,

View File

@@ -8,17 +8,19 @@
from collections import OrderedDict
from fairseq import utils
from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
from . import FairseqMultiModel, register_model, register_model_architecture
from .transformer import (
from fairseq.models import (
FairseqMultiModel,
register_model,
register_model_architecture,
)
from fairseq.models.transformer import (
base_architecture,
Embedding,
TransformerModel,
TransformerEncoder,
TransformerDecoder,
)
from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
@register_model('multilingual_transformer')

View File

@@ -12,14 +12,19 @@ import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq.modules import (
AdaptiveInput, AdaptiveSoftmax, CharacterTokenEmbedder, LayerNorm,
MultiheadAttention, PositionalEmbedding, SinusoidalPositionalEmbedding,
from fairseq.models import (
FairseqEncoder,
FairseqIncrementalDecoder,
FairseqModel,
register_model,
register_model_architecture,
)
from . import (
FairseqIncrementalDecoder, FairseqEncoder, FairseqLanguageModel,
FairseqModel, register_model, register_model_architecture,
from fairseq.modules import (
AdaptiveSoftmax,
LayerNorm,
MultiheadAttention,
PositionalEmbedding,
SinusoidalPositionalEmbedding,
)
@@ -149,113 +154,6 @@ class TransformerModel(FairseqModel):
return TransformerModel(encoder, decoder)
@register_model('transformer_lm')
class TransformerLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
# fmt: off
parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
help='dropout probability after ReLU in FFN')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
help='decoder output dimension')
parser.add_argument('--decoder-input-dim', type=int, metavar='N',
help='decoder input dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
help='adaptive input factor')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--character-embeddings', default=False, action='store_true',
help='if set, uses character embedding convolutions to produce token embeddings')
parser.add_argument('--character-filters', type=str, metavar='LIST',
default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
help='size of character embeddings')
parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
help='size of character embeddings')
parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
help='number of highway layers for character token embeddder')
parser.add_argument('--adaptive-input', action='store_true',
help='if set, uses adaptive input')
parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
help='adaptive input factor')
parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
help='comma separated list of adaptive input cutoff points.')
parser.add_argument('--tie-adaptive-weights', action='store_true',
help='if set, ties the weights of adaptive softmax and adaptive input')
parser.add_argument('--tie-adaptive-proj', action='store_true',
help='if set, ties the projection weights of adaptive softmax and adaptive input')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
# fmt: on
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if hasattr(args, 'no_tie_adaptive_proj') and args.no_tie_adaptive_proj is False:
# backward compatibility
args.tie_adaptive_proj = True
if not hasattr(args, 'max_source_positions'):
args.max_source_positions = args.tokens_per_sample
if not hasattr(args, 'max_target_positions'):
args.max_target_positions = args.tokens_per_sample
if args.character_embeddings:
embed_tokens = CharacterTokenEmbedder(
task.dictionary, eval(args.character_filters),
args.character_embedding_dim, args.decoder_embed_dim,
args.char_embedder_highway_layers,
)
elif args.adaptive_input:
embed_tokens = AdaptiveInput(
len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
args.adaptive_input_factor, args.decoder_embed_dim,
options.eval_str_list(args.adaptive_input_cutoff, type=int),
)
else:
embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
if args.tie_adaptive_weights:
assert args.adaptive_input
assert args.adaptive_input_factor == args.adaptive_softmax_factor
assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
assert args.decoder_input_dim == args.decoder_output_dim
decoder = TransformerDecoder(
args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False,
)
return TransformerLanguageModel(decoder)
class TransformerEncoder(FairseqEncoder):
"""
Transformer encoder consisting of *args.encoder_layers* layers. Each layer
@@ -804,67 +702,6 @@ def Linear(in_features, out_features, bias=True):
return m
@register_model_architecture('transformer_lm', 'transformer_lm')
def base_lm_architecture(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
args.activation_fn = getattr(args, 'activation_fn', 'relu')
args.add_bos_token = getattr(args, 'add_bos_token', False)
args.character_embeddings = getattr(args, 'character_embeddings', False)
args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
# The model training is not stable without this
args.decoder_normalize_before = True
args.adaptive_input = getattr(args, 'adaptive_input', False)
args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
@register_model_architecture('transformer_lm', 'transformer_lm_big')
def transformer_lm_big(args):
args.decoder_layers = getattr(args, 'decoder_layers', 12)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
base_lm_architecture(args)
@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
def transformer_lm_wiki103(args):
args.decoder_layers = getattr(args, 'decoder_layers', 16)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.dropout = getattr(args, 'dropout', 0.3)
args.adaptive_input = getattr(args, 'adaptive_input', True)
args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', True)
args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', '20000,60000')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '20000,60000')
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0.2)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.activation_dropout = getattr(args, 'activation_dropout', 0.1)
transformer_lm_big(args)
@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
def transformer_lm_gbw(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
transformer_lm_big(args)
@register_model_architecture('transformer', 'transformer')
def base_architecture(args):
args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)

View File

@@ -10,6 +10,7 @@ from typing import Any, Dict
from fairseq import checkpoint_utils
from fairseq.data.masked_lm_dictionary import MaskedLMDictionary
from fairseq.models import register_model, register_model_architecture
from fairseq.models.transformer import (
TransformerDecoder,
TransformerEncoder,
@@ -17,8 +18,6 @@ from fairseq.models.transformer import (
base_architecture as transformer_base_architecture,
)
from . import register_model, register_model_architecture
@register_model("transformer_from_pretrained_xlm")
class TransformerFromPretrainedXLMModel(TransformerModel):

View File

fairseq/models/transformer_lm.py (new file, 241 lines)

@@ -0,0 +1,241 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from fairseq import options
from fairseq.models import (
FairseqLanguageModel,
register_model,
register_model_architecture,
)
from fairseq.models.transformer import (
Embedding,
TransformerDecoder,
)
from fairseq.modules import (
AdaptiveInput,
CharacterTokenEmbedder,
)
@register_model('transformer_lm')
class TransformerLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
# fmt: off
parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
help='dropout probability after ReLU in FFN')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
help='decoder output dimension')
parser.add_argument('--decoder-input-dim', type=int, metavar='N',
help='decoder input dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
help='adaptive input factor')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--character-embeddings', default=False, action='store_true',
help='if set, uses character embedding convolutions to produce token embeddings')
parser.add_argument('--character-filters', type=str, metavar='LIST',
default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
help='size of character embeddings')
parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
help='size of character embeddings')
parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
help='number of highway layers for character token embeddder')
parser.add_argument('--adaptive-input', action='store_true',
help='if set, uses adaptive input')
parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
help='adaptive input factor')
parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
help='comma separated list of adaptive input cutoff points.')
parser.add_argument('--tie-adaptive-weights', action='store_true',
help='if set, ties the weights of adaptive softmax and adaptive input')
parser.add_argument('--tie-adaptive-proj', action='store_true',
help='if set, ties the projection weights of adaptive softmax and adaptive input')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
# fmt: on
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if hasattr(args, 'no_tie_adaptive_proj') and args.no_tie_adaptive_proj is False:
# backward compatibility
args.tie_adaptive_proj = True
if not hasattr(args, 'max_source_positions'):
args.max_source_positions = args.tokens_per_sample
if not hasattr(args, 'max_target_positions'):
args.max_target_positions = args.tokens_per_sample
if args.character_embeddings:
embed_tokens = CharacterTokenEmbedder(
task.dictionary, eval(args.character_filters),
args.character_embedding_dim, args.decoder_embed_dim,
args.char_embedder_highway_layers,
)
elif args.adaptive_input:
embed_tokens = AdaptiveInput(
len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim,
args.adaptive_input_factor, args.decoder_embed_dim,
options.eval_str_list(args.adaptive_input_cutoff, type=int),
)
else:
embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad())
if args.tie_adaptive_weights:
assert args.adaptive_input
assert args.adaptive_input_factor == args.adaptive_softmax_factor
assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
assert args.decoder_input_dim == args.decoder_output_dim
decoder = TransformerDecoder(
args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False,
)
return TransformerLanguageModel(decoder)
@register_model_architecture('transformer_lm', 'transformer_lm')
def base_lm_architecture(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4)
args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
args.activation_fn = getattr(args, 'activation_fn', 'relu')
args.add_bos_token = getattr(args, 'add_bos_token', False)
args.character_embeddings = getattr(args, 'character_embeddings', False)
args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
# The model training is not stable without this
args.decoder_normalize_before = True
args.adaptive_input = getattr(args, 'adaptive_input', False)
args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4)
args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None)
args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False)
args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False)
@register_model_architecture('transformer_lm', 'transformer_lm_big')
def transformer_lm_big(args):
args.decoder_layers = getattr(args, 'decoder_layers', 12)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
base_lm_architecture(args)
@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
def transformer_lm_wiki103(args):
args.decoder_layers = getattr(args, 'decoder_layers', 16)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.dropout = getattr(args, 'dropout', 0.3)
args.adaptive_input = getattr(args, 'adaptive_input', True)
args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', True)
args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', '20000,60000')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '20000,60000')
args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0.2)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.activation_dropout = getattr(args, 'activation_dropout', 0.1)
transformer_lm_big(args)
@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
def transformer_lm_gbw(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
transformer_lm_big(args)
@register_model_architecture('transformer_lm', 'transformer_lm_gpt')
def transformer_lm_gpt(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 3072)
args.decoder_layers = getattr(args, 'decoder_layers', 12)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 12)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
base_lm_architecture(args)
@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_small')
def transformer_lm_gpt2_small(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_layers = getattr(args, 'decoder_layers', 24)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
base_lm_architecture(args)
@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_medium')
def transformer_lm_gpt2_medium(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1280)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 5120)
args.decoder_layers = getattr(args, 'decoder_layers', 36)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 20)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
base_lm_architecture(args)
@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_big')
def transformer_lm_gpt2_big(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1600)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 6400)
args.decoder_layers = getattr(args, 'decoder_layers', 48)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 25)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.decoder_final_norm = getattr(args, 'decoder_final_norm', True)
args.activation_fn = getattr(args, 'activation_fn', 'gelu_fast')
base_lm_architecture(args)
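The four GPT-style presets only override the size-related fields before deferring to `base_lm_architecture`, and all of them keep the FFN dimension at four times the embedding dimension:

```python
# (embed_dim, ffn_dim, layers, heads) as set by the presets above.
presets = {
    'transformer_lm_gpt':         (768,  3072, 12, 12),
    'transformer_lm_gpt2_small':  (1024, 4096, 24, 16),
    'transformer_lm_gpt2_medium': (1280, 5120, 36, 20),
    'transformer_lm_gpt2_big':    (1600, 6400, 48, 25),
}
for name, (embed_dim, ffn_dim, layers, heads) in presets.items():
    assert ffn_dim == 4 * embed_dim
```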

View File

@@ -150,7 +150,7 @@ class DynamicConv1dTBC(nn.Module):
weight = F.dropout(weight, self.weight_dropout, training=self.training, inplace=False)
output = torch.bmm(x_unfold, weight.unsqueeze(2)) # T*B*H x R x 1
output = torch.bmm(x_unfold, weight.unsqueeze(2)) # T*B*H x R x 1
output = output.view(T, B, C)
return output
@@ -195,7 +195,7 @@ class DynamicConv1dTBC(nn.Module):
# turn the convolution filters into band matrices
weight_expanded = weight.new_zeros(B*H, T, T+K-1, requires_grad=False)
weight_expanded.as_strided((B*H, T, K), (T*(T+K-1), T+K, 1)).copy_(weight)
weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T
weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T
output = torch.bmm(weight_expanded, x)
output = output.transpose(0, 1).contiguous().view(T, B, C)

View File

@@ -6,7 +6,6 @@
# can be found in the PATENTS file in the same directory.
import torch
import torch.nn.functional as F
from torch import nn
@@ -50,6 +49,6 @@ class Highway(torch.nn.Module):
projection = layer(x)
proj_x, gate = projection.chunk(2, dim=-1)
proj_x = self.activation(proj_x)
gate = F.sigmoid(gate)
gate = torch.sigmoid(gate)
x = gate * x + (gate.new_tensor([1]) - gate) * proj_x
return x

View File

@@ -5,14 +5,12 @@
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq import utils
from .unfold import unfold1d
from fairseq.modules.unfold import unfold1d
class LightweightConv1d(nn.Module):
@@ -182,7 +180,7 @@ class LightweightConv1dTBC(nn.Module):
weight = weight.view(1, H, K).expand(T*B, H, K).contiguous().view(T*B*H, K, 1)
weight = F.dropout(weight, self.weight_dropout, training=self.training)
output = torch.bmm(x_unfold, weight) # T*B*H x R x 1
output = torch.bmm(x_unfold, weight) # T*B*H x R x 1
output = output.view(T, B, C)
return output

View File

@@ -5,12 +5,16 @@
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq.modules import gelu, MultiheadAttention, BertLayerNorm, LayerNorm
from fairseq.modules import (
BertLayerNorm,
gelu,
LayerNorm,
MultiheadAttention,
)
class TransformerSentenceEncoderLayer(nn.Module):

View File

@@ -12,6 +12,13 @@ from .fairseq_optimizer import FairseqOptimizer
from .fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer
__all__ = [
'FairseqOptimizer',
'FP16Optimizer',
'MemoryEfficientFP16Optimizer',
]
OPTIMIZER_REGISTRY = {}
OPTIMIZER_CLASS_NAMES = set()

View File

@@ -78,7 +78,6 @@ class CrossLingualLMTask(FairseqTask):
lang2id[lang] = id
return lang2id
@classmethod
def load_dictionary(cls, filename):
return MaskedLMDictionary.load(filename)

View File

@@ -21,7 +21,7 @@ from fairseq.data import (
TruncatedDictionary,
indexed_dataset
)
from . import FairseqTask, register_task
from fairseq.tasks import FairseqTask, register_task
@register_task('language_modeling')

View File

@@ -13,10 +13,8 @@ import torch
from fairseq import options, utils
from fairseq.data import (
BacktranslationDataset,
Dictionary,
LanguagePairDataset,
NoisingDataset,
RoundRobinZipDatasets,
TransformEosLangPairDataset,
indexed_dataset,

View File

@@ -12,9 +12,8 @@ from fairseq import options, utils
from fairseq.data import (
ConcatDataset,
data_utils,
Dictionary,
indexed_dataset,
LanguagePairDataset,
indexed_dataset
)
from . import FairseqTask, register_task
@@ -26,8 +25,8 @@ class TranslationTask(FairseqTask):
Translate from one (source) language to another (target) language.
Args:
src_dict (Dictionary): dictionary for the source language
tgt_dict (Dictionary): dictionary for the target language
src_dict (~fairseq.data.Dictionary): dictionary for the source language
tgt_dict (~fairseq.data.Dictionary): dictionary for the target language
.. note::

View File

@@ -9,8 +9,8 @@ import contextlib
import torch
from fairseq import modules, utils
from . import register_task
from .translation import TranslationTask
from fairseq.tasks import register_task
from fairseq.tasks.translation import TranslationTask
@contextlib.contextmanager

View File

@@ -11,12 +11,11 @@ Translate raw text with a trained model. Batches data on-the-fly.
from collections import namedtuple
import fileinput
import sys
import torch
from fairseq import checkpoint_utils, options, tasks, utils
from fairseq.sequence_generator import SequenceGenerator
Batch = namedtuple('Batch', 'ids src_tokens src_lengths')
Translation = namedtuple('Translation', 'src_str hypos pos_scores alignments')

View File

@@ -10,7 +10,6 @@ Train a new model on one or across multiple GPUs.
"""
import collections
import itertools
import math
import os
import random
@@ -140,7 +139,7 @@ def train(args, trainer, task, epoch_itr):
"""Train the model for one epoch."""
# Update parameters every N batches
update_freq = args.update_freq[epoch_itr.epoch - 1] \
if epoch_itr.epoch <= len(args.update_freq) else args.update_freq[-1]
if epoch_itr.epoch <= len(args.update_freq) else args.update_freq[-1]
# Initialize data iterator
itr = epoch_itr.next_epoch_itr(
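The final hunk is another indentation-only fix (flake8 continuation-line alignment); the `update_freq` schedule it touches reuses the last entry once the epoch index runs past the list. A standalone sketch of that selection, with illustrative values:

```python
update_freq = [4, 2, 1]   # per-epoch schedule
for epoch in (1, 2, 3, 10):
    freq = update_freq[epoch - 1] if epoch <= len(update_freq) else update_freq[-1]
    # epochs 1, 2, 3 -> 4, 2, 1; epoch 10 falls back to 1
```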