Summary:
Minor fixes:
1. Add the fairseq logo to the README
2. Apply the encoder padding mask in the fconv self-attention model
3. Initialize missing gradients in LegacyDistributedDataParallel
Pull Request resolved: https://github.com/pytorch/fairseq/pull/442

Differential Revision: D13651715

Pulled By: myleott

fbshipit-source-id: ac93c80f1dbffdfe03fbd4b8a8ea527aecb576a7
Authored by Huihui Fan on 2019-01-14 08:56:20 -08:00, committed by Facebook Github Bot
parent 315fa5cbd9
commit d9284ee7ea
4 changed files with 19 additions and 1 deletion

@@ -1,4 +1,4 @@
-# Introduction
+# Introduction <img src="fairseq_logo.png" width="50">
 Fairseq(-py) is a sequence modeling toolkit that allows researchers and
 developers to train custom models for translation, summarization, language

@@ -141,6 +141,8 @@ class LegacyDistributedDataParallel(nn.Module):
         for param in self.module.parameters():
             if not param.requires_grad:
                 continue
+            if param.grad is None:
+                param.grad = torch.zeros_like(param)
             if param.grad.requires_grad:
                 raise RuntimeError("DistributedDataParallel only works "
                                    "with gradients that don't require "

@@ -195,6 +195,10 @@ class FConvEncoder(FairseqEncoder):
         # project to size of convolution
         x = self.fc1(x)
 
+        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
+        if not encoder_padding_mask.any():
+            encoder_padding_mask = None
+
         # B x T x C -> T x B x C
         x = x.transpose(0, 1)
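As a toy illustration of the masking added above (standalone sketch, not fairseq code; the batch of two sentences and the padding index are made up): positions equal to the padding index are marked in a boolean mask, the mask is dropped entirely when the batch contains no padding, and masked_fill later zeroes those timesteps so they do not feed into the convolutions.

import torch

padding_idx = 1
src_tokens = torch.tensor([[5, 6, 7, 1, 1],
                           [8, 9, 1, 1, 1]])           # B x T, toy batch
x = torch.randn(2, 5, 8)                               # B x T x C, toy embeddings

encoder_padding_mask = src_tokens.eq(padding_idx).t()  # -> T x B, as in the diff
if not encoder_padding_mask.any():
    encoder_padding_mask = None                        # skip masking when no padding exists

x = x.transpose(0, 1)                                  # B x T x C -> T x B x C
if encoder_padding_mask is not None:
    x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

print(x[3, 0])  # timestep 3 of sentence 0 was padding, so it is now all zeros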
@@ -202,6 +206,9 @@ class FConvEncoder(FairseqEncoder):
         for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
             residual = x if proj is None else proj(x)
 
+            if encoder_padding_mask is not None:
+                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
+
             x = F.dropout(x, p=self.dropout, training=self.training)
             padding_l = (conv.kernel_size[0] - 1) // 2
             padding_r = conv.kernel_size[0] // 2
@@ -218,6 +225,10 @@ class FConvEncoder(FairseqEncoder):
         # project back to size of embedding
         x = self.fc2(x)
 
+        if encoder_padding_mask is not None:
+            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
+            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
+
         # scale gradients (this only affects backward, not forward)
         x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))
@@ -226,6 +237,7 @@ class FConvEncoder(FairseqEncoder):
         return {
             'encoder_out': (x, y),
+            'encoder_padding_mask': encoder_padding_mask,  # B x T
         }
 
     def reorder_encoder_out(self, encoder_out, new_order):
@@ -233,6 +245,10 @@ class FConvEncoder(FairseqEncoder):
             eo.index_select(0, new_order) for eo in encoder_out['encoder_out']
         )
 
+        encoder_out['encoder_padding_mask'] = tuple(
+            eo.index_select(0, new_order) for eo in encoder_out['encoder_padding_mask']
+        )
+
         if 'pretrained' in encoder_out:
             encoder_out['pretrained']['encoder_out'] = tuple(
                 eo.index_select(0, new_order)
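Finally, a small standalone sketch (toy tensors, not fairseq code) of the general idea behind reorder_encoder_out: during beam search the cached encoder state is re-indexed along the batch dimension with index_select so it stays aligned with the reordered hypotheses.

import torch

encoder_padding_mask = torch.tensor([[False, False, True],
                                     [False, True,  True]])  # B x T
encoder_states = torch.randn(2, 3, 4)                        # B x T x C

new_order = torch.tensor([1, 1, 0])  # hypothetical reordering produced by beam search
print(encoder_padding_mask.index_select(0, new_order).shape)  # torch.Size([3, 3])
print(encoder_states.index_select(0, new_order).shape)        # torch.Size([3, 3, 4])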

fairseq_logo.png (new binary file, 71 KiB; contents not shown)