Summary:
Minor fixes:
1. Add the fairseq logo to the README
2. Apply the encoder padding mask in the fconv self-attention model
3. Initialize missing gradients in LegacyDistributedDataParallel
Pull Request resolved: https://github.com/pytorch/fairseq/pull/442

Differential Revision: D13651715

Pulled By: myleott

fbshipit-source-id: ac93c80f1dbffdfe03fbd4b8a8ea527aecb576a7
Authored by Huihui Fan on 2019-01-14 08:56:20 -08:00, committed by Facebook Github Bot
parent 315fa5cbd9
commit d9284ee7ea
4 changed files with 19 additions and 1 deletion

@@ -1,4 +1,4 @@
-# Introduction
+# Introduction <img src="fairseq_logo.png" width="50">
 Fairseq(-py) is a sequence modeling toolkit that allows researchers and
 developers to train custom models for translation, summarization, language

@@ -141,6 +141,8 @@ class LegacyDistributedDataParallel(nn.Module):
         for param in self.module.parameters():
             if not param.requires_grad:
                 continue
+            if param.grad is None:
+                param.grad = torch.zeros_like(param)
             if param.grad.requires_grad:
                 raise RuntimeError("DistributedDataParallel only works "
                                    "with gradients that don't require "

@@ -195,6 +195,10 @@ class FConvEncoder(FairseqEncoder):
         # project to size of convolution
         x = self.fc1(x)
 
+        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
+        if not encoder_padding_mask.any():
+            encoder_padding_mask = None
+
         # B x T x C -> T x B x C
         x = x.transpose(0, 1)
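As a toy illustration of the masking added above (standalone sketch, not fairseq code; the batch of two sentences and the padding index are made up): positions equal to the padding index are marked in a boolean mask, the mask is dropped entirely when the batch contains no padding, and masked_fill later zeroes those timesteps so they do not feed into the convolutions.

import torch

padding_idx = 1
src_tokens = torch.tensor([[5, 6, 7, 1, 1],
                           [8, 9, 1, 1, 1]])           # B x T, toy batch
x = torch.randn(2, 5, 8)                               # B x T x C, toy embeddings

encoder_padding_mask = src_tokens.eq(padding_idx).t()  # -> T x B, as in the diff
if not encoder_padding_mask.any():
    encoder_padding_mask = None                        # skip masking when no padding exists

x = x.transpose(0, 1)                                  # B x T x C -> T x B x C
if encoder_padding_mask is not None:
    x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

print(x[3, 0])  # timestep 3 of sentence 0 was padding, so it is now all zeros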
@@ -202,6 +206,9 @@ class FConvEncoder(FairseqEncoder):
         for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
             residual = x if proj is None else proj(x)
 
+            if encoder_padding_mask is not None:
+                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
+
             x = F.dropout(x, p=self.dropout, training=self.training)
             padding_l = (conv.kernel_size[0] - 1) // 2
             padding_r = conv.kernel_size[0] // 2
@@ -218,6 +225,10 @@ class FConvEncoder(FairseqEncoder):
         # project back to size of embedding
         x = self.fc2(x)
 
+        if encoder_padding_mask is not None:
+            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
+            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
+
         # scale gradients (this only affects backward, not forward)
         x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))
@@ -226,6 +237,7 @@ class FConvEncoder(FairseqEncoder):
         return {
             'encoder_out': (x, y),
+            'encoder_padding_mask': encoder_padding_mask,  # B x T
         }
 
     def reorder_encoder_out(self, encoder_out, new_order):
@@ -233,6 +245,10 @@ class FConvEncoder(FairseqEncoder):
             eo.index_select(0, new_order) for eo in encoder_out['encoder_out']
         )
 
+        encoder_out['encoder_padding_mask'] = tuple(
+            eo.index_select(0, new_order) for eo in encoder_out['encoder_padding_mask']
+        )
+
         if 'pretrained' in encoder_out:
             encoder_out['pretrained']['encoder_out'] = tuple(
                 eo.index_select(0, new_order)
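Finally, a small standalone sketch (toy tensors, not fairseq code) of the general idea behind reorder_encoder_out: during beam search the cached encoder state is re-indexed along the batch dimension with index_select so it stays aligned with the reordered hypotheses.

import torch

encoder_padding_mask = torch.tensor([[False, False, True],
                                     [False, True,  True]])  # B x T
encoder_states = torch.randn(2, 3, 4)                        # B x T x C

new_order = torch.tensor([1, 1, 0])  # hypothetical reordering produced by beam search
print(encoder_padding_mask.index_select(0, new_order).shape)  # torch.Size([3, 3])
print(encoder_states.index_select(0, new_order).shape)        # torch.Size([3, 3, 4])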

fairseq_logo.png (new binary file, 71 KiB; contents not shown)