From 0790c0cfc3038dcd1890262f75c8a01f47d5fc20 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Thu, 9 Jan 2020 13:26:21 -0800
Subject: [PATCH] Doc improvements

Summary:
Pull Request resolved: https://github.com/pytorch/fairseq/pull/1606

Differential Revision: D19330727

Pulled By: myleott

fbshipit-source-id: dc6d100e42566efbc2ebc955689878ed8a820861
---
 examples/language_model/README.md | 19 +++++++++++++++++--
 fairseq/options.py                |  5 +++++
 fairseq/trainer.py                |  3 ++-
 train.py                          |  2 +-
 4 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/examples/language_model/README.md b/examples/language_model/README.md
index 66c5cb8e9..43f3381a1 100644
--- a/examples/language_model/README.md
+++ b/examples/language_model/README.md
@@ -96,13 +96,28 @@ batch) or `--tokens-per-sample` (max sequence length). You can also adjust
 number of GPUs.
 
 ### 3) Evaluate
+
 ```bash
 fairseq-eval-lm data-bin/wikitext-103 \
     --path checkpoints/transformer_wiki103/checkpoint_best.pt \
-    --sample-break-mode complete --max-tokens 3072 \
-    --context-window 2560 --softmax-batch 1024
+    --max-sentences 2 \
+    --tokens-per-sample 512 \
+    --context-window 400
+# | Evaluated 245569 tokens in 56.1s (4379.02 tokens/s)
+# | Loss: 3.4164, Perplexity: 30.46
 ```
 
+*Note:* The `--context-window` option controls how much context is provided to
+each token when computing perplexity. When the window size is 0, the dataset is
+chunked into segments of length 512 and perplexity is computed over each segment
+normally. However, this results in worse (higher) perplexity since tokens that
+appear earlier in each segment have less conditioning. When the maximum window
+size is used (511 in this case), then we compute perplexity for each token
+fully conditioned on 511 tokens of context. This slows down evaluation
+significantly, since we must run a separate forward pass for every token in the
+dataset, but results in better (lower) perplexity.
+
+
 ## Convolutional language models
 
 Please see the [convolutional LM README](conv_lm/README.md) for instructions to
diff --git a/fairseq/options.py b/fairseq/options.py
index 468a27af7..005d45d75 100644
--- a/fairseq/options.py
+++ b/fairseq/options.py
@@ -327,6 +327,11 @@ def add_distributed_training_args(parser):
                        help='which GPU to use (usually configured automatically)')
     group.add_argument('--distributed-no-spawn', action='store_true',
                        help='do not spawn multiple processes even if multiple GPUs are visible')
+    # "c10d" is PyTorch's DDP implementation and provides the fastest
+    # training. "no_c10d" is a more robust, but slightly slower DDP
+    # implementation. Try this if you get warning messages about
+    # inconsistent gradients between workers, or if some of your model
+    # parameters are not always used.
     group.add_argument('--ddp-backend', default='c10d', type=str,
                        choices=['c10d', 'no_c10d'],
                        help='DistributedDataParallel backend')
diff --git a/fairseq/trainer.py b/fairseq/trainer.py
index 4b2112343..c371c8c70 100644
--- a/fairseq/trainer.py
+++ b/fairseq/trainer.py
@@ -402,7 +402,8 @@ class Trainer(object):
             ):
                 raise RuntimeError(
                     "Fatal error: gradients are inconsistent between workers. "
-                    "Try --ddp-backend=no_c10d."
+                    "Try --ddp-backend=no_c10d, which is a more robust but "
+                    "slightly slower DDP implementation."
                 )
 
             self.meters["oom"].update(ooms, len(samples))
diff --git a/train.py b/train.py
index e1de0b072..e91a171ea 100644
--- a/train.py
+++ b/train.py
@@ -348,7 +348,7 @@ def cli_main():
         args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
         args.distributed_rank = None  # set based on device id
         if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
-            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
+            print('| NOTE: you may get faster training with: --ddp-backend=no_c10d')
         torch.multiprocessing.spawn(
             fn=distributed_main,
             args=(args, ),
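For readers of this patch, the README note on `--context-window` describes a sliding-window evaluation scheme. The sketch below is a rough, hypothetical illustration of that idea only, not fairseq's actual implementation: it assumes a generic causal LM callable `model(input_ids)` returning per-position logits, a 1-D `tokens` tensor holding the concatenated evaluation set, and a hypothetical helper name `sliding_window_nll`.

```python
# Hypothetical sketch of sliding-window perplexity evaluation (not fairseq's
# implementation). Assumes `model(input_ids)` is a causal LM returning logits
# of shape [batch, seq_len, vocab_size] and `tokens` is a 1-D LongTensor.
import math

import torch
import torch.nn.functional as F


def sliding_window_nll(model, tokens, tokens_per_sample=512, context_window=400):
    """Average negative log-likelihood per token.

    With context_window=0 this degenerates to scoring disjoint chunks of
    length `tokens_per_sample`; larger windows give each scored token more
    conditioning context, at the cost of re-running the model over
    overlapping segments.
    """
    assert 0 <= context_window < tokens_per_sample
    stride = tokens_per_sample - context_window  # tokens newly scored per window
    total_nll, total_scored = 0.0, 0
    for start in range(0, tokens.numel() - 1, stride):
        end = min(start + tokens_per_sample, tokens.numel())
        window = tokens[start:end].unsqueeze(0)  # [1, T]
        with torch.no_grad():
            logits = model(window)               # [1, T, V] (assumed interface)
        log_probs = F.log_softmax(logits, dim=-1)
        # Positions before `first_target` were already scored by the previous
        # window; here they only provide conditioning context.
        first_target = 1 if start == 0 else context_window
        targets = window[0, first_target:]            # [N]
        preds = log_probs[0, first_target - 1:-1]     # [N, V]
        if targets.numel() == 0:
            continue
        total_nll -= preds.gather(1, targets.unsqueeze(1)).sum().item()
        total_scored += targets.numel()
    return total_nll / total_scored


# Usage sketch: perplexity is exp(average NLL), comparable in spirit to the
# "Loss" / "Perplexity" numbers printed by fairseq-eval-lm in the README hunk.
# ppl = math.exp(sliding_window_nll(model, tokens))
```

Under these assumptions, a window size of 0 recovers plain chunked evaluation, while a window of `tokens_per_sample - 1` scores one new token per forward pass, which is the slow-but-accurate extreme the README note refers to.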