diff --git a/docs/tutorial_classifying_names.rst b/docs/tutorial_classifying_names.rst
index c20bf487d..b420d850b 100644
--- a/docs/tutorial_classifying_names.rst
+++ b/docs/tutorial_classifying_names.rst
@@ -285,7 +285,7 @@ following contents::
             max_source_positions=self.args.max_positions,
             max_target_positions=1,
             # Since our target is a single class label, there's no need for
-            # input feeding. If we set this to ``True`` then our Model's
+            # teacher forcing. If we set this to ``True`` then our Model's
             # ``forward()`` method would receive an additional argument called
             # *prev_output_tokens* that would contain a shifted version of the
             # target sequence.
diff --git a/docs/tutorial_simple_lstm.rst b/docs/tutorial_simple_lstm.rst
index 57a254962..30bdc7213 100644
--- a/docs/tutorial_simple_lstm.rst
+++ b/docs/tutorial_simple_lstm.rst
@@ -125,9 +125,9 @@ Decoder
 
 Our Decoder will predict the next word, conditioned on the Encoder's final
 hidden state and an embedded representation of the previous target word -- which
-is sometimes called *input feeding* or *teacher forcing*. More specifically,
-we'll use a :class:`torch.nn.LSTM` to produce a sequence of hidden states that
-we'll project to the size of the output vocabulary to predict each target word.
+is sometimes called *teacher forcing*. More specifically, we'll use a
+:class:`torch.nn.LSTM` to produce a sequence of hidden states that we'll project
+to the size of the output vocabulary to predict each target word.
 
 ::
 
@@ -171,7 +171,7 @@ we'll project to the size of the output vocabulary to predict each target word.
         """
         Args:
             prev_output_tokens (LongTensor): previous decoder outputs of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
             encoder_out (Tensor, optional): output from the encoder, used for
                 encoder-side attention
 
@@ -387,8 +387,8 @@ previous hidden states. In fairseq this is called :ref:`Incremental decoding`.
 
 Incremental decoding is a special mode at inference time where the Model only
 receives a single timestep
-of input corresponding to the immediately previous output token (for input
-feeding) and must produce the next output incrementally. Thus the model must
+of input corresponding to the immediately previous output token (for teacher
+forcing) and must produce the next output incrementally. Thus the model must
 cache any long-term state that is needed about the sequence, e.g., hidden
 states, convolutional states, etc.
 
diff --git a/fairseq/data/language_pair_dataset.py b/fairseq/data/language_pair_dataset.py
index 351ba1e8f..64a5e4c7e 100644
--- a/fairseq/data/language_pair_dataset.py
+++ b/fairseq/data/language_pair_dataset.py
@@ -88,8 +88,7 @@ class LanguagePairDataset(FairseqDataset):
         shuffle (bool, optional): shuffle dataset elements before batching
             (default: True).
         input_feeding (bool, optional): create a shifted version of the targets
-            to be passed into the model for input feeding/teacher forcing
-            (default: True).
+            to be passed into the model for teacher forcing (default: True).
         remove_eos_from_source (bool, optional): if set, removes eos from end
             of source if it's present (default: False).
         append_eos_to_target (bool, optional): if set, appends eos to end of
@@ -167,10 +166,10 @@ class LanguagePairDataset(FairseqDataset):
                   - `src_lengths` (LongTensor): 1D Tensor of the unpadded
                     lengths of each source sentence of shape `(bsz)`
                   - `prev_output_tokens` (LongTensor): a padded 2D Tensor of
-                    tokens in the target sentence, shifted right by one position
-                    for input feeding/teacher forcing, of shape `(bsz,
-                    tgt_len)`. This key will not be present if *input_feeding*
-                    is ``False``. Padding will appear on the left if
+                    tokens in the target sentence, shifted right by one
+                    position for teacher forcing, of shape `(bsz, tgt_len)`.
+                    This key will not be present if *input_feeding* is
+                    ``False``. Padding will appear on the left if
                     *left_pad_target* is ``True``.
 
                 - `target` (LongTensor): a padded 2D Tensor of tokens in the
diff --git a/fairseq/models/fairseq_decoder.py b/fairseq/models/fairseq_decoder.py
index 732e66a09..2e5398e36 100644
--- a/fairseq/models/fairseq_decoder.py
+++ b/fairseq/models/fairseq_decoder.py
@@ -22,7 +22,7 @@ class FairseqDecoder(nn.Module):
         """
         Args:
             prev_output_tokens (LongTensor): shifted output tokens of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
             encoder_out (dict, optional): output from the encoder, used for
                 encoder-side attention
 
diff --git a/fairseq/models/fairseq_incremental_decoder.py b/fairseq/models/fairseq_incremental_decoder.py
index ede1b5173..1c4121557 100644
--- a/fairseq/models/fairseq_incremental_decoder.py
+++ b/fairseq/models/fairseq_incremental_decoder.py
@@ -13,7 +13,7 @@ class FairseqIncrementalDecoder(FairseqDecoder):
 
     Incremental decoding is a special mode at inference time where the Model
     only receives a single timestep of input corresponding to the previous
-    output token (for input feeding) and must produce the next output
+    output token (for teacher forcing) and must produce the next output
     *incrementally*. Thus the model must cache any long-term state that is
     needed about the sequence, e.g., hidden states, convolutional states, etc.
 
@@ -37,7 +37,7 @@ class FairseqIncrementalDecoder(FairseqDecoder):
         """
         Args:
             prev_output_tokens (LongTensor): shifted output tokens of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
             encoder_out (dict, optional): output from the encoder, used for
                 encoder-side attention
             incremental_state (dict, optional): dictionary used for storing
diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py
index 78b130861..f8bd5ba60 100644
--- a/fairseq/models/fairseq_model.py
+++ b/fairseq/models/fairseq_model.py
@@ -202,8 +202,8 @@ class FairseqEncoderDecoderModel(BaseFairseqModel):
         Run the forward pass for an encoder-decoder model.
 
         First feed a batch of source tokens through the encoder. Then, feed the
-        encoder output and previous decoder outputs (i.e., input feeding/teacher
-        forcing) to the decoder to produce the next outputs::
+        encoder output and previous decoder outputs (i.e., teacher forcing) to
+        the decoder to produce the next outputs::
 
             encoder_out = self.encoder(src_tokens, src_lengths)
             return self.decoder(prev_output_tokens, encoder_out)
@@ -213,7 +213,7 @@ class FairseqEncoderDecoderModel(BaseFairseqModel):
                 `(batch, src_len)`
            src_lengths (LongTensor): source sentence lengths of shape `(batch)`
            prev_output_tokens (LongTensor): previous decoder outputs of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
 
        Returns:
            tuple:
diff --git a/fairseq/models/lightconv.py b/fairseq/models/lightconv.py
index 5b38ac5e7..0dc71a1f7 100644
--- a/fairseq/models/lightconv.py
+++ b/fairseq/models/lightconv.py
@@ -345,7 +345,7 @@ class LightConvDecoder(FairseqIncrementalDecoder):
         """
         Args:
             prev_output_tokens (LongTensor): previous decoder outputs of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
             encoder_out (Tensor, optional): output from the encoder, used for
                 encoder-side attention
             incremental_state (dict): dictionary used for storing state during
diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py
index 59e14a4e7..591a48606 100644
--- a/fairseq/models/transformer.py
+++ b/fairseq/models/transformer.py
@@ -370,7 +370,7 @@ class TransformerDecoder(FairseqIncrementalDecoder):
         """
         Args:
             prev_output_tokens (LongTensor): previous decoder outputs of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
             encoder_out (Tensor, optional): output from the encoder, used for
                 encoder-side attention
             incremental_state (dict): dictionary used for storing state during
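
The docstrings renamed above all describe the same convention: during training the decoder does not consume its own predictions, but a copy of the target shifted right by one position (*teacher forcing*), passed in as *prev_output_tokens*. Below is a minimal, self-contained sketch of that convention; it is illustrative only and not part of this patch -- the toy pad/eos indices, the ``shift_right`` helper, and the ``TinyDecoder`` module are assumptions made for the example, not fairseq APIs::

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    PAD, EOS, VOCAB = 1, 2, 16  # toy indices/vocab size, for illustration only


    def shift_right(target: torch.Tensor, eos_idx: int = EOS) -> torch.Tensor:
        """Shift one unpadded target right by one position: eos moves to the
        front and the final token is dropped, so position t holds target[t-1]."""
        return torch.cat([target.new_tensor([eos_idx]), target[:-1]])


    class TinyDecoder(nn.Module):
        """Minimal stand-in for a decoder: embeds prev_output_tokens and
        predicts a distribution over the vocabulary at every target position."""

        def __init__(self, vocab_size: int, hidden: int = 32):
            super().__init__()
            self.embed = nn.Embedding(vocab_size, hidden, padding_idx=PAD)
            self.lstm = nn.LSTM(hidden, hidden, batch_first=True)
            self.proj = nn.Linear(hidden, vocab_size)

        def forward(self, prev_output_tokens):
            x = self.embed(prev_output_tokens)  # (batch, tgt_len, hidden)
            x, _ = self.lstm(x)                 # (batch, tgt_len, hidden)
            return self.proj(x)                 # (batch, tgt_len, vocab)


    # One target sentence: content tokens followed by eos.
    target = torch.tensor([[5, 9, 12, 7, EOS]])

    # Teacher forcing: the decoder input at step t is the gold token from t-1.
    prev_output_tokens = shift_right(target[0]).unsqueeze(0)  # [[EOS, 5, 9, 12, 7]]

    decoder = TinyDecoder(VOCAB)
    logits = decoder(prev_output_tokens)  # (1, 5, VOCAB)
    loss = F.cross_entropy(logits.view(-1, VOCAB), target.view(-1), ignore_index=PAD)
    print(prev_output_tokens.tolist(), float(loss))

At inference time there is no gold target to shift, which is why the incremental decoders documented in this patch instead receive only the single previously emitted token at each step and must cache their internal state between calls.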