diff --git a/docs/tutorial_classifying_names.rst b/docs/tutorial_classifying_names.rst
index c20bf487d..b420d850b 100644
--- a/docs/tutorial_classifying_names.rst
+++ b/docs/tutorial_classifying_names.rst
@@ -285,7 +285,7 @@ following contents::
             max_source_positions=self.args.max_positions,
             max_target_positions=1,
             # Since our target is a single class label, there's no need for
-            # input feeding. If we set this to ``True`` then our Model's
+            # teacher forcing. If we set this to ``True`` then our Model's
             # ``forward()`` method would receive an additional argument called
             # *prev_output_tokens* that would contain a shifted version of the
             # target sequence.
diff --git a/docs/tutorial_simple_lstm.rst b/docs/tutorial_simple_lstm.rst
index 57a254962..30bdc7213 100644
--- a/docs/tutorial_simple_lstm.rst
+++ b/docs/tutorial_simple_lstm.rst
@@ -125,9 +125,9 @@ Decoder
 
 Our Decoder will predict the next word, conditioned on the Encoder's final
 hidden state and an embedded representation of the previous target word -- which
-is sometimes called *input feeding* or *teacher forcing*. More specifically,
-we'll use a :class:`torch.nn.LSTM` to produce a sequence of hidden states that
-we'll project to the size of the output vocabulary to predict each target word.
+is sometimes called *teacher forcing*. More specifically, we'll use a
+:class:`torch.nn.LSTM` to produce a sequence of hidden states that we'll project
+to the size of the output vocabulary to predict each target word.
 
 ::
 
@@ -171,7 +171,7 @@ we'll project to the size of the output vocabulary to predict each target word.
         """
         Args:
             prev_output_tokens (LongTensor): previous decoder outputs of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
             encoder_out (Tensor, optional): output from the encoder, used for
                 encoder-side attention
 
@@ -387,8 +387,8 @@ previous hidden states. In fairseq this is called :ref:`Incremental decoding`.
 
 Incremental decoding is a special mode at inference time where the Model only
 receives a single timestep
-of input corresponding to the immediately previous output token (for input
-feeding) and must produce the next output incrementally. Thus the model must
+of input corresponding to the immediately previous output token (for teacher
+forcing) and must produce the next output incrementally. Thus the model must
 cache any long-term state that is needed about the sequence, e.g., hidden
 states, convolutional states, etc.
 
diff --git a/fairseq/data/language_pair_dataset.py b/fairseq/data/language_pair_dataset.py
index 351ba1e8f..64a5e4c7e 100644
--- a/fairseq/data/language_pair_dataset.py
+++ b/fairseq/data/language_pair_dataset.py
@@ -88,8 +88,7 @@ class LanguagePairDataset(FairseqDataset):
         shuffle (bool, optional): shuffle dataset elements before batching
             (default: True).
         input_feeding (bool, optional): create a shifted version of the targets
-            to be passed into the model for input feeding/teacher forcing
-            (default: True).
+            to be passed into the model for teacher forcing (default: True).
         remove_eos_from_source (bool, optional): if set, removes eos from end
             of source if it's present (default: False).
         append_eos_to_target (bool, optional): if set, appends eos to end of
@@ -167,10 +166,10 @@ class LanguagePairDataset(FairseqDataset):
                   - `src_lengths` (LongTensor): 1D Tensor of the unpadded
                     lengths of each source sentence of shape `(bsz)`
                   - `prev_output_tokens` (LongTensor): a padded 2D Tensor of
-                    tokens in the target sentence, shifted right by one position
-                    for input feeding/teacher forcing, of shape `(bsz,
-                    tgt_len)`. This key will not be present if *input_feeding*
-                    is ``False``. Padding will appear on the left if
+                    tokens in the target sentence, shifted right by one
+                    position for teacher forcing, of shape `(bsz, tgt_len)`.
+                    This key will not be present if *input_feeding* is
+                    ``False``. Padding will appear on the left if
                     *left_pad_target* is ``True``.
 
                 - `target` (LongTensor): a padded 2D Tensor of tokens in the
diff --git a/fairseq/models/fairseq_decoder.py b/fairseq/models/fairseq_decoder.py
index 732e66a09..2e5398e36 100644
--- a/fairseq/models/fairseq_decoder.py
+++ b/fairseq/models/fairseq_decoder.py
@@ -22,7 +22,7 @@ class FairseqDecoder(nn.Module):
         """
         Args:
             prev_output_tokens (LongTensor): shifted output tokens of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
             encoder_out (dict, optional): output from the encoder, used for
                 encoder-side attention
 
diff --git a/fairseq/models/fairseq_incremental_decoder.py b/fairseq/models/fairseq_incremental_decoder.py
index ede1b5173..1c4121557 100644
--- a/fairseq/models/fairseq_incremental_decoder.py
+++ b/fairseq/models/fairseq_incremental_decoder.py
@@ -13,7 +13,7 @@ class FairseqIncrementalDecoder(FairseqDecoder):
 
     Incremental decoding is a special mode at inference time where the Model
     only receives a single timestep of input corresponding to the previous
-    output token (for input feeding) and must produce the next output
+    output token (for teacher forcing) and must produce the next output
     *incrementally*. Thus the model must cache any long-term state that is
     needed about the sequence, e.g., hidden states, convolutional states, etc.
 
@@ -37,7 +37,7 @@ class FairseqIncrementalDecoder(FairseqDecoder):
         """
         Args:
             prev_output_tokens (LongTensor): shifted output tokens of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
             encoder_out (dict, optional): output from the encoder, used for
                 encoder-side attention
             incremental_state (dict, optional): dictionary used for storing
diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py
index 78b130861..f8bd5ba60 100644
--- a/fairseq/models/fairseq_model.py
+++ b/fairseq/models/fairseq_model.py
@@ -202,8 +202,8 @@ class FairseqEncoderDecoderModel(BaseFairseqModel):
         Run the forward pass for an encoder-decoder model.
 
         First feed a batch of source tokens through the encoder. Then, feed the
-        encoder output and previous decoder outputs (i.e., input feeding/teacher
-        forcing) to the decoder to produce the next outputs::
+        encoder output and previous decoder outputs (i.e., teacher forcing) to
+        the decoder to produce the next outputs::
 
             encoder_out = self.encoder(src_tokens, src_lengths)
             return self.decoder(prev_output_tokens, encoder_out)
@@ -213,7 +213,7 @@ class FairseqEncoderDecoderModel(BaseFairseqModel):
                 `(batch, src_len)`
            src_lengths (LongTensor): source sentence lengths of shape `(batch)`
            prev_output_tokens (LongTensor): previous decoder outputs of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
 
        Returns:
            tuple:
diff --git a/fairseq/models/lightconv.py b/fairseq/models/lightconv.py
index 5b38ac5e7..0dc71a1f7 100644
--- a/fairseq/models/lightconv.py
+++ b/fairseq/models/lightconv.py
@@ -345,7 +345,7 @@ class LightConvDecoder(FairseqIncrementalDecoder):
         """
         Args:
             prev_output_tokens (LongTensor): previous decoder outputs of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
             encoder_out (Tensor, optional): output from the encoder, used for
                 encoder-side attention
             incremental_state (dict): dictionary used for storing state during
diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py
index 59e14a4e7..591a48606 100644
--- a/fairseq/models/transformer.py
+++ b/fairseq/models/transformer.py
@@ -370,7 +370,7 @@ class TransformerDecoder(FairseqIncrementalDecoder):
         """
         Args:
             prev_output_tokens (LongTensor): previous decoder outputs of shape
-                `(batch, tgt_len)`, for input feeding/teacher forcing
+                `(batch, tgt_len)`, for teacher forcing
             encoder_out (Tensor, optional): output from the encoder, used for
                 encoder-side attention
             incremental_state (dict): dictionary used for storing state during
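
The docstrings renamed above all describe the same convention: during training the decoder does not consume its own predictions, but a copy of the target shifted right by one position (*teacher forcing*), passed in as *prev_output_tokens*. Below is a minimal, self-contained sketch of that convention; it is illustrative only and not part of this patch -- the toy pad/eos indices, the ``shift_right`` helper, and the ``TinyDecoder`` module are assumptions made for the example, not fairseq APIs::

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    PAD, EOS, VOCAB = 1, 2, 16  # toy indices/vocab size, for illustration only


    def shift_right(target: torch.Tensor, eos_idx: int = EOS) -> torch.Tensor:
        """Shift one unpadded target right by one position: eos moves to the
        front and the final token is dropped, so position t holds target[t-1]."""
        return torch.cat([target.new_tensor([eos_idx]), target[:-1]])


    class TinyDecoder(nn.Module):
        """Minimal stand-in for a decoder: embeds prev_output_tokens and
        predicts a distribution over the vocabulary at every target position."""

        def __init__(self, vocab_size: int, hidden: int = 32):
            super().__init__()
            self.embed = nn.Embedding(vocab_size, hidden, padding_idx=PAD)
            self.lstm = nn.LSTM(hidden, hidden, batch_first=True)
            self.proj = nn.Linear(hidden, vocab_size)

        def forward(self, prev_output_tokens):
            x = self.embed(prev_output_tokens)  # (batch, tgt_len, hidden)
            x, _ = self.lstm(x)                 # (batch, tgt_len, hidden)
            return self.proj(x)                 # (batch, tgt_len, vocab)


    # One target sentence: content tokens followed by eos.
    target = torch.tensor([[5, 9, 12, 7, EOS]])

    # Teacher forcing: the decoder input at step t is the gold token from t-1.
    prev_output_tokens = shift_right(target[0]).unsqueeze(0)  # [[EOS, 5, 9, 12, 7]]

    decoder = TinyDecoder(VOCAB)
    logits = decoder(prev_output_tokens)  # (1, 5, VOCAB)
    loss = F.cross_entropy(logits.view(-1, VOCAB), target.view(-1), ignore_index=PAD)
    print(prev_output_tokens.tolist(), float(loss))

At inference time there is no gold target to shift, which is why the incremental decoders documented in this patch instead receive only the single previously emitted token at each step and must cache their internal state between calls.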