fairseq/tests/test_backtranslation_dataset.py

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import unittest

import torch

from fairseq.data import (
    BacktranslationDataset,
    LanguagePairDataset,
    TransformEosDataset,
)
from fairseq.sequence_generator import SequenceGenerator

import tests.utils as test_utils


class TestBacktranslationDataset(unittest.TestCase):

    def setUp(self):
        self.tgt_dict, self.w1, self.w2, self.src_tokens, self.src_lengths, self.model = (
            test_utils.sequence_generator_setup()
        )
        # Reuse the generated source tokens as monolingual target-side samples.
        dummy_src_samples = self.src_tokens

        self.tgt_dataset = test_utils.TestDataset(data=dummy_src_samples)
        self.cuda = torch.cuda.is_available()
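
    # Shared helper: builds a BacktranslationDataset with the requested EOS
    # handling on the input and output source, then checks the collated
    # source/target tensors against expected values.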
    def _backtranslation_dataset_helper(
        self, remove_eos_from_input_src, remove_eos_from_output_src,
    ):
        tgt_dataset = LanguagePairDataset(
            src=self.tgt_dataset,
            src_sizes=self.tgt_dataset.sizes,
            src_dict=self.tgt_dict,
            tgt=None,
            tgt_sizes=None,
            tgt_dict=None,
        )

        generator = SequenceGenerator(
            models=[self.model],
            tgt_dict=self.tgt_dict,
            beam_size=2,
            unk_penalty=0,
            sampling=False,
        )
        if self.cuda:
            generator.cuda()
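
        # The dataset backtranslates each target sample with the generator and
        # re-applies the requested EOS handling via output_collater.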
        backtranslation_dataset = BacktranslationDataset(
            tgt_dataset=TransformEosDataset(
                dataset=tgt_dataset,
                eos=self.tgt_dict.eos(),
                # remove eos from the input src
                remove_eos_from_src=remove_eos_from_input_src,
            ),
            backtranslation_fn=generator.generate,
            max_len_a=0,
            max_len_b=200,
            output_collater=TransformEosDataset(
                dataset=tgt_dataset,
                eos=self.tgt_dict.eos(),
                # if we remove eos from the input src, then we need to add it
                # back to the output tgt
                append_eos_to_tgt=remove_eos_from_input_src,
                remove_eos_from_src=remove_eos_from_output_src,
            ).collater,
            cuda=self.cuda,
        )
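
        # Collate one batch through the dataset's own collater, which is where
        # the backtranslation is actually run.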
        dataloader = torch.utils.data.DataLoader(
            backtranslation_dataset,
            batch_size=2,
            collate_fn=backtranslation_dataset.collater,
        )
        backtranslation_batch_result = next(iter(dataloader))

        eos, pad, w1, w2 = self.tgt_dict.eos(), self.tgt_dict.pad(), self.w1, self.w2

        # Note that we sort by src_lengths and add left padding, so actually
        # ids will look like: [1, 0]
        expected_src = torch.LongTensor([[w1, w2, w1, eos], [pad, pad, w1, eos]])
        if remove_eos_from_output_src:
            expected_src = expected_src[:, :-1]
        expected_tgt = torch.LongTensor([[w1, w2, eos], [w1, w2, eos]])
        generated_src = backtranslation_batch_result["net_input"]["src_tokens"]
        tgt_tokens = backtranslation_batch_result["target"]

        self.assertTensorEqual(expected_src, generated_src)
        self.assertTensorEqual(expected_tgt, tgt_tokens)
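
    # Each test exercises one combination of EOS handling for the input and
    # output source sequences.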
    def test_backtranslation_dataset_no_eos_in_output_src(self):
        self._backtranslation_dataset_helper(
            remove_eos_from_input_src=False, remove_eos_from_output_src=True,
        )

    def test_backtranslation_dataset_with_eos_in_output_src(self):
        self._backtranslation_dataset_helper(
            remove_eos_from_input_src=False, remove_eos_from_output_src=False,
        )

    def test_backtranslation_dataset_no_eos_in_input_src(self):
        self._backtranslation_dataset_helper(
            remove_eos_from_input_src=True, remove_eos_from_output_src=False,
        )

    def assertTensorEqual(self, t1, t2):
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        self.assertEqual(t1.ne(t2).long().sum(), 0)


if __name__ == "__main__":
    unittest.main()