fairseq/tests/test_concat_dataset.py

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import unittest

import torch

from fairseq.data import LanguagePairDataset, TokenBlockDataset
from fairseq.data.concat_dataset import ConcatDataset
from tests.test_train import mock_dict


class TestConcatDataset(unittest.TestCase):
    def setUp(self):
        d = mock_dict()
        # Build two single-sentence language-pair datasets whose only source
        # tokens are 1 and 2 respectively, so concatenation order is easy to check.
        tokens_1 = torch.LongTensor([1]).view(1, -1)
        tokens_ds1 = TokenBlockDataset(
            tokens_1,
            sizes=[tokens_1.size(-1)],
            block_size=1,
            pad=0,
            eos=1,
            include_targets=False,
        )
        self.dataset_1 = LanguagePairDataset(
            tokens_ds1, tokens_ds1.sizes, d, shuffle=False
        )
        tokens_2 = torch.LongTensor([2]).view(1, -1)
        tokens_ds2 = TokenBlockDataset(
            tokens_2,
            sizes=[tokens_2.size(-1)],
            block_size=1,
            pad=0,
            eos=1,
            include_targets=False,
        )
        self.dataset_2 = LanguagePairDataset(
            tokens_ds2, tokens_ds2.sizes, d, shuffle=False
        )

    def test_concat_dataset_basics(self):
        # Plain concatenation: one item from each dataset, in order.
        d = ConcatDataset([self.dataset_1, self.dataset_2])
        assert len(d) == 2
        assert d[0]["source"][0] == 1
        assert d[1]["source"][0] == 2

        # sample_ratios=[1, 2] up-samples the second dataset twice.
        d = ConcatDataset([self.dataset_1, self.dataset_2], sample_ratios=[1, 2])
        assert len(d) == 3
        assert d[0]["source"][0] == 1
        assert d[1]["source"][0] == 2
        assert d[2]["source"][0] == 2

        # sample_ratios=[2, 1] up-samples the first dataset twice.
        d = ConcatDataset([self.dataset_1, self.dataset_2], sample_ratios=[2, 1])
        assert len(d) == 3
        assert d[0]["source"][0] == 1
        assert d[1]["source"][0] == 1
        assert d[2]["source"][0] == 2
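

# Assumed standard unittest entry point (not part of the excerpt above) so the
# file can also be run directly with `python tests/test_concat_dataset.py`
# in addition to being collected by pytest.
if __name__ == "__main__":
    unittest.main()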