fairseq/tests/test_sparse_multihead_attention.py

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import unittest

import torch

from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention


class TestSparseMultiheadAttention(unittest.TestCase):
    def test_sparse_multihead_attention(self):
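        # Dummy attention weights; buffered_sparse_mask takes them as the
        # reference tensor when building the mask (presumably to match
        # dtype/device), so their values do not matter here.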
        attn_weights = torch.randn(1, 8, 8)
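        # Expected mask for the bidirectional fixed sparse pattern with
        # stride=4 and expressivity=1: 0 keeps a position, -inf masks it.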
        bidirectional_sparse_mask = torch.tensor([
            [0, 0, 0, 0, 0, float('-inf'), float('-inf'), 0],
            [0, 0, 0, 0, 0, float('-inf'), float('-inf'), 0],
            [0, 0, 0, 0, 0, float('-inf'), float('-inf'), 0],
            [0, 0, 0, 0, 0, float('-inf'), float('-inf'), 0],
            [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, 0],
            [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, 0],
            [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, 0],
            [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, 0],
        ])
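        # embed_dim=16, num_heads=1; stride and expressivity define the fixed
        # sparse pattern from "Generating Long Sequences with Sparse
        # Transformers" (Child et al., 2019).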
        bidirectional_attention = SparseMultiheadAttention(16, 1, stride=4, expressivity=1, is_bidirectional=True)
        bidirectional_attention_sparse_mask = bidirectional_attention.buffered_sparse_mask(attn_weights, 8, 8)
        # Assert the comparison result; the original call discarded it, so the
        # test could never fail.
        self.assertTrue(
            torch.all(torch.eq(bidirectional_attention_sparse_mask, bidirectional_sparse_mask))
        )
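        # Expected mask for the unidirectional (causal) variant: the same
        # sparse pattern, but no query may attend to a future position, so the
        # upper triangle is -inf.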
        sparse_mask = torch.tensor([
            [0, float('-inf'), float('-inf'), float('-inf'), float('-inf'), float('-inf'),
             float('-inf'), float('-inf')],
            [0, 0, float('-inf'), float('-inf'), float('-inf'), float('-inf'), float('-inf'), float('-inf')],
            [0, 0, 0, float('-inf'), float('-inf'), float('-inf'), float('-inf'), float('-inf')],
            [0, 0, 0, 0, float('-inf'), float('-inf'), float('-inf'), float('-inf')],
            [0, 0, 0, 0, 0, float('-inf'), float('-inf'), float('-inf')],
            [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, float('-inf'), float('-inf')],
            [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, float('-inf')],
            [float('-inf'), float('-inf'), float('-inf'), 0, 0, 0, 0, 0],
        ])
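        # Same configuration as above, but causal (is_bidirectional=False).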
        attention = SparseMultiheadAttention(16, 1, stride=4, expressivity=1, is_bidirectional=False)
        attention_sparse_mask = attention.buffered_sparse_mask(attn_weights, 8, 8)
        self.assertTrue(torch.all(torch.eq(attention_sparse_mask, sparse_mask)))


if __name__ == '__main__':
    unittest.main()