fix flake8 issues (#2570)

Summary:
# Before submitting

- [ ] Was this discussed/approved via a GitHub issue? (not needed for typos or doc improvements)
- [ ] Did you read the [contributor guideline](https://github.com/pytorch/fairseq/blob/main/CONTRIBUTING.md)?
- [ ] Did you make sure to update the docs?
- [ ] Did you write any new necessary tests?

## What does this PR do?
- [x] applies flake8 fixes to the main branch (https://github.com/fairinternal/fairseq-py/issues/2546); more remain to be fixed (an illustrative sketch of the patterns involved follows this item)
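
A small, purely illustrative sketch of the flake8 findings cleaned up in this PR (none of this is fairseq code): E741 ambiguous single-letter names such as `l`, F401 unused imports, and F403 star imports; all three patterns appear in the hunks below.

```python
# Purely illustrative sketch (not fairseq code) of the flake8 patterns fixed in this PR.

# F401: drop imports that are never used (e.g. a stray `import copy`).
import logging

# F403: import names explicitly rather than `from module import *`
# (mirrors the tests/test_constraints.py change below).
from collections import OrderedDict

# E741: avoid the ambiguous variable name `l`
# (mirrors the `def decode(l)` -> `def decode(input)` changes below).
def decode(pieces):
    return "".join(str(p) for p in pieces)


logger = logging.getLogger(__name__)
logger.info("decoded: %s", decode(["a", "b", "c"]))
print(OrderedDict(example=1))
```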

Fix GPU tests:
- [x] fall back to `torch.quantization` when the `torch.ao.quantization` import fails (a sketch of the pattern follows this list)
- [x] build apex from an earlier commit in CircleCI so that it is compatible with PyTorch 1.8 and 1.9
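
The `torch.ao.quantization` fix is just a guarded import; a minimal sketch of the pattern is below (the full versions, including the qconfig imports, appear in the hunks later in this diff):

```python
# Minimal sketch of the guarded import used in this PR: prefer the newer
# torch.ao.quantization namespace (PyTorch >= 1.10) and fall back to the
# older torch.quantization namespace on PyTorch 1.8/1.9.
try:
    import torch.ao.quantization as quantization
except ImportError:
    import torch.quantization as quantization

# Call sites then go through the alias, for example:
#   quantization.quantize_dynamic(module, {torch.nn.Linear: qconfig},
#                                 dtype=torch.qint8, inplace=True)
print("using", quantization.__name__)
```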

## PR review
Anyone in the community is free to review the PR once the tests have passed.
If we didn't discuss your PR in GitHub issues, there's a high chance it will not be merged.

## Did you have fun?
Make sure you had fun coding 🙃

Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/2570

Reviewed By: Mortimerp9

Differential Revision: D32955312

Pulled By: dianaml0

fbshipit-source-id: e163cbd4998f171f819e31b0682c1c0f1986f9e1
dianaml0 authored on 2021-12-09 02:33:35 -08:00, committed by Facebook GitHub Bot
parent c620ed066f
commit 88e7d2586b
22 changed files with 73 additions and 61 deletions

View File

@@ -10,7 +10,7 @@ gpu: &gpu
machine:
image: ubuntu-1604-cuda-11.1:202012-01
resource_class: gpu.nvidia.medium.multi
# -------------------------------------------------------------------------------------
# Re-usable commands
@@ -25,7 +25,7 @@ install_dep_common: &install_dep_common
pip install --upgrade setuptools
pip install bitarray boto3 deepspeed editdistance fastBPE iopath ipdb ipython pyarrow pytest sacremoses sentencepiece subword-nmt hydra-core==1.0.7 omegaconf==2.0.6
pip install --progress-bar off pytest
pip install --progress-bar off fairscale==0.4.1
pip install --progress-bar off fairscale
pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U
python -c 'import torch; print("Torch version:", torch.__version__)'
python -m torch.utils.collect_env
@@ -38,6 +38,7 @@ install_dep_fused_ops: &install_dep_fused_ops
source activate fairseq
git clone https://github.com/NVIDIA/apex
cd apex
git checkout e2083df5eb96643c61613b9df48dd4eea6b07690
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
cd ~/
git clone --depth=1 --branch v2.4 https://github.com/NVIDIA/Megatron-LM.git

View File

@@ -17,7 +17,7 @@ repos:
- id: end-of-file-fixer
- repo: https://github.com/ambv/black
rev: 20.8b1
rev: 21.12b0
hooks:
- id: black
language_version: python3.8

View File

@@ -8,7 +8,6 @@ import time
from collections import OrderedDict
from typing import Dict, Optional
try:
import torch
@@ -18,7 +17,6 @@ try:
else:
return a
except ImportError:
torch = None

View File

@@ -14,23 +14,30 @@ from typing import List, Optional, Tuple
import torch
import torch.nn as nn
from fairseq.models import (
FairseqEncoder,
)
from torch import Tensor
from torch import device as Device
from fairseq.models import FairseqEncoder
from fairseq.models.speech_to_text.utils import (
NoOp,
attention_suppression,
layer_norm_backward_hook,
lengths_to_padding_mask,
segments_to_sequence,
)
from fairseq.models.speech_to_text.utils import (
attention_suppression,
layer_norm_backward_hook,
)
from torch import Tensor, device as Device
from torch.ao.quantization.qconfig import (
default_dynamic_qconfig,
per_channel_dynamic_qconfig,
)
try:
import torch.ao.quantization as quantization
from torch.ao.quantization.qconfig import (
default_dynamic_qconfig,
per_channel_dynamic_qconfig,
)
except ImportError:
import torch.quantization as quantization
from torch.quantization.qconfig import (
default_dynamic_qconfig,
per_channel_dynamic_qconfig,
)
class RelativePositionEmbedding(nn.Module):
@@ -140,7 +147,7 @@ class PositionwiseFF(nn.Module):
qconfig = per_channel_dynamic_qconfig
else:
qconfig = default_dynamic_qconfig
torch.ao.quantization.quantize_dynamic(
quantization.quantize_dynamic(
self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
)
return self
@@ -728,7 +735,7 @@ class NoSegAugmentedMemoryMultiheadAttentionBmm(nn.Module):
qconfig = per_channel_dynamic_qconfig
else:
qconfig = default_dynamic_qconfig
torch.ao.quantization.quantize_dynamic(
quantization.quantize_dynamic(
self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
)
return self
@@ -1771,7 +1778,7 @@ class NoSegAugmentedMemoryTransformerEncoderLayer(FairseqEncoder):
qconfig = per_channel_dynamic_qconfig
else:
qconfig = default_dynamic_qconfig
torch.ao.quantization.quantize_dynamic(
quantization.quantize_dynamic(
self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
)
return self

View File

@@ -8,7 +8,6 @@ import logging
import torch
import torch.nn.functional as F
logger = logging.getLogger(__name__)
@@ -54,7 +53,6 @@ try:
else:
raise NotImplementedError
except ImportError:
def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):

View File

@@ -7,7 +7,6 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
try:
from apex.normalization import FusedLayerNorm as _FusedLayerNorm
@@ -22,7 +21,6 @@ try:
with torch.cuda.device(x.device):
return super().forward(x)
except ImportError:
has_fused_layernorm = False

View File

@@ -5,6 +5,11 @@
import torch
try:
import torch.ao.quantization as quantization
except ImportError:
import torch.quantization as quantization
def emulate_int(w, bits, method, scale=None, zero_point=None):
q = globals()[f"emulate_int8_{method}"]
@@ -21,7 +26,7 @@ def quantize(w, scale, zero_point, bits=8):
def emulate_int8_histogram(w, scale=None, zero_point=None, bits=8):
if scale is None:
obs = torch.ao.quantization.observer.HistogramObserver()
obs = quantization.observer.HistogramObserver()
obs.to(device=w.device)
_ = obs(w.float())
scale, zero_point = obs.calculate_qparams()
@@ -32,7 +37,7 @@ def emulate_int8_histogram(w, scale=None, zero_point=None, bits=8):
def emulate_int8_channel(w, scale=None, zero_point=None, bits=8):
if scale is None:
obs = torch.ao.quantization.observer.PerChannelMinMaxObserver(
obs = quantization.observer.PerChannelMinMaxObserver(
ch_axis=-1, qscheme=torch.per_channel_symmetric
)
obs.to(device=w.device)
@@ -45,7 +50,7 @@ def emulate_int8_channel(w, scale=None, zero_point=None, bits=8):
def emulate_int8_tensor(w, scale=None, zero_point=None, bits=8):
if scale is None:
obs = torch.ao.quantization.observer.MinMaxObserver()
obs = quantization.observer.MinMaxObserver()
obs.to(device=w.device)
_ = obs(w)
scale, zero_point = obs.calculate_qparams()

View File

@@ -27,8 +27,8 @@ def get_fused_adam_class():
except ImportError:
try:
# fallback to the newer interface
from apex.optimizers import FusedAdam as _FusedAdam # noqa
from apex.multi_tensor_apply import multi_tensor_applier
from apex.optimizers import FusedAdam as _FusedAdam # noqa
if multi_tensor_applier.available:
return FusedAdamV2
@@ -252,8 +252,8 @@ class FusedAdamV1(torch.optim.Optimizer):
try:
from apex.optimizers import FusedAdam
from apex.multi_tensor_apply import multi_tensor_applier
from apex.optimizers import FusedAdam
class FusedAdamV2(FusedAdam):
"""
@@ -382,6 +382,5 @@ try:
return loss
except ImportError:
pass

View File

@@ -108,16 +108,18 @@ def main():
help='Write the new checkpoint containing the averaged weights to this path.')
num_group = parser.add_mutually_exclusive_group()
num_group.add_argument('--num-epoch-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
'and average last this many of them.')
help='if set, will try to find checkpoints with names checkpoint_xx.pt in the '
'path specified by input, and average last this many of them.')
num_group.add_argument('--num-update-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
'and average last this many of them.')
help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by'
' input, and average last this many of them.')
parser.add_argument('--checkpoint-upper-bound', type=int,
help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, '
'when using --num-update-checkpoints, this will set an upper bound on which update to use'
'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.'
'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500'
'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be'
' averaged.'
'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would'
' be averaged assuming --save-interval-updates 500'
)
# fmt: on
args = parser.parse_args()

View File

@@ -11,8 +11,6 @@ import argparse
import random
import sys
from sacrebleu import extract_ngrams
def get_phrase(words, index, length):
assert index < len(words) - length + 1

View File

@@ -26,13 +26,13 @@ def main():
if args.input_format == "piece":
def decode(l):
return "".join(sp.DecodePieces(l))
def decode(input):
return "".join(sp.DecodePieces(input))
elif args.input_format == "id":
def decode(l):
return "".join(sp.DecodeIds(l))
def decode(input):
return "".join(sp.DecodeIds(input))
else:
raise NotImplementedError

View File

@@ -49,13 +49,13 @@ def main():
if args.output_format == "piece":
def encode(l):
return sp.EncodeAsPieces(l)
def encode(input):
return sp.EncodeAsPieces(input)
elif args.output_format == "id":
def encode(l):
return list(map(str, sp.EncodeAsIds(l)))
def encode(input):
return list(map(str, sp.EncodeAsIds(input)))
else:
raise NotImplementedError

View File

@@ -1,3 +1,4 @@
[flake8]
max-line-length = 127
extend-ignore = E203, W503
extend-exclude = fairseq/model_parallel/megatron

View File

@@ -140,7 +140,6 @@ def setup_args():
@unittest.skipIf(torch.cuda.device_count() < 2, "test requires 2 GPUs")
class TestBMUF(unittest.TestCase):
def bmuf_process(self, cfg, args, iterations):
processes = []
results = Manager().dict()
torch.multiprocessing.spawn(
fn=functools.partial(single_gpu_training, cfg, args),

View File

@@ -399,6 +399,9 @@ def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=Fa
train.main(quantize_args)
@unittest.skipIf(
int(torch.__version__[2]) < 10, reason="quantized kernels are only supported on CPU"
)
@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestQuantization(unittest.TestCase):
def setUp(self):

View File

@@ -3,11 +3,17 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
import unittest
from typing import List
import torch
from fairseq.token_generation_constraints import *
from fairseq.token_generation_constraints import (
ConstraintNode,
OrderedConstraintState,
UnorderedConstraintState,
pack_constraints,
)
def tensorize(constraints: List[List[int]]) -> torch.Tensor:
@@ -53,7 +59,7 @@ class TestUnorderedConstraintState(unittest.TestCase):
self.examples = [
(
tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]),
"([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))",
"([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))", # noqa
{1: 4, 2: 1, 3: 2, 4: 3, 5: 2, 6: 1, 7: 1},
),
([], "[None].False#0", {}),

View File

@@ -49,7 +49,7 @@ class TestFileIO(unittest.TestCase):
def test_file_io_async(self):
# ioPath `PathManager` is initialized after the first `opena` call.
try:
from fairseq.file_io import IOPathManager, PathManager
from fairseq.file_io import PathManager
_asyncfile = os.path.join(self._tmpdir, "async.txt")
f = PathManager.opena(_asyncfile, "wb")

View File

@@ -3,7 +3,6 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import copy
import logging
import unittest

View File

@@ -79,7 +79,7 @@ class TestMultiCorpusSampledDataset(unittest.TestCase):
def test_multi_corpus_sampled_dataset_weighted_sample(self):
def naive_weighted_sample(weights):
def f(l):
def f(input):
v = np.random.random()
agg = 0
for i, weight in enumerate(weights):

View File

@@ -3,12 +3,10 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import contextlib
import json
import os
import tempfile
import unittest
from io import StringIO
import torch

View File

@@ -292,18 +292,18 @@ class RobertaTest(unittest.TestCase):
# Decode with incremental state
inc_state = {}
ro_dec_inc = []
for l in range(tgt_len):
for i in range(tgt_len):
ro, _ = model.decoder.forward(
ro_tokens[:, : l + 1], encoder_out=en_enc, incremental_state=inc_state
ro_tokens[:, : i + 1], encoder_out=en_enc, incremental_state=inc_state
)
self.assertEqual(ro.shape, (bs, 1, VOCAB_SIZE))
ro_dec_inc.append(ro)
for l in range(tgt_len):
for i in range(tgt_len):
# Intra-batch
self.assertTensorEqual(ro_dec_inc[l][0], ro_dec_inc[l][1])
self.assertTensorEqual(ro_dec_inc[i][0], ro_dec_inc[i][1])
# Incremental vs non-incremental
self.assertTensorEqual(ro_dec_inc[l][:, 0], ro_dec[:, l])
self.assertTensorEqual(ro_dec_inc[i][:, 0], ro_dec[:, i])
def params(model, name):

View File

@@ -320,7 +320,7 @@ class TestSequenceGenerator(TestSequenceGeneratorBase):
sample = self.sample.copy()
sample["net_input"]["fancy_other_input"] = sample["net_input"]["src_tokens"]
hypos = generator.forward(self.sample)
eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2
eos, w1 = self.tgt_dict.eos(), self.w1
# sentence 1, beam 1
self.assertHypoTokens(hypos[0][0], [w1, eos])
self.assertHypoScore(hypos[0][0], [0.9, 1.0])