fairseq/tests/test_fp16_optimizer.py
alexeib b58f4f017e end to end hydra configs (#1393)
Summary:
this adds a hydra_train binary that uses hydra configs/command line overrides instead of argparse

use case 1: built in configs + overrides from command line

```
python fairseq_cli/hydra_train.py distributed_training.distributed_world_size=1 dataset.batch_size=2 task.data=/private/home/myleott/data/data-bin/wikitext-103-roberta-bpe-bin/ model=transformer_lm/transformer_lm_gpt task=language_modeling optimization.max_update=5000
```

use case 2: use an external config file instead of the bundled configs (dataclass defaults still apply)

```
python fairseq_cli/hydra_train.py --config-path ~/fairseq-py-dev/lm --config-name wiki103
```

the config file contains this:

```
# @package _group_

model:
  _name: transformer_lm
distributed_training:
  distributed_world_size: 1
dataset:
  batch_size: 2
task:
  _name: language_modeling
  data: /private/home/myleott/data/data-bin/wikitext-103-roberta-bpe-bin/
  add_bos_token: false
  max_target_positions: 1024
optimization:
  max_update: 50000
  lr: [ 0.25 ]
criterion: cross_entropy
optimizer: adam
lr_scheduler:
  _name: cosine
```

use case 3: use an external config directory that provides additional configs, e.g. for models

```
python fairseq_cli/hydra_train.py distributed_training.distributed_world_size=1 dataset.batch_size=2 task.data=/private/home/myleott/data/data-bin/wikitext-103-roberta-bpe-bin/ model=transformer_lm/2_layers task=language_modeling optimization.max_update=5000 --config-dir ~/fairseq-py-dev/lm/hydra
```

where ~/fairseq-py-dev/lm/hydra has the following structure:

```
model/
└── transformer_lm/
    └── 2_layers.yaml
```

and inside 2_layers.yaml is a copy of transformer_lm_gpt.yaml but with decoder_layers set to 2
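
for illustration, 2_layers.yaml could look roughly like this (a sketch only: the field names come from the transformer_lm model config, but the values other than decoder_layers are placeholders rather than a verbatim copy of the bundled transformer_lm_gpt.yaml):

```
# @package _group_
activation_fn: gelu
dropout: 0.1
attention_dropout: 0.1
decoder_embed_dim: 768
decoder_ffn_embed_dim: 3072
decoder_layers: 2  # the only intended change relative to transformer_lm_gpt.yaml
decoder_attention_heads: 12
```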

Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/1393

Reviewed By: myleott

Differential Revision: D24722252

Pulled By: alexeib

fbshipit-source-id: 758ea431fa099cd7c0e4daf41eff680df1d3b841
2020-11-04 18:20:12 -08:00


# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import copy
import logging
import unittest

import torch
from fairseq.optim.fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer
from omegaconf import OmegaConf


@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestGradientScaling(unittest.TestCase):
    def setUp(self):
        self.x = torch.tensor([2.0]).cuda().half()
        weight = 3.0
        bias = 5.0
        self.error = 1.0
        self.target = torch.tensor([self.x * weight + bias + self.error]).cuda().half()
        self.loss_fn = torch.nn.L1Loss()

        self.model = torch.nn.Linear(1, 1)
        self.model.weight.data = torch.tensor([[weight]])
        self.model.bias.data = torch.tensor([bias])
        self.model.cuda().half()
        self.params = list(self.model.parameters())

        self.cfg_dls = OmegaConf.create(
            {
                "optimization": {
                    "lr": [0.1],
                },
                "optimizer": {
                    "_name": "adam",
                    "lr": [0.1],
                    "adam_betas": "(0.9, 0.999)",
                    "adam_eps": 1e-8,
                    "weight_decay": 0.0,
                },
                "common": {
                    "fp16_init_scale": 1,
                    "fp16_scale_window": 1,
                    "fp16_scale_tolerance": 1,
                    "threshold_loss_scale": 1,
                    "min_loss_scale": 1e-4,
                    "tpu": False,
                },
            }
        )
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def run_iter(self, model, params, optimizer):
        optimizer.zero_grad()
        y = model(self.x)
        loss = self.loss_fn(y, self.target)
        optimizer.backward(loss)
        self.assertEqual(loss, torch.tensor(1.0, device="cuda:0", dtype=torch.float16))

        grad_norm = optimizer.clip_grad_norm(0)
        self.assertAlmostEqual(grad_norm.item(), 2.2361, 4)

        optimizer.step()
        self.assertEqual(
            model.weight,
            torch.tensor(
                [[3.0996]], device="cuda:0", dtype=torch.float16, requires_grad=True
            ),
        )
        self.assertEqual(
            model.bias,
            torch.tensor(
                [5.1016], device="cuda:0", dtype=torch.float16, requires_grad=True
            ),
        )
        self.assertEqual(optimizer.scaler.loss_scale, 2.0)

    def test_mixed_precision(self):
        model = copy.deepcopy(self.model)
        params = list(model.parameters())
        optimizer = FP16Optimizer.build_optimizer(self.cfg_dls, params)
        self.run_iter(model, params, optimizer)
        self.assertTrue(
            all(
                torch.all(
                    fp32_params.eq(
                        torch.tensor(
                            [3.1000, 5.1000], device="cuda:0", requires_grad=True
                        )
                    )
                )
                for fp32_params in optimizer.fp32_params.values()
            )
        )

    def test_memory_efficient(self):
        model = copy.deepcopy(self.model)
        params = list(model.parameters())
        optimizer = MemoryEfficientFP16Optimizer.build_optimizer(self.cfg_dls, params)
        self.run_iter(model, params, optimizer)


if __name__ == "__main__":
    unittest.main()