Fix BMUF using 1 GPU

Summary:
With 1 GPU, BMUF synchronization is no longer required; training simply proceeds like ordinary single-GPU model training.

Also add a unit test for single-GPU BMUF.
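
For context, a minimal sketch of the guard this change introduces. This is a simplified stand-in, not the full FairseqBMUF class; the class name BMUFSketch and the elided method bodies are placeholders, while the world_size attribute and the early returns mirror the diff below.

    # Sketch only: with a single worker there is nothing to synchronize, so both
    # sync paths return early and training behaves like plain single-GPU training.
    class BMUFSketch:
        def __init__(self, args, optimizer):
            self._optimizer = optimizer
            self.world_size = args.distributed_world_size  # 1 => no sync needed

        def _warmup_sync(self, root_rank=0):
            if self.world_size <= 1:  # single GPU: skip the parameter broadcast
                return
            ...  # broadcast parameters from root_rank to every worker

        def _block_sync(self):
            if self.world_size <= 1:  # single GPU: skip the block momentum update
                return
            ...  # all-reduce local deltas and apply block momentum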

Reviewed By: jay-mahadeokar

Differential Revision: D21033060

fbshipit-source-id: 9030187c05d49548222c8d1e2fe9534a6c6c4389
Nayan Singhal 2020-04-16 11:22:59 -07:00 committed by Facebook GitHub Bot
parent f57ac6ef3f
commit 89e75fa315
2 changed files with 30 additions and 8 deletions

@@ -32,6 +32,7 @@ class FairseqBMUF(FairseqOptimizer):
         self.use_nbm = self.args.use_nbm
         self.initial_state = self._optimizer.state_dict()
         self.average_sync = self.args.average_sync
+        self.world_size = self.args.distributed_world_size
 
     @staticmethod
     def add_args(parser):
@@ -103,6 +104,8 @@ class FairseqBMUF(FairseqOptimizer):
         self._optimizer.average_params()
 
     def _block_sync(self):
+        if self.world_size <= 1:
+            return
         # Update the global model using local models from all GPUs
         # (Step-1) Calculate grad between previously synced model and
         # current local model
@@ -135,6 +138,8 @@ class FairseqBMUF(FairseqOptimizer):
         return False
 
     def _warmup_sync(self, root_rank=0):
+        if self.world_size <= 1:
+            return
         # Broadcast the local model to all gpus
         for param in self.params:
             dist.broadcast(param.data, src=root_rank)

@@ -29,7 +29,8 @@ def setup_model_loss_criterion(args, rank, is_cuda):
     setup model, criterion and optimizer based on input args
     """
     args.distributed_rank = rank
-    distributed_utils.distributed_init(args)
+    if args.distributed_world_size > 1:
+        distributed_utils.distributed_init(args)
     torch.manual_seed(1)
     model = Model(args.input_size, args.nb_classes)
     loss_fn = nn.CrossEntropyLoss()
@@ -94,6 +95,7 @@ def setup_args():
     args.use_nbm = True
     args.average_sync = True
     args.global_sync_iter = 1
+    args.model_parallel_size = 1
     args.distributed_backend = "gloo"
     args.distributed_world_size = 2
@@ -120,23 +122,26 @@ class TestBMUF(unittest.TestCase):
         for p in processes:
             p.join()
-        # Make sure params in both machines are same
-        assert len(results) == 2
-        self.assertAlmostEqual(results[0], results[1])
+        return results
 
     def test_bmuf_sync(self):
         # Train model for 1 iteration and do bmuf sync without doing warmup
         args = setup_args()
         iterations = 1
-        self.bmuf_process(args, iterations)
+        results = self.bmuf_process(args, iterations)
+        # Make sure params in both machines are same
+        assert len(results) == 2
+        self.assertAlmostEqual(results[0], results[1])
 
     def test_warmup_sync(self):
         # Train model for 20 iterations and do warmup sync without doing bmuf sync
         args = setup_args()
         args.warmup_iterations = 20
         iterations = 20
-        self.bmuf_process(args, iterations)
+        results = self.bmuf_process(args, iterations)
+        # Make sure params in both machines are same
+        assert len(results) == 2
+        self.assertAlmostEqual(results[0], results[1])
 
     def test_warmup_sync_bmuf_sync(self):
         # Train model for 25 iterations and do warmup sync after 20 iterations
@@ -145,7 +150,19 @@ class TestBMUF(unittest.TestCase):
         args.warmup_iterations = 20
         args.global_sync_iter = 5
         iterations = 25
-        self.bmuf_process(args, iterations)
+        results = self.bmuf_process(args, iterations)
+        # Make sure params in both machines are same
+        assert len(results) == 2
+        self.assertAlmostEqual(results[0], results[1])
+
+    def test_single_gpu_bmuf(self):
+        # Train model for 20 iterations on a single GPU, with warmup sync after 5 iterations
+        args = setup_args()
+        args.distributed_world_size = 1
+        args.warmup_iterations = 5
+        iterations = 20
+        results = self.bmuf_process(args, iterations)
+        assert len(results) == 1
 
     def assertAlmostEqual(self, t1, t2):
         self.assertEqual(t1.size(), t2.size(), "size mismatch")
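
One hedged way to exercise only the new single-GPU case, assuming the suite lives at tests/test_bmuf.py (the file path is not shown in this view); adjust the import path if the test module is located elsewhere:

    # Run just the new single-GPU BMUF test; the module path is an assumption.
    import unittest

    from tests.test_bmuf import TestBMUF  # assumed import path

    if __name__ == "__main__":
        suite = unittest.TestSuite()
        suite.addTest(TestBMUF("test_single_gpu_bmuf"))
        unittest.TextTestRunner(verbosity=2).run(suite)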