Support multiple regression targets in sentence prediction (#1831)

Summary:
# Before submitting

- [x] Was this discussed/approved via a Github issue? (no need for typos, doc improvements)
- [x] Did you read the [contributor guideline](https://github.com/pytorch/fairseq/blob/master/CONTRIBUTING.md)?
- [x] Did you make sure to update the docs?
- [x] Did you write any new necessary tests?

## What does this PR do?
Fixes https://github.com/pytorch/fairseq/issues/1830
Adds tests for RoBERTa (masked_lm, classification, single regression, multiple regression)
Pull Request resolved: https://github.com/pytorch/fairseq/pull/1831

Reviewed By: ngoyal2707

Differential Revision: D20446010

Pulled By: myleott

fbshipit-source-id: 9f37bcedf0910d85446245d71bc234bc74c62da5
Author: David Příhoda, 2020-03-21 16:52:53 -07:00 (committed by Facebook GitHub Bot)
parent bee6d71646
commit 42f65d6577
4 changed files with 138 additions and 9 deletions

@@ -28,12 +28,20 @@ class MaskedLmLoss(FairseqCriterion):
"""
# compute MLM loss
masked_tokens = sample['target'].ne(self.padding_idx)
-masked_tokens = torch.where(
-    # (rare case) when all tokens are masked, project all tokens
-    masked_tokens.any(),
-    masked_tokens,
-    masked_tokens.new([True]),
-)
+# Rare: when all tokens are masked, project all tokens.
+# We use torch.where to avoid device-to-host transfers,
+# except on CPU where torch.where is not well supported
+# (see github.com/pytorch/pytorch/issues/26247).
+if masked_tokens.device == torch.device('cpu'):
+    if not masked_tokens.any():
+        masked_tokens.fill_(True)
+else:
+    masked_tokens = torch.where(
+        masked_tokens.any(),
+        masked_tokens,
+        masked_tokens.new([True]),
+    )
logits = model(**sample['net_input'], masked_tokens=masked_tokens)[0]
targets = model.get_targets(sample, [logits])
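
For context on the comment in the new branch above: reading the value of `masked_tokens.any()` from Python (as in an `if` statement) copies the result back to the host and synchronizes with the device, while passing it straight into `torch.where` keeps everything on-device. A minimal sketch of the two patterns, illustrative only and not part of this commit (the function names are hypothetical):

import torch

def project_all_with_host_sync(masked_tokens):
    # The Python `if` converts the 0-dim result of .any() to a bool,
    # which requires a device-to-host copy and a synchronization.
    if not masked_tokens.any():
        masked_tokens.fill_(True)
    return masked_tokens

def project_all_without_host_sync(masked_tokens):
    # torch.where consumes the predicate on the device; no host sync.
    return torch.where(
        masked_tokens.any(),
        masked_tokens,
        masked_tokens.new([True]),
    )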

@@ -56,7 +56,7 @@ class SentencePredictionCriterion(FairseqCriterion):
reduction='sum',
)
else:
-logits = logits.squeeze().float()
+logits = logits.view(-1).float()
targets = targets.float()
loss = F.mse_loss(
logits,

@@ -46,7 +46,7 @@ class SentencePredictionTask(FairseqTask):
parser.add_argument('data', metavar='FILE',
help='file prefix for data')
parser.add_argument('--num-classes', type=int, default=-1,
-    help='number of classes')
+    help='number of classes or regression targets')
parser.add_argument('--init-token', type=int, default=None,
help='add token at the beginning of each batch item')
parser.add_argument('--separator-token', type=int, default=None,
@@ -181,9 +181,14 @@ class SentencePredictionTask(FairseqTask):
else:
label_path = "{0}.label".format(get_path('label', split))
if os.path.exists(label_path):
+def parse_regression_target(i, line):
+    values = line.split()
+    assert len(values) == self.args.num_classes, \
+        f'expected num_classes={self.args.num_classes} regression target values on line {i}, found: "{line}"'
+    return [float(x) for x in values]
dataset.update(
target=RawLabelDataset([
-    float(x.strip()) for x in open(label_path).readlines()
+    parse_regression_target(i, line.strip()) for i, line in enumerate(open(label_path).readlines())
])
)
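
The net effect of this hunk: with `--regression-target`, each line of the `.label` file now carries exactly `--num-classes` whitespace-separated float values instead of a single float. A minimal sketch of writing and re-parsing such a file, using a hypothetical file name and inline code that mirrors `parse_regression_target` above (not part of this commit):

num_targets = 3
rows = [[0.1, 2.5, -0.3], [1.0, 0.0, 4.2]]

# Write one example per line, num_targets floats per line.
with open('train.label', 'w') as f:
    for values in rows:
        print(' '.join(str(v) for v in values), file=f)

# Parse it back the way parse_regression_target does.
targets = []
for i, line in enumerate(open('train.label')):
    values = line.split()
    assert len(values) == num_targets, f'expected {num_targets} values on line {i}, found: "{line.strip()}"'
    targets.append([float(x) for x in values])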

@@ -536,6 +536,38 @@ class TestMaskedLanguageModel(unittest.TestCase):
preprocess_lm_data(data_dir)
train_legacy_masked_language_model(data_dir, "masked_lm")
def test_roberta_masked_lm(self):
with contextlib.redirect_stdout(StringIO()):
with tempfile.TemporaryDirectory("test_roberta_mlm") as data_dir:
create_dummy_data(data_dir)
preprocess_lm_data(data_dir)
train_masked_lm(data_dir, "roberta_base")
def test_roberta_sentence_prediction(self):
num_classes = 3
with contextlib.redirect_stdout(StringIO()):
with tempfile.TemporaryDirectory("test_roberta_head") as data_dir:
create_dummy_roberta_head_data(data_dir, num_classes=num_classes)
preprocess_lm_data(os.path.join(data_dir, 'input0'))
preprocess_lm_data(os.path.join(data_dir, 'label'))
train_roberta_head(data_dir, "roberta_base", num_classes=num_classes)
def test_roberta_regression_single(self):
num_classes = 1
with contextlib.redirect_stdout(StringIO()):
with tempfile.TemporaryDirectory("test_roberta_regression_single") as data_dir:
create_dummy_roberta_head_data(data_dir, num_classes=num_classes, regression=True)
preprocess_lm_data(os.path.join(data_dir, 'input0'))
train_roberta_head(data_dir, "roberta_base", num_classes=num_classes, extra_flags=['--regression-target'])
def test_roberta_regression_multiple(self):
num_classes = 3
with contextlib.redirect_stdout(StringIO()):
with tempfile.TemporaryDirectory("test_roberta_regression_multiple") as data_dir:
create_dummy_roberta_head_data(data_dir, num_classes=num_classes, regression=True)
preprocess_lm_data(os.path.join(data_dir, 'input0'))
train_roberta_head(data_dir, "roberta_base", num_classes=num_classes, extra_flags=['--regression-target'])
def _test_pretrained_masked_lm_for_translation(self, learned_pos_emb, encoder_only):
with contextlib.redirect_stdout(StringIO()):
with tempfile.TemporaryDirectory("test_mlm") as data_dir:
@@ -758,6 +790,41 @@ def create_dummy_data(data_dir, num_examples=100, maxlen=20, alignment=False):
_create_dummy_alignment_data('valid.in', 'valid.out', 'valid.align')
_create_dummy_alignment_data('test.in', 'test.out', 'test.align')
def create_dummy_roberta_head_data(data_dir, num_examples=100, maxlen=10, num_classes=2, regression=False):
input_dir = 'input0'
def _create_dummy_data(filename):
random_data = torch.rand(num_examples * maxlen)
input_data = 97 + torch.floor(26 * random_data).int()
if regression:
output_data = torch.rand((num_examples, num_classes))
else:
output_data = 1 + torch.floor(num_classes * torch.rand(num_examples)).int()
with open(os.path.join(data_dir, input_dir, filename+'.out'), 'w') as f_in:
label_filename = filename+'.label' if regression else filename+'.out'
with open(os.path.join(data_dir, 'label', label_filename), 'w') as f_out:
offset = 0
for i in range(num_examples):
# write example input
ex_len = random.randint(1, maxlen)
ex_str = ' '.join(map(chr, input_data[offset:offset+ex_len]))
print(ex_str, file=f_in)
# write example label
if regression:
class_str = ' '.join(map(str, output_data[i].numpy()))
print(class_str, file=f_out)
else:
class_str = 'class{}'.format(output_data[i])
print(class_str, file=f_out)
offset += ex_len
os.mkdir(os.path.join(data_dir, input_dir))
os.mkdir(os.path.join(data_dir, 'label'))
_create_dummy_data('train')
_create_dummy_data('valid')
_create_dummy_data('test')
def preprocess_translation_data(data_dir, extra_flags=None):
preprocess_parser = options.get_preprocessing_parser()
preprocess_args = preprocess_parser.parse_args(
@@ -861,6 +928,55 @@ def preprocess_lm_data(data_dir):
preprocess.main(preprocess_args)
def train_masked_lm(data_dir, arch, extra_flags=None):
train_parser = options.get_training_parser()
train_args = options.parse_args_and_arch(
train_parser,
[
'--task', 'masked_lm',
data_dir,
'--arch', arch,
'--optimizer', 'adam',
'--lr', '0.0001',
'--criterion', 'masked_lm',
'--max-sentences', '500',
'--save-dir', data_dir,
'--max-epoch', '1',
'--no-progress-bar',
'--distributed-world-size', '1',
'--ddp-backend', 'no_c10d',
'--num-workers', 0,
] + (extra_flags or []),
)
train.main(train_args)
def train_roberta_head(data_dir, arch, num_classes=2, extra_flags=None):
train_parser = options.get_training_parser()
train_args = options.parse_args_and_arch(
train_parser,
[
'--task', 'sentence_prediction',
data_dir,
'--arch', arch,
'--num-classes', str(num_classes),
'--optimizer', 'adam',
'--lr', '0.0001',
'--criterion', 'sentence_prediction',
'--max-tokens', '500',
'--max-positions', '500',
'--max-sentences', '500',
'--save-dir', data_dir,
'--max-epoch', '1',
'--no-progress-bar',
'--distributed-world-size', '1',
'--ddp-backend', 'no_c10d',
'--num-workers', 0,
] + (extra_flags or []),
)
train.main(train_args)
def train_language_model(data_dir, arch, extra_flags=None, run_validation=False):
train_parser = options.get_training_parser()
train_args = options.parse_args_and_arch(