mirror of
https://github.com/facebookresearch/fairseq.git
synced 2024-10-04 04:37:58 +03:00
Add file to generate manifests for stop dataset. (#4891)
This commit is contained in:
parent
c19aed8ef5
commit
6c57de3c0a
83
examples/audio_nlp/nlu/generate_manifests.py
Normal file
83
examples/audio_nlp/nlu/generate_manifests.py
Normal file
@ -0,0 +1,83 @@
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import soundfile
|
||||
|
||||
def get_insl_frame(parse):
    """Convert a bracketed parse string into a space-separated target line.

    The parse is split on whitespace. Tokens that start with ``[`` or ``]``
    are kept whole and upper-cased. The plain words sitting between two such
    tokens are upper-cased, joined with ``_`` and then emitted one character
    at a time.

    Args:
        parse: whitespace-separated parse string.

    Returns:
        The output tokens joined by single spaces, followed by ``' | '``.
    """
    def is_ont_token(tok):
        # Tokens produced by str.split() are never empty, so this matches
        # the first character against the two bracket characters.
        return tok.startswith(("[", "]"))

    res = []
    words = []
    for tok in parse.split():
        if is_ont_token(tok):
            # list.extend() on a str appends each character separately, so
            # the pending "WORD_WORD" span lands in the output as letters.
            res.extend("_".join(words))
            words = []
            res.append(tok.upper())
        else:
            words.append(tok.upper())

    # NOTE: words that follow the last bracket token are never flushed and
    # therefore do not appear in the result.
    return " ".join(res) + ' | '
|
||||
|
||||
def sequencify_utterance(utterance):
    """Spell an utterance out as a space-separated character sequence.

    The text is upper-cased, every space is replaced by ``|``, one more
    ``|`` is appended, and the resulting characters are joined with single
    spaces.

    Args:
        utterance: the utterance text.

    Returns:
        The character sequence, e.g. ``"hi there"`` -> ``"H I | T H E R E |"``.
    """
    marked = utterance.upper().replace(' ', '|') + '|'
    # Joining a str iterates it character by character.
    return ' '.join(marked)
|
||||
|
||||
|
||||
def generate_fairseq_manifests(manifest, output_path, audio_root=None):
    """Write the ``.parse``, ``.ltr`` and ``.tsv`` files for one manifest.

    The manifest is read as tab-separated text whose first line holds the
    column names. From every following line the columns
    ``decoupled_normalized_seqlogical``, ``normalized_utterance`` and
    ``file_id`` are used.

    Three files are written next to each other, derived from ``output_path``
    with ``with_suffix``:

    * ``.parse`` - one ``get_insl_frame`` result per manifest row,
    * ``.ltr``   - one ``sequencify_utterance`` result per manifest row,
    * ``.tsv``   - ``str(audio_root)`` on the first line, then
      ``<file_id>\\t<frames>`` per row, with the frame count taken from
      ``soundfile.info``.

    Args:
        manifest: path of the input manifest.
        output_path: output path stem (``str`` or ``Path``); its suffix is
            replaced for each output file.
        audio_root: directory the ``file_id`` values are relative to
            (``str`` or ``Path``). Despite the ``None`` default, a real path
            is needed as soon as the manifest has a data row.

    Raises:
        FileNotFoundError: if an audio file named by the manifest does not
            exist under ``audio_root``.
        KeyError: if a row lacks one of the columns listed above.
    """
    # Accept plain strings as well as Path objects.
    output_path = Path(output_path)
    if audio_root is not None:
        audio_root = Path(audio_root)

    parses = []
    utterances = []
    filepaths = []
    keys = None
    with open(manifest, 'r') as i:
        for (idx, line) in enumerate(i):
            if idx == 0:
                keys = line.strip().split('\t')
            else:
                # Drop the line terminator first so it cannot end up inside
                # the value of the last column.
                fields = line.rstrip('\n').split('\t')
                data = dict(zip(keys, fields))
                parses.append(get_insl_frame(data['decoupled_normalized_seqlogical']))
                utterances.append(sequencify_utterance(data['normalized_utterance']))
                filepaths.append(data['file_id'])

    parses_fp = output_path.with_suffix('.parse')
    with open(str(parses_fp), 'w') as o:
        for p in parses:
            o.write(p + '\n')

    utterances_fp = output_path.with_suffix('.ltr')
    with open(str(utterances_fp), 'w') as o:
        for u in utterances:
            o.write(u + '\n')

    filepaths_fp = output_path.with_suffix('.tsv')
    with open(str(filepaths_fp), 'w') as o:
        o.write(str(audio_root) + '\n')
        for f in filepaths:
            fullpath = audio_root / f
            # Raise explicitly: an assert would vanish under ``python -O``.
            if not fullpath.exists():
                raise FileNotFoundError(f'{fullpath}')
            frames = soundfile.info(str(fullpath)).frames
            o.write(f'{f}\t{frames}\n')
|
||||
|
||||
def main(args):
    """Generate manifests for the ``train``, ``eval`` and ``test`` splits.

    For each split, ``<stop_root>/manifests/<split>.tsv`` is passed to
    ``generate_fairseq_manifests`` with ``<output>/<split>`` as the output
    stem and ``stop_root`` as the audio root.

    Args:
        args: object with ``stop_root`` and ``output`` attributes (the
            parsed command-line arguments).
    """
    splits = ['train', 'eval', 'test']
    root = Path(args.stop_root)
    output_root = Path(args.output)
    # The output files are opened with mode 'w', which does not create
    # missing parent directories.
    output_root.mkdir(parents=True, exist_ok=True)

    for split in splits:
        stop_manifest_path = root / 'manifests' / (split + '.tsv')
        output_path = output_root / split

        generate_fairseq_manifests(stop_manifest_path, output_path, root)
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: parse the two directory options and run main().
    parser = argparse.ArgumentParser(
        description='Generate fairseq manifests (.tsv, .ltr, .parse) '
                    'for the stop dataset.')
    # Both options feed Path(...) in main(), so neither may be omitted.
    parser.add_argument('--stop_root', type=str, required=True,
                        help='path to stop root directory')
    parser.add_argument('--output', type=str, required=True,
                        help='output directory')
    args = parser.parse_args()
    main(args)
|
Loading…
Reference in New Issue
Block a user