Add file to generate manifests for stop dataset. (#4891)

2024-10-04 04:37:58 +03:00 · 2022-12-14 09:53:29 -08:00 · 2022-12-14 09:53:29 -08:00 · 6c57de3c0a
commit 6c57de3c0a
parent c19aed8ef5
1 changed files with 83 additions and 0 deletions
--- a/examples/audio_nlp/nlu/generate_manifests.py
+++ b/examples/audio_nlp/nlu/generate_manifests.py
@ -0,0 +1,83 @@
+import argparse
+from pathlib import Path
+import soundfile
+
+def get_insl_frame(parse):
+    """Flatten a parse string into a space-separated target sequence.
+
+    Tokens that start with "[" or "]" are treated as ontology tokens and are
+    kept whole (upper-cased). The plain words between two ontology tokens are
+    upper-cased, joined with "_" and then emitted one character at a time.
+    Words that come after the last ontology token are dropped.
+
+    Args:
+        parse: whitespace-separated parse string.
+
+    Returns:
+        The flattened sequence followed by the literal suffix ' | '.
+    """
+    def is_ont_token(tok):
+        return tok.startswith(('[', ']'))
+
+    res = []
+    words = []
+    for tok in parse.split():
+        if is_ont_token(tok):
+            # extend() on a string adds one list element per character, so
+            # the joined words are spelled out letter by letter.
+            # NOTE(review): this matches the letter-level format produced by
+            # sequencify_utterance -- confirm it is intended.
+            res.extend('_'.join(words))
+            words = []
+            res.append(tok.upper())
+        else:
+            words.append(tok.upper())
+
+    return ' '.join(res) + ' | '
+
+def sequencify_utterance(utterance):
+    """Spell an utterance out as space-separated upper-case characters.
+
+    Every space in the upper-cased text becomes '|', one more '|' is
+    appended at the end, and the resulting characters are then separated by
+    single spaces.
+    """
+    marked = utterance.upper().replace(' ', '|')
+    # Joining a string iterates over its characters.
+    return ' '.join(marked + '|')
+
+
+def generate_fairseq_manifests(manifest, output_path, audio_root=None):
+    """Convert one tab-separated manifest into three side-by-side files.
+
+    The first line of ``manifest`` is read as the column header. For every
+    following non-blank line the columns ``decoupled_normalized_seqlogical``,
+    ``normalized_utterance`` and ``file_id`` are used to produce:
+
+    * ``<output_path>.parse`` -- one get_insl_frame() result per line,
+    * ``<output_path>.ltr``   -- one sequencify_utterance() result per line,
+    * ``<output_path>.tsv``   -- ``audio_root`` on the first line, then
+      ``file_id<TAB>frame_count`` for each audio file.
+
+    Args:
+        manifest: path of the input TSV manifest.
+        output_path: output path stem (str or Path); its suffix is replaced.
+        audio_root: directory (str or Path) that the ``file_id`` values are
+            relative to. Required despite the ``None`` default.
+
+    Raises:
+        TypeError: if ``audio_root`` is None.
+        FileNotFoundError: if an audio file listed in the manifest is missing.
+    """
+    if audio_root is None:
+        # Fail before any output is written instead of emitting "None" as
+        # the root line and then crashing on ``None / file_id``.
+        raise TypeError('audio_root is required to locate the audio files')
+    audio_root = Path(audio_root)
+    output_path = Path(output_path)
+
+    parses = []
+    utterances = []
+    filepaths = []
+    with open(manifest, 'r', encoding='utf-8') as src:
+        keys = None
+        for line in src:
+            if keys is None:
+                keys = line.strip().split('\t')
+                continue
+            # Drop only the line terminator so the last column does not keep
+            # a trailing newline; inner (possibly empty) fields are preserved.
+            line = line.rstrip('\r\n')
+            if not line.strip():
+                continue
+            data = dict(zip(keys, line.split('\t')))
+            parses.append(get_insl_frame(data['decoupled_normalized_seqlogical']))
+            utterances.append(sequencify_utterance(data['normalized_utterance']))
+            filepaths.append(data['file_id'])
+
+    parses_fp = output_path.with_suffix('.parse')
+    with open(str(parses_fp), 'w', encoding='utf-8') as out:
+        out.writelines(p + '\n' for p in parses)
+
+    utterances_fp = output_path.with_suffix('.ltr')
+    with open(str(utterances_fp), 'w', encoding='utf-8') as out:
+        out.writelines(u + '\n' for u in utterances)
+
+    filepaths_fp = output_path.with_suffix('.tsv')
+    with open(str(filepaths_fp), 'w', encoding='utf-8') as out:
+        out.write(str(audio_root) + '\n')
+        for f in filepaths:
+            fullpath = audio_root / f
+            # Explicit raise: an ``assert`` would be stripped under ``-O``.
+            if not fullpath.exists():
+                raise FileNotFoundError(f'{fullpath}')
+            frames = soundfile.info(str(fullpath)).frames
+            out.write(f'{f}\t{frames}\n')
+
+def main(args):
+    """Generate the output files for the train, eval and test splits.
+
+    Reads ``<stop_root>/manifests/<split>.tsv`` for each split and writes the
+    results under ``args.output`` using the split name as the file stem.
+    ``args.stop_root`` is also passed on as the audio root.
+
+    Args:
+        args: namespace with ``stop_root`` and ``output`` string attributes.
+    """
+    splits = ('train', 'eval', 'test')
+    root = Path(args.stop_root)
+    output_root = Path(args.output)
+    # The per-split files are opened for writing directly, so the target
+    # directory has to exist beforehand.
+    output_root.mkdir(parents=True, exist_ok=True)
+
+    for split in splits:
+        stop_manifest_path = root / 'manifests' / (split + '.tsv')
+        output_path = output_root / split
+
+        generate_fairseq_manifests(stop_manifest_path, output_path, root)
+
+if __name__ == '__main__':
+    # Command-line entry point: both paths are mandatory because main()
+    # builds Path objects from them unconditionally.
+    parser = argparse.ArgumentParser(
+        description='Generate .tsv/.ltr/.parse manifests for the stop dataset.')
+    parser.add_argument('--stop_root', type=str, required=True,
+                    help='path to stop root directory')
+    parser.add_argument('--output', type=str, required=True,
+                    help='output directory')
+    args = parser.parse_args()
+    main(args)