#!/usr/bin/env python3
#-*-python-*-
#
# Provenance: scripts/fix_vocab.py as added by commit
# 71174062d6365ef1ecc41193b3c905c400215b76 (Joerg Tiedemann, 2 Nov 2021,
# "fix vocab yaml script added").  The same commit also added an
# "Integration" section to NOTES.md pointing at
# https://github.com/UKPLab/EasyNMT
"""Repair a vocabulary YAML file that PyYAML cannot parse.

Usage: fix_vocab.py VOCAB_FILE

The file is first checked with PyYAML.  If it parses, nothing is changed.
Otherwise the file is copied to ``VOCAB_FILE.bak``, re-read line by line as
``key: integer`` entries, and written back as valid YAML.

NOTE(review): the line parser assumes one entry per line with the key wrapped
in a single pair of quote characters; the text between the quotes is taken
verbatim (escape sequences are not decoded) -- confirm this matches the files
this script is run on.
"""

import sys
from shutil import copyfile

import yaml


_QUOTE_CHARS = ('"', "'")


def _is_valid_yaml(filename):
    """Return True if PyYAML parses *filename*, False on a YAML syntax error.

    Only ``yaml.YAMLError`` is treated as "broken".  I/O errors (missing
    file, permissions) propagate, because the repair step cannot help there.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        try:
            # safe_load: plain yaml.load() without a Loader raises TypeError
            # on PyYAML >= 6, which would make every file look broken.
            yaml.safe_load(handle)
        except yaml.YAMLError:
            return False
    return True


def _parse_vocab_line(line, lineno):
    """Split one ``key: integer`` line into a ``(key, int)`` pair.

    The split happens at the LAST ``': '`` so that keys which themselves
    contain ``': '`` stay intact.  Surrounding quotes are removed only when
    the key really is wrapped in a matching pair.

    Raises:
        ValueError: if the line has no ``': '`` separator or the value is
            not an integer.
    """
    key, separator, value = line.rstrip().rpartition(': ')
    if not separator:
        raise ValueError(f"line {lineno}: expected 'key: integer', got {line!r}")
    if len(key) >= 2 and key[0] == key[-1] and key[0] in _QUOTE_CHARS:
        key = key[1:-1]
    try:
        return key, int(value)
    except ValueError as err:
        raise ValueError(
            f'line {lineno}: value {value!r} is not an integer') from err


def _read_vocab_lines(filename):
    """Read *filename* line by line into a ``{key: int}`` dict.

    Blank lines are skipped; any other malformed line raises ``ValueError``
    (see ``_parse_vocab_line``) so that no entry is dropped silently.
    """
    vocab = {}
    with open(filename, 'r', encoding='utf-8') as handle:
        for lineno, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            key, index = _parse_vocab_line(line, lineno)
            vocab[key] = index
    return vocab


def fix_vocab(filename):
    """Rewrite *filename* as valid YAML if PyYAML cannot parse it.

    Returns:
        True if the file was rewritten, False if it was already valid.

    The backup is written before parsing, and the original file is only
    opened for writing after every line has been parsed successfully, so a
    parse failure leaves the original untouched.
    """
    if _is_valid_yaml(filename):
        return False

    print('YAML file is broken - try to fix it!')
    print(f'copy {filename} to {filename}.bak')
    copyfile(filename, f'{filename}.bak')

    vocab = _read_vocab_lines(filename)

    print(f'write a new version of {filename}')
    with open(filename, 'w', encoding='utf-8') as output:
        yaml.dump(vocab, output, allow_unicode=True)
    return True


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print(f'usage: {args[0] if args else "fix_vocab.py"} VOCAB_FILE',
              file=sys.stderr)
        return 2
    fix_vocab(args[1])
    return 0


if __name__ == '__main__':
    sys.exit(main())