From 4a1d3a777be8513a3745e0758cc7b87de30c35af Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Wed, 16 May 2018 12:22:01 +0100 Subject: [PATCH] modify files for packaging; thanks to universome --- subword_nmt/__init__.py | 0 subword_nmt/apply_bpe.py | 14 ++-- subword_nmt/bpe_toy.py | 0 subword_nmt/command_line.py | 88 ++++++++++++++++++++++++ subword_nmt/get_vocab.py | 65 +++++++++++++++-- subword_nmt/learn_bpe.py | 18 +++-- subword_nmt/learn_joint_bpe_and_vocab.py | 55 +++++++++------ subword_nmt/segment-char-ngrams.py | 50 ++++++++------ subword_nmt/tests/__init__.py | 0 subword_nmt/tests/test_bpe.py | 4 +- subword_nmt/tests/test_glossaries.py | 0 11 files changed, 233 insertions(+), 61 deletions(-) create mode 100755 subword_nmt/__init__.py mode change 100644 => 100755 subword_nmt/bpe_toy.py create mode 100755 subword_nmt/command_line.py create mode 100755 subword_nmt/tests/__init__.py mode change 100644 => 100755 subword_nmt/tests/test_bpe.py mode change 100644 => 100755 subword_nmt/tests/test_glossaries.py diff --git a/subword_nmt/__init__.py b/subword_nmt/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/subword_nmt/apply_bpe.py b/subword_nmt/apply_bpe.py index b455ccf..c45fe4c 100755 --- a/subword_nmt/apply_bpe.py +++ b/subword_nmt/apply_bpe.py @@ -107,10 +107,16 @@ class BPE(object): for out_segments in isolate_glossary(segment, gloss)] return word_segments -def create_parser(): - parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description="learn BPE-based word segmentation") +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('apply-bpe', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") parser.add_argument( '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, diff --git a/subword_nmt/bpe_toy.py b/subword_nmt/bpe_toy.py old mode 100644 new mode 100755 diff --git a/subword_nmt/command_line.py b/subword_nmt/command_line.py new file mode 100755 index 0000000..70b056e --- /dev/null +++ b/subword_nmt/command_line.py @@ -0,0 +1,88 @@ +import io +import sys +import codecs +import argparse + +from subword_nmt.learn_bpe import learn_bpe +from subword_nmt.apply_bpe import BPE, read_vocabulary +from subword_nmt.get_vocab import get_vocab +from subword_nmt.segment_char_ngrams import segment_char_ngrams +from subword_nmt.learn_joint_bpe_and_vocab import learn_joint_bpe_and_vocab + +from subword_nmt.learn_bpe import create_parser as create_learn_bpe_parser +from subword_nmt.apply_bpe import create_parser as create_apply_bpe_parser +from subword_nmt.get_vocab import create_parser as create_get_vocab_parser +from subword_nmt.learn_joint_bpe_and_vocab import create_parser as create_learn_joint_bpe_and_vocab_parser +from subword_nmt.segment_char_ngrams import create_parser as create_segment_char_ngrams_parser + +# hack for python2/3 compatibility +argparse.open = io.open + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="subword-nmt segmentation") + subparsers = parser.add_subparsers(dest='command', help='Command to run') + + learn_bpe_parser = create_learn_bpe_parser(subparsers) + apply_bpe_parser = create_apply_bpe_parser(subparsers) + get_vocab_parser = create_get_vocab_parser(subparsers) + segment_char_ngrams_parser = create_segment_char_ngrams_parser(subparsers) + learn_joint_bpe_and_vocab_parser = create_learn_joint_bpe_and_vocab_parser(subparsers) + + args = parser.parse_args() + + if args.command == 'learn-bpe': + # read/write files as UTF-8 + if args.input.name != '': + args.input = codecs.open(args.input.name, encoding='utf-8') + if args.output.name != '': + args.output = codecs.open(args.output.name, 'w', encoding='utf-8') + + learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input) + elif args.command == 'apply-bpe': + # read/write files as UTF-8 + args.codes = codecs.open(args.codes.name, encoding='utf-8') + if args.input.name != '': + args.input = codecs.open(args.input.name, encoding='utf-8') + if args.output.name != '': + args.output = codecs.open(args.output.name, 'w', encoding='utf-8') + if args.vocabulary: + args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8') + + if args.vocabulary: + vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold) + else: + vocabulary = None + + bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries) + + for line in args.input: + args.output.write(bpe.process_line(line)) + + elif args.command == 'get-vocab': + if args.train_file.name != '': + args.train_file = codecs.open(args.train_file.name, encoding='utf-8') + if args.vocab_file.name != '': + args.vocab_file = codecs.open(args.vocab_file.name, 'w', encoding='utf-8') + get_vocab(args.train_file, args.vocab_file) + elif args.command == 'segment-char-ngrams': + segment_char_ngrams(args) + elif args.command == 'learn-joint-bpe-and-vocab': + learn_joint_bpe_and_vocab(args) + else: + raise Exception('Invalid command provided') + + +if __name__ == '__main__': + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) + + main() diff --git a/subword_nmt/get_vocab.py b/subword_nmt/get_vocab.py index 25cd174..20efe30 100755 --- a/subword_nmt/get_vocab.py +++ b/subword_nmt/get_vocab.py @@ -3,12 +3,63 @@ from __future__ import print_function import sys from collections import Counter -c = Counter() +# hack for python2/3 compatibility +from io import open +argparse.open = open -for line in sys.stdin: - for word in line.strip('\r\n ').split(' '): - if word: - c[word] += 1 +def create_parser(subparsers=None): -for key,f in sorted(c.items(), key=lambda x: x[1], reverse=True): - print(key+" "+ str(f)) + if subparsers: + parser = subparsers.add_parser('get-vocab', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Generates vocabulary") + else: + parser = subparsers.argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Generates vocabulary") + + parser.add_argument( + '--train_file', type=argparse.FileType('r'), default=sys.stdin, + metavar='PATH', + help="Input file (default: standard input).") + + parser.add_argument( + '--vocab_file', type=argparse.FileType('w'), default=sys.stdout, + metavar='PATH', + help="Output file (default: standard output)") + + return parser + +def get_vocab(train_file, vocab_file): + + c = Counter() + + for line in train_file: + for word in line.strip('\r\n ').split(' '): + if word: + c[word] += 1 + + for key,f in sorted(c.items(), key=lambda x: x[1], reverse=True): + vocab_file.write(key+" "+ str(f) + "\n") + +if __name__ == "__main__": + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) + + parser = create_parser() + args = parser.parse_args() + + if args.train_file.name != '': + args.train_file = codecs.open(args.train_file.name, encoding='utf-8') + if args.vocab_file.name != '': + args.vocab_file = codecs.open(args.vocab_file.name, 'w', encoding='utf-8') + + get_vocab(args.train_file, args.vocab_file) \ No newline at end of file diff --git a/subword_nmt/learn_bpe.py b/subword_nmt/learn_bpe.py index 57c4d6f..3328cdf 100755 --- a/subword_nmt/learn_bpe.py +++ b/subword_nmt/learn_bpe.py @@ -24,10 +24,16 @@ from collections import defaultdict, Counter from io import open argparse.open = open -def create_parser(): - parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description="learn BPE-based word segmentation") +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('learn-bpe', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") parser.add_argument( '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, @@ -188,7 +194,7 @@ def prune_stats(stats, big_stats, threshold): big_stats[item] = freq -def main(infile, outfile, num_symbols, min_frequency=2, verbose=False, is_dict=False): +def learn_bpe(infile, outfile, num_symbols, min_frequency=2, verbose=False, is_dict=False): """Learn num_symbols BPE operations from vocabulary, and write to outfile. """ @@ -252,4 +258,4 @@ if __name__ == '__main__': if args.output.name != '': args.output = codecs.open(args.output.name, 'w', encoding='utf-8') - main(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input) + learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input) diff --git a/subword_nmt/learn_joint_bpe_and_vocab.py b/subword_nmt/learn_joint_bpe_and_vocab.py index a161fb5..f6c4787 100755 --- a/subword_nmt/learn_joint_bpe_and_vocab.py +++ b/subword_nmt/learn_joint_bpe_and_vocab.py @@ -28,10 +28,16 @@ import apply_bpe from io import open argparse.open = open -def create_parser(): - parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description="learn BPE-based word segmentation") +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('learn-joint-bpe-and-vocab', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") parser.add_argument( '--input', '-i', type=argparse.FileType('r'), required=True, nargs = '+', @@ -48,7 +54,7 @@ def create_parser(): '--separator', type=str, default='@@', metavar='STR', help="Separator between non-final subword units (default: '%(default)s'))") parser.add_argument( - '--write-vocabulary', type=argparse.FileType('w'), nargs = '+', default=None, + '--write-vocabulary', type=argparse.FileType('w'), required=True, nargs = '+', default=None, metavar='PATH', dest='vocab', help='Write to these vocabulary files after applying BPE. One per input text. Used for filtering in apply_bpe.py') parser.add_argument( @@ -60,22 +66,7 @@ def create_parser(): return parser - - -if __name__ == '__main__': - - # python 2/3 compatibility - if sys.version_info < (3, 0): - sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) - sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) - sys.stdin = codecs.getreader('UTF-8')(sys.stdin) - else: - sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) - sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) - sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) - - parser = create_parser() - args = parser.parse_args() +def learn_joint_bpe_and_vocab(args): if args.vocab and len(args.input) != len(args.vocab): sys.stderr.write('Error: number of input files and vocabulary files must match\n') @@ -95,7 +86,7 @@ if __name__ == '__main__': # learn BPE on combined vocabulary with codecs.open(args.output.name, 'w', encoding='UTF-8') as output: - learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True) + learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True) with codecs.open(args.output.name, encoding='UTF-8') as codes: bpe = apply_bpe.BPE(codes, separator=args.separator) @@ -123,3 +114,23 @@ if __name__ == '__main__': for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True): vocab_file.write("{0} {1}\n".format(key, freq)) vocab_file.close() + + +if __name__ == '__main__': + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) + + parser = create_parser() + args = parser.parse_args() + + assert(len(args.input) == len(args.vocab)) + + learn_joint_bpe_and_vocab(args) \ No newline at end of file diff --git a/subword_nmt/segment-char-ngrams.py b/subword_nmt/segment-char-ngrams.py index 2a69499..3ccad1a 100755 --- a/subword_nmt/segment-char-ngrams.py +++ b/subword_nmt/segment-char-ngrams.py @@ -12,10 +12,16 @@ import argparse from io import open argparse.open = open -def create_parser(): - parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description="segment rare words into character n-grams") +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('segment-char-ngrams', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="segment rare words into character n-grams") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="segment rare words into character n-grams") parser.add_argument( '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, @@ -41,6 +47,25 @@ def create_parser(): return parser +def segment_char_ngrams(args): + + vocab = [line.split()[0] for line in args.vocab if len(line.split()) == 2] + vocab = dict((y,x) for (x,y) in enumerate(vocab)) + + for line in args.input: + for word in line.split(): + if word not in vocab or vocab[word] > args.shortlist: + i = 0 + while i*args.n < len(word): + args.output.write(word[i*args.n:i*args.n+args.n]) + i += 1 + if i*args.n < len(word): + args.output.write(args.separator) + args.output.write(' ') + else: + args.output.write(word + ' ') + args.output.write('\n') + if __name__ == '__main__': @@ -64,19 +89,4 @@ if __name__ == '__main__': if args.output.name != '': args.output = codecs.open(args.output.name, 'w', encoding='utf-8') - vocab = [line.split()[0] for line in args.vocab if len(line.split()) == 2] - vocab = dict((y,x) for (x,y) in enumerate(vocab)) - - for line in args.input: - for word in line.split(): - if word not in vocab or vocab[word] > args.shortlist: - i = 0 - while i*args.n < len(word): - args.output.write(word[i*args.n:i*args.n+args.n]) - i += 1 - if i*args.n < len(word): - args.output.write(args.separator) - args.output.write(' ') - else: - args.output.write(word + ' ') - args.output.write('\n') + segment_char_ngrams(args) \ No newline at end of file diff --git a/subword_nmt/tests/__init__.py b/subword_nmt/tests/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/subword_nmt/tests/test_bpe.py b/subword_nmt/tests/test_bpe.py old mode 100644 new mode 100755 index da3e4a1..d8c8485 --- a/subword_nmt/tests/test_bpe.py +++ b/subword_nmt/tests/test_bpe.py @@ -10,7 +10,7 @@ currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentfram parentdir = os.path.dirname(currentdir) sys.path.insert(0,parentdir) -import learn_bpe +from learn_bpe import learn_bpe from apply_bpe import BPE @@ -19,7 +19,7 @@ class TestBPELearnMethod(unittest.TestCase): def test_learn_bpe(self): infile = codecs.open(os.path.join(currentdir,'data','corpus.en'), encoding='utf-8') outfile = codecs.open(os.path.join(currentdir,'data','bpe.out'), 'w', encoding='utf-8') - learn_bpe.main(infile, outfile, 1000) + learn_bpe(infile, outfile, 1000) infile.close() outfile.close() diff --git a/subword_nmt/tests/test_glossaries.py b/subword_nmt/tests/test_glossaries.py old mode 100644 new mode 100755