From 61162dd24284baebdd407bb7dd4f28892b24fbfb Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Sat, 16 May 2015 17:26:56 +0700 Subject: [PATCH] Fix more Python lint. Most of the complaints fixed here were from Pocketlint, but many were also from Syntastic the vim plugin. --- scripts/ems/support/defaultconfig.py | 73 ++-- scripts/ems/support/mml-filter.py | 251 +++++------ scripts/generic/bsbleu.py | 195 +++++---- scripts/server/moses.py | 396 +++++++++--------- scripts/server/sim-pe.py | 239 ++++++----- scripts/tokenizer/pre_tokenize_cleaning.py | 63 +-- scripts/training/filter-rule-table.py | 44 +- .../training/rdlm/average_null_embedding.py | 24 +- .../training/rdlm/extract_syntactic_ngrams.py | 245 +++++++---- scripts/training/rdlm/extract_vocab.py | 78 ++-- scripts/training/rdlm/train_rdlm.py | 324 ++++++++------ scripts/training/wrappers/conll2mosesxml.py | 113 +++-- .../training/wrappers/mosesxml2brackets.py | 16 +- 13 files changed, 1186 insertions(+), 875 deletions(-) diff --git a/scripts/ems/support/defaultconfig.py b/scripts/ems/support/defaultconfig.py index e88b63e3d..a118e96b3 100644 --- a/scripts/ems/support/defaultconfig.py +++ b/scripts/ems/support/defaultconfig.py @@ -1,53 +1,48 @@ #!/usr/bin/env python2 -# -# Version of ConfigParser which accepts default values -# +"""Version of ConfigParser which accepts default values.""" import ConfigParser class Config: - def __init__(self,filename): - self.config = ConfigParser.SafeConfigParser() - cfh = open(filename) - self.config.readfp(cfh) - cfh.close() + """Version of ConfigParser which accepts default values.""" - def get(self,section,name,default=None): - if default == None or self.config.has_option(section,name): - return self.config.get(section,name) - else: - return default + def __init__(self, filename): + self.config = ConfigParser.SafeConfigParser() + cfh = open(filename) + self.config.readfp(cfh) + cfh.close() - def getint(self,section,name,default=None): - if default == None or self.config.has_option(section,name): - return self.config.getint(section,name) - else: - return default + def get(self, section, name, default=None): + if default is None or self.config.has_option(section, name): + return self.config.get(section, name) + else: + return default + def getint(self, section, name, default=None): + if default is None or self.config.has_option(section, name): + return self.config.getint(section, name) + else: + return default - def getboolean(self,section,name,default=None): - if default == None or self.config.has_option(section,name): - return self.config.getboolean(section,name) - else: - return default - - - def getfloat(self,section,name,default=None): - if default == None or self.config.has_option(section,name): - return self.config.getfloat(section,name) - else: - return default - - - def __str__(self): - ret = "" - for section in self.config.sections(): - for option in self.config.options(section): - ret = ret + "%s:%s = %s\n" % (section,option,self.config.get(section,option)) - return ret - + def getboolean(self, section, name, default=None): + if default is None or self.config.has_option(section, name): + return self.config.getboolean(section, name) + else: + return default + def getfloat(self, section, name, default=None): + if default is None or self.config.has_option(section, name): + return self.config.getfloat(section, name) + else: + return default + def __str__(self): + ret = "" + for section in self.config.sections(): + for option in self.config.options(section): + ret = ret + "%s:%s = %s\n" % 
( + section, option, self.config.get(section, option)) + return ret diff --git a/scripts/ems/support/mml-filter.py b/scripts/ems/support/mml-filter.py index 5fb43d71e..8e865c801 100755 --- a/scripts/ems/support/mml-filter.py +++ b/scripts/ems/support/mml-filter.py @@ -1,156 +1,171 @@ #!/usr/bin/env python2 -# -# Filter a parallel corpus -# +"""Filter a parallel corpus.""" + -import heapq import logging -import math import optparse import random -import sys from defaultconfig import Config -logging.basicConfig(format = "%(asctime)-15s %(message)s") + +logging.basicConfig(format="%(asctime)-15s %(message)s") log = logging.getLogger("filter") log.setLevel(logging.DEBUG) -class FilterStrategy(object): - def __init__(self,config): - pass - def filter(self,source,target): - return True +class FilterStrategy(object): + def __init__(self, config): + pass + + def filter(self, source, target): + return True class RandomFilterStrategy(FilterStrategy): - def __init__(self,config): - self.threshold = config.getfloat("random", "threshold", 0.1) - random.seed() + def __init__(self, config): + self.threshold = config.getfloat("random", "threshold", 0.1) + random.seed() - def filter(self, source, target): - return random.random() < self.threshold + def filter(self, source, target): + return random.random() < self.threshold class ScoreFilterStrategy(FilterStrategy): - """Filter strategy that is based on a file with sentence scores. There are three - possible ways of specifying how to filter: - i) threshold - filter all sentence pairs whose score is less than the threshold - ii) proportion - filter all but a certain proportion (eg a tenth) of the sentences + """Filter strategy that is based on a file with sentence scores. + + There are three possible ways of specifying how to filter: + i) threshold - filter all sentence pairs whose score is less than the + threshold. + ii) proportion - filter all but a certain proportion (eg a tenth) of the + sentences. iii) count - filter all but a given count of the sentences. 
""" - def __init__(self,config): - section = "score" - self.score_file = config.get(section,"score_file") - self.ignore_score = config.get(section, "ignore_score", "99999") - option_names = ("threshold", "proportion", "count") - options = [config.config.has_option(section,o) for o in option_names] - if sum(options) != 1: - raise RuntimeError("Must specify exactly one of %s for score filter" % str(option_names)) - if options[0]: - # threshold - self.threshold = config.getfloat(section,option_names[0]) - else: - # proportion or count - if options[2]: - count = config.getint(section,option_names[2]) - else: - # need to count entries - count = 0 - ignore_count = 0 - for line in open(self.score_file): - if line[:-1] != self.ignore_score: - count = count + 1 - else: - ignore_count = ignore_count + 1 - count = int(count * config.getfloat(section,option_names[1])) - log.info("Retaining at least %d entries and ignoring %d" % (count, ignore_count)) - # Find the threshold - self.threshold = sorted(\ - [float(line[:-1]) for line in open(self.score_file)], reverse=True)[ignore_count + count] - #self.threshold = heapq.nlargest(count, \ - # [float(line[:-1]) for line in open(self.score_file)])[-1] + def __init__(self, config): + section = "score" + self.score_file = config.get(section, "score_file") + self.ignore_score = config.get(section, "ignore_score", "99999") + option_names = ("threshold", "proportion", "count") + options = [config.config.has_option(section, o) for o in option_names] + if sum(options) != 1: + raise RuntimeError( + "Must specify exactly one of %s for score filter" + % str(option_names)) + if options[0]: + # Threshold. + self.threshold = config.getfloat(section, option_names[0]) + else: + # proportion or count + if options[2]: + count = config.getint(section, option_names[2]) + else: + # Need to count entries. + count = 0 + ignore_count = 0 + for line in open(self.score_file): + if line[:-1] != self.ignore_score: + count += 1 + else: + ignore_count = ignore_count + 1 + count = int(count * config.getfloat(section, option_names[1])) + log.info( + "Retaining at least %d entries and ignoring %d" + % (count, ignore_count)) + # Find the threshold. 
+ self.threshold = sorted([ + float(line[:-1]) + for line in open(self.score_file)], + reverse=True)[ignore_count + count] + # import heapq + # self.threshold = heapq.nlargest( + # count, + # [float(line[:-1]) for line in open(self.score_file)])[-1] - self.sfh = open(self.score_file) - log.info("Thresholding scores at " + str(self.threshold)) + self.sfh = open(self.score_file) + log.info("Thresholding scores at " + str(self.threshold)) + + def filter(self, source, target): + score = self.sfh.readline() + if not score: + raise RuntimeError("score file truncated") + return ( + score[:-1] == self.ignore_score or + float(score[:-1]) >= self.threshold + ) - def filter(self,source,target): - score = self.sfh.readline() - if not score: - raise RuntimeError("score file truncated") - return score[:-1] == self.ignore_score or float(score[:-1]) >= self.threshold - def main(): - parser = optparse.OptionParser(usage = "Usage: %prog [options] config-file") - (options,args) = parser.parse_args() - if len(args) < 1: - parser.error("No configuration file specified") + parser = optparse.OptionParser(usage="Usage: %prog [options] config-file") + (options, args) = parser.parse_args() + if len(args) < 1: + parser.error("No configuration file specified") - log.info("Loading configuration from " + args[0]) - config = Config(args[0]) - log.debug("Configuration:\n" + str(config)) + log.info("Loading configuration from " + args[0]) + config = Config(args[0]) + log.debug("Configuration:\n" + str(config)) - # Required general parameters - source_lang = config.get("general", "source_language") - target_lang = config.get("general", "target_language") - input_stem = config.get("general", "input_stem") - output_stem = config.get("general", "output_stem") - strategy = config.get("general", "strategy", "") + # Required general parameters + source_lang = config.get("general", "source_language") + target_lang = config.get("general", "target_language") + input_stem = config.get("general", "input_stem") + output_stem = config.get("general", "output_stem") + strategy = config.get("general", "strategy", "") - # Optional general parameters - alignment_stem = config.get("general", "alignment_stem", "") - alignment_type = config.get("general", "alignment_type", "grow-diag-final-and") - domain_file_in = config.get("general", "domain_file", "") - domain_file_out = config.get("general", "domain_file_out", "") + # Optional general parameters + alignment_stem = config.get("general", "alignment_stem", "") + alignment_type = config.get( + "general", "alignment_type", "grow-diag-final-and") + domain_file_in = config.get("general", "domain_file", "") + domain_file_out = config.get("general", "domain_file_out", "") - strategy_class = globals()[strategy + "FilterStrategy"] - strategy = strategy_class(config) + strategy_class = globals()[strategy + "FilterStrategy"] + strategy = strategy_class(config) - source_input_fh = open(input_stem + "." + source_lang) - target_input_fh = open(input_stem + "." + target_lang) - source_output_fh = open(output_stem + "." + source_lang, "w") - target_output_fh = open(output_stem + "." + target_lang, "w") + source_input_fh = open(input_stem + "." + source_lang) + target_input_fh = open(input_stem + "." + target_lang) + source_output_fh = open(output_stem + "." + source_lang, "w") + target_output_fh = open(output_stem + "." + target_lang, "w") - alignment_input_fh = None - alignment_output_fh = None - if alignment_stem: - alignment_input_fh = open(alignment_stem + "." 
+ alignment_type) - alignment_output_fh = open(output_stem + "." + alignment_type,"w") + alignment_input_fh = None + alignment_output_fh = None + if alignment_stem: + alignment_input_fh = open(alignment_stem + "." + alignment_type) + alignment_output_fh = open(output_stem + "." + alignment_type, "w") - domain_boundaries = {} - if domain_file_in: - dfh = open(domain_file_in) - for line in dfh: - line_no,name = line[:-1].split() - domain_boundaries[int(line_no)] = name - - domain_output_fh = None - if domain_file_out: - domain_output_fh = open(domain_file_out, "w") + domain_boundaries = {} + if domain_file_in: + dfh = open(domain_file_in) + for line in dfh: + line_no, name = line[:-1].split() + domain_boundaries[int(line_no)] = name - #log.info(str(domain_boundaries)) + domain_output_fh = None + if domain_file_out: + domain_output_fh = open(domain_file_out, "w") + + # log.info(str(domain_boundaries)) + + retained = 0 + line_no = 0 + for source_line in source_input_fh: + target_line = target_input_fh.readline() + if alignment_input_fh: + align_line = alignment_input_fh.readline() + if strategy.filter(source_line, target_line): + retained = retained + 1 + print>>source_output_fh, source_line, + print>>target_output_fh, target_line, + if alignment_input_fh: + print>>alignment_output_fh, align_line, + line_no = line_no + 1 + # Check if this is a domain boundary. + if domain_boundaries and line_no in domain_boundaries: + print >>domain_output_fh, ( + "%d %s" % (retained, domain_boundaries[line_no])) + log.info("Lines retained: %d", retained) - retained = 0 - line_no = 0 - for source_line in source_input_fh: - target_line = target_input_fh.readline() - if alignment_input_fh: - align_line = alignment_input_fh.readline() - if strategy.filter(source_line,target_line): - retained = retained + 1 - print>>source_output_fh, source_line, - print>>target_output_fh, target_line, - if alignment_input_fh: - print>>alignment_output_fh, align_line, - line_no = line_no + 1 - # check if this is a domain boundary - if domain_boundaries and domain_boundaries.has_key(line_no): - print>>domain_output_fh,"%d %s" % (retained,domain_boundaries[line_no]) - log.info("Lines retained: %d" % retained) if __name__ == "__main__": - main() + main() diff --git a/scripts/generic/bsbleu.py b/scripts/generic/bsbleu.py index ff86fed5e..12d2201de 100755 --- a/scripts/generic/bsbleu.py +++ b/scripts/generic/bsbleu.py @@ -2,73 +2,73 @@ # compute Bleu scores with confidence intervals via boostrap resampling # written by Ulrich Germann -import math,sys,os from argparse import ArgumentParser -from operator import itemgetter -from random import randint -from operator import itemgetter +import math +import os +from random import randint +import sys -def count_ngrams(snt,max_n): + +def count_ngrams(snt, max_n): """ - Return a dictionary of ngram counts (up to length /max_n/) - for sentence (list of words) /snt/. + Return a dictionary of ngram counts (up to length /max_n/) + for sentence (list of words) /snt/. 
""" ret = {} for i in xrange(len(snt)): - for k in xrange(i+1,min(i+max_n+1,len(snt)+1)): + for k in xrange(i + 1, min(i + max_n + 1, len(snt) + 1)): key = tuple(snt[i:k]) - ret[key] = ret.get(key,0) + 1 - pass - pass + ret[key] = ret.get(key, 0) + 1 return ret -def max_counts(ng1,ng2): + +def max_counts(ng1, ng2): """ - Return a dicitonary of ngram counts such that + Return a dicitonary of ngram counts such that each count is the greater of the two individual counts - for each ngram in the input ngram count dictionaries + for each ngram in the input ngram count dictionaries /ng1/ and /ng2/. """ ret = ng1.copy() - for k,v in ng2.items(): - ret[k] = max(ret.get(k,0),v) - pass + for k, v in ng2.items(): + ret[k] = max(ret.get(k, 0), v) return ret -def ng_hits(hyp,ref,max_n): + +def ng_hits(hyp, ref, max_n): """ - return a list of ngram counts such that each ngram count - is the minimum of the counts in hyp and ref, up to ngram - length /max_n/ + Return a list of ngram counts such that each ngram count + is the minimum of the counts in hyp and ref, up to ngram + length /max_n/. """ ret = [0 for i in xrange(max_n)] - for ng,cnt in hyp.items(): + for ng, cnt in hyp.items(): k = ng if len(k) <= max_n: - ret[len(k)-1] += min(cnt,ref.get(ng,0)) - pass - pass + ret[len(k) - 1] += min(cnt, ref.get(ng, 0)) return ret + class BleuScore: - def __init__(self,hyp,ref,max_n=4,bootstrap=1000): - # print len(hyp.ngrams),len(ref.ngrams),"X" - self.hits = [ng_hits(hyp.ngrams[i],ref.ngrams[i],max_n) - for i in xrange(len(hyp.ngrams))] - self.max_n = max_n - self.hyp = hyp - self.ref = ref - self.lower = None - self.upper = None + def __init__(self, hyp, ref, max_n=4, bootstrap=1000): + # print len(hyp.ngrams), len(ref.ngrams), "X" + self.hits = [ + ng_hits(hyp.ngrams[i], ref.ngrams[i], max_n) + for i in xrange(len(hyp.ngrams))] + self.max_n = max_n + self.hyp = hyp + self.ref = ref + self.lower = None + self.upper = None self.median = None - self.bootstrap = [self.score([randint(0,len(hyp.snt)-1) for s in hyp.snt]) - for i in xrange(1000)] + self.bootstrap = [ + self.score([randint(0, len(hyp.snt) - 1) for s in hyp.snt]) + for i in xrange(1000)] self.bootstrap.sort() self.actual = self.score([i for i in xrange(len(hyp.snt))]) - return - - def score(self,sample): - hits = [0 for i in xrange(self.max_n)] + + def score(self, sample): + hits = [0 for i in xrange(self.max_n)] self.hyplen = 0 self.reflen = 0 for i in sample: @@ -76,94 +76,89 @@ class BleuScore: self.reflen += len(self.ref.snt[i]) for n in xrange(self.max_n): hits[n] += self.hits[i][n] - pass - pass - self.prec = [float(hits[n])/(self.hyplen-n*len(sample)) + self.prec = [float(hits[n]) / (self.hyplen - n * len(sample)) for n in xrange(self.max_n)] - ret = sum([math.log(x) for x in self.prec])/self.max_n - self.BP = min(1,math.exp(1.-float(self.reflen)/float(self.hyplen))) + ret = sum([math.log(x) for x in self.prec]) / self.max_n + self.BP = min( + 1, math.exp(1. 
- float(self.reflen) / float(self.hyplen))) ret += math.log(self.BP) return math.exp(ret) - + + class Document: - def __init__(self,fname=None): + def __init__(self, fname=None): self.fname = fname if fname: self.snt = [line.strip().split() for line in open(fname)] - self.ngrams = [count_ngrams(snt,4) for snt in self.snt] + self.ngrams = [count_ngrams(snt, 4) for snt in self.snt] else: self.snt = None self.ngrams = None - pass - return - def merge(self,R): + def merge(self, R): self.fname = "multi-ref" self.ngrams = [x for x in R[0].ngrams] self.snt = [x for x in R[0].snt] for i in xrange(len(R[0].ngrams)): - for k in xrange(1,len(R)): - self.ngrams[i] = max_counts(self.ngrams[i],R[k].ngrams[i]) - pass - pass - return + for k in xrange(1, len(R)): + self.ngrams[i] = max_counts(self.ngrams[i], R[k].ngrams[i]) - def update(self,hyp,R): - for i in xrange(len(hyp.snt)): - clen = len(hyp.snt[i]) + def update(self, hyp, R): + for i, hyp_snt in enumerate(hyp.snt): + clen = len(hyp_snt) K = 0 - for k in xrange(1,len(R)): - assert len(R[k].snt) == len(hyp.snt),\ - "Mismatch in numer of sentences " +\ - "between reference and candidate" - if abs(len(R[k].snt[i]) - clen) == abs(len(R[K].snt[i]) - clen): - if len(R[k].snt[i]) < len(R[K].snt[i]): + for k in xrange(1, len(R)): + k_snt = R[k].snt[i] + assert len(R[k].snt) == len(hyp.snt), ( + "Mismatch in number of sentences " + + "between reference and candidate") + if abs(len(k_snt) - clen) == abs(len(R[K].snt[i]) - clen): + if len(k_snt) < len(R[K].snt[i]): K = k - pass - pass - elif abs(len(R[k].snt[i]) - clen) < abs(len(R[K].snt[i]) - clen): + elif abs(len(k_snt) - clen) < abs(len(R[K].snt[i]) - clen): K = k - pass - pass self.snt[i] = R[K].snt[i] - pass - return - - pass + if __name__ == "__main__": argparser = ArgumentParser() - argparser.add_argument("-r","--ref",nargs='+',help="reference translation(s)") - argparser.add_argument("-c","--cand",nargs='+',help="candidate translations") - argparser.add_argument("-i","--individual",action='store_true', - help="compute BLEU scores for individual references") - argparser.add_argument("-b","--bootstrap",type=int,default=1000, - help="sample size for bootstrap resampling") - argparser.add_argument("-a","--alpha",help="1-alpha = confidence interval",type=float,default=.05) + argparser.add_argument( + "-r", "--ref", nargs='+', help="Reference translation(s).") + argparser.add_argument( + "-c", "--cand", nargs='+', help="Candidate translations.") + argparser.add_argument( + "-i", "--individual", action='store_true', + help="Compute BLEU scores for individual references.") + argparser.add_argument( + "-b", "--bootstrap", type=int, default=1000, + help="Sample size for bootstrap resampling.") + argparser.add_argument( + "-a", "--alpha", type=float, default=.05, + help="1-alpha = confidence interval.") args = argparser.parse_args(sys.argv[1:]) - R = [ Document(fname) for fname in args.ref] - C = [ Document(fname) for fname in args.cand] - Rx = Document() # for multi-reference BLEU + R = [Document(fname) for fname in args.ref] + C = [Document(fname) for fname in args.cand] + Rx = Document() # for multi-reference BLEU Rx.merge(R) for c in C: # compute multi-reference BLEU - Rx.update(c,R) - bleu = BleuScore(c,Rx,bootstrap=args.bootstrap) - print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s"%\ - (100*bleu.actual, - os.path.basename(Rx.fname), - 100*bleu.bootstrap[int((args.alpha/2)*args.bootstrap)], - 100*bleu.bootstrap[int((1-(args.alpha/2))*args.bootstrap)], - 100*bleu.bootstrap[int(.5*args.bootstrap)], - c.fname) # 
os.path.basename(c.fname)) + Rx.update(c, R) + bleu = BleuScore(c, Rx, bootstrap=args.bootstrap) + print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % ( + 100 * bleu.actual, + os.path.basename(Rx.fname), + 100 * bleu.bootstrap[int((args.alpha / 2) * args.bootstrap)], + 100 * bleu.bootstrap[int((1 - (args.alpha / 2)) * args.bootstrap)], + 100 * bleu.bootstrap[int(.5 * args.bootstrap)], + c.fname) # os.path.basename(c.fname)) if args.individual: for r in R: - bleu = BleuScore(c,r,bootstrap=args.bootstrap) - print " %5.2f %s"%(100*bleu.actual,os.path.basename(r.fname)) - # print bleu.prec,bleu.hyplen,bleu.reflen,bleu.BP - pass - pass + bleu = BleuScore(c, r, bootstrap=args.bootstrap) + print " %5.2f %s" % ( + 100 * bleu.actual, os.path.basename(r.fname)) + # print bleu.prec, bleu.hyplen, bleu.reflen, bleu.BP - # print [sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))]) for n in xrange(4)] - pass + # print [ + # sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))]) + # for n in xrange(4)] diff --git a/scripts/server/moses.py b/scripts/server/moses.py index a176c473a..7cf152187 100644 --- a/scripts/server/moses.py +++ b/scripts/server/moses.py @@ -1,237 +1,225 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Python utilities for moses -# -# This package mostly wraps standard Moses utilities into pipes. -# -# Written by Ulrich Germann -# -# This package borrows from scripts written by Christian Buck -# -# The package assumes that there is a complete moses installation -# (including scripts) under one root directory, -# e.g., via -# bjam --with-xmlrpc-c=... [...] --install-scripts --prefix=${HOME}/moses -# By default, this root directory is "${HOME}/moses". +""" +Python utilities for moses + +This package mostly wraps standard Moses utilities into pipes. + +Written by Ulrich Germann + +This package borrows from scripts written by Christian Buck + +The package assumes that there is a complete moses installation +(including scripts) under one root directory, +e.g., via :: + bjam --with-xmlrpc-c=... [...] --install-scripts --prefix=${HOME}/moses +By default, this root directory is "${HOME}/moses". 
+""" + +import os +import sys +import time +import xmlrpclib +from subprocess import ( + PIPE, + Popen, + ) + + +moses_root = os.environ.get('MOSES_ROOT', os.environ.get('HOME') + "/moses") -import xmlrpclib,datetime,argparse,time,os,sys -from subprocess import * -from unicodedata import normalize - -moses_root = os.environ.get('MOSES_ROOT',os.environ.get('HOME')+"/moses") class ProcessWrapper: - def __init__(self,cmd=[]): - self.process = None - self.cmd = cmd - return + def __init__(self, cmd=[]): + self.process = None + self.cmd = cmd - def start(self, stdin=PIPE, stdout=PIPE): - if self.process: - raise Exception("Process is already running") - self.process = Popen(self.cmd, stdin = stdin, stdout = stdout) - return + def start(self, stdin=PIPE, stdout=PIPE): + if self.process: + raise Exception("Process is already running") + self.process = Popen(self.cmd, stdin=stdin, stdout=stdout) + + def __del__(self): + if self.process: + self.process.terminate() - def __del__(self): - if self.process: - self.process.terminate() - pass - return - pass class LineProcessor(ProcessWrapper): - def __call__(self,input): - if not self.process: self.start() - self.process.stdin.write("%s\n"%input.strip()) - self.process.stdin.flush() - return self.process.stdout.readline().strip() - pass + def __call__(self, input): + if not self.process: + self.start() + self.process.stdin.write("%s\n" % input.strip()) + self.process.stdin.flush() + return self.process.stdout.readline().strip() + class SentenceSplitter(ProcessWrapper): - """ - Wrapper for standard Moses sentence splitter - """ - def __init__(self,lang): - ssplit_cmd = moses_root+"/scripts/ems/support/split-sentences.perl" - self.cmd = [ssplit_cmd, "-b", "-q", "-l",lang] - self.process = None - return + """Wrapper for standard Moses sentence splitter.""" + + def __init__(self, lang): + ssplit_cmd = moses_root + "/scripts/ems/support/split-sentences.perl" + self.cmd = [ssplit_cmd, "-b", "-q", "-l", lang] + self.process = None + + def __call__(self, input): + if not self.process: + self.start() + self.process.stdin.write(input.strip() + "\n
<P>
\n") + self.process.stdin.flush() + x = self.process.stdout.readline().strip() + ret = [] + while x != '
<P>
' and x != '': + ret.append(x) + x = self.process.stdout.readline().strip() + return ret - def __call__(self,input): - if not self.process: - self.start() - pass - self.process.stdin.write(input.strip() + "\n
<P>
\n") - self.process.stdin.flush() - x = self.process.stdout.readline().strip() - ret = [] - while x != '
<P>
' and x != '': - ret.append(x) - x = self.process.stdout.readline().strip() - pass - return ret class Pretokenizer(LineProcessor): - """ - Pretokenizer wrapper; the pretokenizer fixes known issues with the input. - """ - def __init__(self,lang): - pretok_cmd = moses_root+"/scripts/tokenizer/pre-tokenizer.perl" - self.cmd = [pretok_cmd,"-b", "-q", "-l",lang] - self.process = None - return - pass + """Pretokenizer wrapper. + + The pretokenizer fixes known issues with the input. + """ + def __init__(self, lang): + pretok_cmd = moses_root + "/scripts/tokenizer/pre-tokenizer.perl" + self.cmd = [pretok_cmd, "-b", "-q", "-l", lang] + self.process = None + class Tokenizer(LineProcessor): - """ - Tokenizer wrapper; the pretokenizer fixes known issues with the input. - """ - def __init__(self,lang,args=["-a","-no-escape"]): - tok_cmd = moses_root+"/scripts/tokenizer/tokenizer.perl" - self.cmd = [tok_cmd,"-b", "-q", "-l", lang] + args - self.process = None - return - + """Tokenizer wrapper. + + The pretokenizer fixes known issues with the input. + """ + def __init__(self, lang, args=["-a", "-no-escape"]): + tok_cmd = moses_root + "/scripts/tokenizer/tokenizer.perl" + self.cmd = [tok_cmd, "-b", "-q", "-l", lang] + args + self.process = None + + class Truecaser(LineProcessor): - """ - Truecaser wrapper. - """ - def __init__(self,model): - truecase_cmd = moses_root+"/scripts/recaser/truecase.perl" - self.cmd = [truecase_cmd,"-b", "--model",model] - self.process = None - return - pass + """Truecaser wrapper.""" + def __init__(self, model): + truecase_cmd = moses_root + "/scripts/recaser/truecase.perl" + self.cmd = [truecase_cmd, "-b", "--model", model] + self.process = None + class LineProcessorPipeline: - """ - Line processor: one line in, one line out - """ - def __init__(self,parts=[]): - self.chain = [LineProcessor(p.cmd) for p in parts] - return - - def start(self): - if len(self.chain) == 0: - return - if self.chain[0].process: - return - self.chain[0].start() - for i in xrange(1,len(self.chain)): - self.chain[i].start(stdin = self.chain[i-1].process.stdout) - pass - return + """Line processor: one line in, one line out.""" + def __init__(self, parts=[]): + self.chain = [LineProcessor(p.cmd) for p in parts] - def __call__(self,input): - if len(self.chain) == 0: - return input - self.start() - self.chain[0].process.stdin.write("%s\n"%input.strip()) - self.chain[0].process.stdin.flush() - return self.chain[0].process.stdout.readline().strip() + def start(self): + if len(self.chain) == 0: + return + if self.chain[0].process: + return + self.chain[0].start() + for i in xrange(1, len(self.chain)): + self.chain[i].start(stdin=self.chain[i - 1].process.stdout) + + def __call__(self, input): + if len(self.chain) == 0: + return input + self.start() + self.chain[0].process.stdin.write("%s\n" % input.strip()) + self.chain[0].process.stdin.flush() + return self.chain[0].process.stdout.readline().strip() - pass def find_free_port(p): - """ - Find a free port, starting at /p/. - Return the free port, or False if none found. - """ - ret = p - while ret - p < 20: - devnull = open(os.devnull,"w") - n = Popen(["netstat","-tnp"],stdout=PIPE,stderr=devnull) - if n.communicate()[0].find(":%d "%ret) < 0: - return p - ret += 1 - pass - return False + """Find a free port, starting at /p/. + + :return: The free port, or False if none found. 
+ """ + ret = p + while ret - p < 20: + devnull = open(os.devnull, "w") + n = Popen(["netstat", "-tnp"], stdout=PIPE, stderr=devnull) + if n.communicate()[0].find(":%d " % ret) < 0: + return p + ret += 1 + return False + class MosesServer(ProcessWrapper): - def __init__(self,args=[]): - self.process = None - mserver_cmd = moses_root+"/bin/mosesserver" - self.cmd = [mserver_cmd] + args - self.url = None - self.proxy = None - return - - def start(self,config=None,args=[],port=7447,debug=False): - self.cmd.extend(args) - if config: - if "-f" in args: - raise Exception("Config file specified twice") - else: - self.cmd.extend(["-f",config]) - pass - pass - self.port = port # find_free_port(port) - if not self.port: - raise Excpetion("Cannot find free port for moses server!") - self.cmd.extend(["--server-port", "%d"%self.port]) - if debug: - print >>sys.stderr,self.cmd - # self.stderr = open("mserver.%d.stderr"%self.port,'w') - # self.stdout = open("mserver.%d.stdout"%self.port,'w') - # self.process = Popen(self.cmd,stderr = self.stderr,stdout = self.stdout) - self.process = Popen(self.cmd) - else: - devnull = open(os.devnull,"w") - self.process = Popen(self.cmd, stderr=devnull, stdout=devnull) - pass - - if self.process.poll(): - raise Exception("FATAL ERROR: Could not launch moses server!") - if debug: - print >>sys.stderr,"MOSES port is %d."%self.port - print >>sys.stderr,"Moses poll status is", self.process.poll() - pass - - self.url = "http://localhost:%d/RPC2"%self.port - self.connect(self.url) - - return True - - def connect(self,url): - if url[:4] != "http": url = "http://%s"%url - if url[-5:] != "/RPC2": url += "/RPC2" - self.url = url - self.proxy = xmlrpclib.ServerProxy(self.url) - return - - def translate(self,input): - attempts = 0 - while attempts < 100: - try: - if type(input) is unicode: - # if the server does not expect unicode, provide a - # properly encoded string! 
- param = {'text': input.strip().encode('utf8')} - return self.proxy.translate(param)['text'].decode('utf8') - - elif type(input) is str: - param = {'text': input.strip()} - return self.proxy.translate(param)['text'] - - elif type(input) is list: - return [self.translate(x) for x in input] - - elif type(input) is dict: - return self.proxy.translate(input) + def __init__(self, args=[]): + self.process = None + mserver_cmd = moses_root + "/bin/mosesserver" + self.cmd = [mserver_cmd] + args + self.url = None + self.proxy = None + def start(self, config=None, args=[], port=7447, debug=False): + self.cmd.extend(args) + if config: + if "-f" in args: + raise Exception("Config file specified twice") + else: + self.cmd.extend(["-f", config]) + self.port = port # find_free_port(port) + if not self.port: + raise Exception("Cannot find free port for moses server!") + self.cmd.extend(["--server-port", "%d" % self.port]) + if debug: + print >>sys.stderr, self.cmd + # self.stderr = open("mserver.%d.stderr"%self.port,'w') + # self.stdout = open("mserver.%d.stdout"%self.port,'w') + # self.process = Popen( + # self.cmd, stderr=self.stderr, stdout=self.stdout) + self.process = Popen(self.cmd) else: - raise Exception("Can't handle input of this type!") + devnull = open(os.devnull, "w") + self.process = Popen(self.cmd, stderr=devnull, stdout=devnull) - except: - attempts += 1 - print >>sys.stderr, "WAITING", attempts - time.sleep(1) - pass - pass - raise Exception("Translation request failed") - pass + if self.process.poll(): + raise Exception("FATAL ERROR: Could not launch moses server!") + if debug: + print >>sys.stderr, "MOSES port is %d." % self.port + print >>sys.stderr, "Moses poll status is", self.process.poll() + self.url = "http://localhost:%d/RPC2" % self.port + self.connect(self.url) + + return True + + def connect(self, url): + if url[:4] != "http": + url = "http://%s" % url + if url[-5:] != "/RPC2": + url += "/RPC2" + self.url = url + self.proxy = xmlrpclib.ServerProxy(self.url) + + def translate(self, input): + attempts = 0 + while attempts < 100: + try: + if type(input) is unicode: + # If the server does not expect unicode, provide a + # properly encoded string! + param = {'text': input.strip().encode('utf8')} + return self.proxy.translate(param)['text'].decode('utf8') + + elif type(input) is str: + param = {'text': input.strip()} + return self.proxy.translate(param)['text'] + + elif type(input) is list: + return [self.translate(x) for x in input] + + elif type(input) is dict: + return self.proxy.translate(input) + + else: + raise Exception("Can't handle input of this type!") + + except: + attempts += 1 + print >>sys.stderr, "WAITING", attempts + time.sleep(1) + raise Exception("Translation request failed") diff --git a/scripts/server/sim-pe.py b/scripts/server/sim-pe.py index 52d1e314a..5f1407524 100755 --- a/scripts/server/sim-pe.py +++ b/scripts/server/sim-pe.py @@ -5,29 +5,39 @@ # This script simulates post-editing of MT output and incrementally # updates the dynamic phrase tables in the moses server. 
-import xmlrpclib,datetime,argparse,sys,os,time +import argparse +import os +import sys +import time +import xmlrpclib import moses -from moses import MosesServer -from subprocess import * +from subprocess import ( + PIPE, + Popen, + ) + + mserver = moses.MosesServer() # We must perform some custom argument processing, as moses parameter # specifications do not comply with the standards used in standard # argument parsing packages; an isolated double dash separates script # arguments from moses arguments + + def split_args(all_args): """ Split argument list all_args into arguments specific to this script and - arguments relating to the moses server. An isolated double dash acts as - the separator between the two types of arguments. + arguments relating to the moses server. An isolated double dash acts as + the separator between the two types of arguments. """ my_args = [] mo_args = [] arglist = mo_args i = 0 - # IMPORTANT: the code below must be coordinated with + # IMPORTANT: the code below must be coordinated with # - the evolution of moses command line arguments - # - mert-moses.pl + # - mert-moses.pl while i < len(all_args): # print i,"MY_ARGS", my_args # print i,"MO_ARGS", mo_args @@ -36,14 +46,16 @@ def split_args(all_args): elif all_args[i] == "--]": arglist = mo_args elif all_args[i] == "-i" or all_args[i] == "-input-file": - my_args.extend(["-i",all_args[i+1]]) + my_args.extend(["-i", all_args[i + 1]]) i += 1 elif all_args[i] == "-inputtype": - if all_args[i+1] != "0": - # not yet supported! Therefore: - errmsg = "FATAL ERROR: %s "%sys.argv[0] - errmsg += "only supports plain text input at this point." - raise Exception(errsmg) + if all_args[i + 1] != "0": + # Not yet supported! Therefore: + errmsg = ( + "FATAL ERROR: " + "%s only supports plain text input at this point." + % sys.argv[0]) + raise Exception(errmsg) # my_args.extend(["--input-type",all_args[i+1]]) i += 1 elif all_args[i] == "-lattice-samples": @@ -52,13 +64,14 @@ def split_args(all_args): # mo_args[i:i+3] = [] # i += 2 # This is not yet supported! Therefore: - errmsg = "FATAL ERROR: %s "%sys.argv[0] - errmsg += "does not yet support lattice sampling." - raise Exception(errsmg) - + errmsg = ( + "FATAL ERROR: %s does not yet support lattice sampling." + % sys.argv[0]) + raise Exception(errmsg) + elif all_args[i] == "-n-best-list": - my_args.extend(["--nbest",all_args[i+2]]) - my_args.extend(["--nbest-file",all_args[i+1]]) + my_args.extend(["--nbest", all_args[i + 2]]) + my_args.extend(["--nbest-file", all_args[i + 1]]) i += 2 elif all_args[i] == "-n-best-distinct": @@ -70,128 +83,148 @@ def split_args(all_args): i += 1 pass - return my_args,mo_args - + return my_args, mo_args + + def interpret_args(my_args): """ Parse script-specific argument list. 
""" aparser = argparse.ArgumentParser() - aparser.add_argument("-s","--server-cmd",default="mosesserver", - dest="servercmd", help="path to moses server command") - aparser.add_argument("--url",help="URL of external moses server.") - aparser.add_argument("-p","--port", type=int, default=7447, - help="port number to be used for server") - - # input / output - aparser.add_argument("-i","--input",help="source file",default="-") - aparser.add_argument("-r","--ref",help="reference translation",default=None) - aparser.add_argument("-a","--aln",help="alignment",default=None) - aparser.add_argument("-o","--output",default="-",help="output file") - aparser.add_argument("-d","--debug",action="store_true",help="debug mode") - - # moses reporting options - aparser.add_argument("-A","--with-alignment", dest="A", - help="include alignment in output", action="store_true") - aparser.add_argument("-G","--with-graph",type=bool, default=False, dest="G", - help="include search graph info in output") - aparser.add_argument("-T","--with-transopt",type=bool, default=False, dest = "T", - help="include translation options info in output") - aparser.add_argument("-F","--report-all-factors", action="store_true",dest="F", - help="report all factors") - aparser.add_argument("-n","--nbest",type=int,dest="nbest",default=0, - help="size of nbest list") - aparser.add_argument("-N","--nbest-file",dest="nbestFile",default=0, - help="output file for nbest list") - aparser.add_argument("-u","--nbest-distinct",type=bool,dest="U",default=False, - help="report all factors") + aparser.add_argument( + "-s", "--server-cmd", default="mosesserver", dest="servercmd", + help="Path to moses server command.") + aparser.add_argument( + "--url", help="URL of external moses server.") + aparser.add_argument( + "-p", "--port", type=int, default=7447, + help="Port number to be used for server.") + + # Input / output. + aparser.add_argument( + "-i", "--input", default='-', help="source file") + aparser.add_argument( + "-r", "--ref", default=None, help="Reference translation.") + aparser.add_argument( + "-a", "--aln", default=None, help="Alignment.") + aparser.add_argument( + "-o", "--output", default="-", help="Output file.") + aparser.add_argument( + "-d", "--debug", action='store_true', help="Debug mode.") + + # Moses reporting options. 
+ aparser.add_argument( + "-A", "--with-alignment", dest="A", action='store_true', + help="Include alignment in output.") + aparser.add_argument( + "-G", "--with-graph", type=bool, default=False, dest="G", + help="Include search graph info in output.") + aparser.add_argument( + "-T", "--with-transopt", type=bool, default=False, dest="T", + help="Include translation options info in output.") + aparser.add_argument( + "-F", "--report-all-factors", action="store_true", dest="F", + help="Report all factors.") + aparser.add_argument( + "-n", "--nbest", type=int, dest="nbest", default=0, + help="Size of nbest list.") + aparser.add_argument( + "-N", "--nbest-file", dest="nbestFile", default=0, + help="Output file for nbest list.") + aparser.add_argument( + "-u", "--nbest-distinct", type=bool, dest="U", default=False, + help="Report all factors.") return aparser.parse_args(my_args) - + + def translate(proxy, args, line): if type(line) is unicode: - param = { 'text' : line.strip().encode('utf8') } + param = {'text': line.strip().encode('utf8')} elif type(line) is str: - param = { 'text' : line.strip() } + param = {'text': line.strip()} else: raise Exception("Can't handle input") - if args.A: param['align'] = True - if args.T: param['topt'] = True - if args.F: param['report-all-factors'] = True - if args.nbest: + if args.A: + param['align'] = True + if args.T: + param['topt'] = True + if args.F: + param['report-all-factors'] = True + if args.nbest: param['nbest'] = int(args.nbest) param['add-score-breakdown'] = True pass - if args.U: + if args.U: param['nbest-distinct'] = True pass attempts = 0 while attempts < 20: t1 = time.time() try: - return proxy.translate(param) + return proxy.translate(param) # except xmlrpclib.Fault as e: # except xmlrpclib.ProtocolError as e: # except xmlrpclib.ResponseError as e: except xmlrpclib.Error as e: - time.sleep(2) # give all the stderr stuff a chance to be flushed - print >>sys.stderr," XMLRPC error:",e + sys.stderr.flush() + print >>sys.stderr, " XMLRPC error:", e print >>sys.stderr, "Input was" print >>sys.stderr, param sys.exit(1) except IOError as e: - print >>sys.stderr,"I/O error({0}): {1}".format(e.errno, e.strerror) + print >>sys.stderr, ( + "I/O error({0}): {1}".format(e.errno, e.strerror)) time.sleep(5) except: serverstatus = mserver.process.poll() - if serverstatus == None: - print >>sys.stderr, "Connection failed after %f seconds"%(time.time()-t1) + if serverstatus is None: + print >>sys.stderr, ( + "Connection failed after %f seconds" % (time.time() - t1)) attempts += 1 if attempts > 10: time.sleep(10) else: time.sleep(5) - pass else: - - print >>sys.stderr, "Oopsidaisy, server exited with code %d (signal %d)"\ - %(serverstatus/256,serverstatus%256) + print >>sys.stderr, ( + "Oopsidaisy, server exited with code %d (signal %d)" + % (serverstatus / 256, serverstatus % 256)) pass pass pass raise Exception("Exception: could not reach translation server.") - + def read_data(fname): """ Read and return data (source, target or alignment) from file fname. 
""" if fname[-3:] == ".gz": - foo = Popen(["zcat",fname],stdout=PIPE)\ - .communicate()[0]\ - .strip().split('\n') + process = Popen(["zcat", fname], stdout=PIPE) + stdout, _ = process.communicate() + foo = stdout.strip().split('\n') else: foo = [x.strip() for x in open(fname).readlines()] - pass return foo -def repack_result(idx,result): + +def repack_result(idx, result): global args if args.nbest: for h in result['nbest']: - fields = [idx,h['hyp'],h['fvals'],h['totalScore']] + fields = [idx, h['hyp'], h['fvals'], h['totalScore']] for i in xrange(len(fields)): if type(fields[i]) is unicode: fields[i] = fields[i].encode('utf-8') pass pass - # print fields - print >>NBestFile,"%d ||| %s ||| %s ||| %f"%tuple(fields) - pass + # Print fields. + print >>NBestFile, "%d ||| %s ||| %s ||| %f" % tuple(fields) pass if 'align' in result: t = result['text'].split() @@ -200,16 +233,14 @@ def repack_result(idx,result): k = 0 for a in result['align']: k = a['tgt-start'] - if k: print " ".join(t[i:k]).encode('utf8'),span, + if k: + print " ".join(t[i:k]).encode('utf8'), span, i = k - span = "|%d %d|"%(a['src-start'],a['src-end']) - pass - print " ".join(t[k:]).encode('utf8'),span - pass + span = "|%d %d|" % (a['src-start'], a['src-end']) + print " ".join(t[k:]).encode('utf8'), span else: print result['text'].encode('utf8') - pass - return + if __name__ == "__main__": my_args, mo_args = split_args(sys.argv[1:]) @@ -221,17 +252,17 @@ if __name__ == "__main__": args = interpret_args(my_args) if "-show-weights" in mo_args: - # this is for use during tuning, where moses is called to get a list of - # feature names - devnull = open(os.devnull,"w") - mo = Popen(mserver.cmd + mo_args,stdout=PIPE,stderr=devnull) + # This is for use during tuning, where moses is called to get a list + # of feature names. 
+ devnull = open(os.devnull, "w") + mo = Popen(mserver.cmd + mo_args, stdout=PIPE, stderr=devnull) print mo.communicate()[0].strip() sys.exit(0) pass if args.nbest: if args.nbestFile: - NBestFile = open(args.nbestFile,"w") + NBestFile = open(args.nbestFile, "w") else: NBestFile = sys.stdout pass @@ -239,8 +270,10 @@ if __name__ == "__main__": ref = None aln = None - if args.ref: ref = read_data(args.ref) - if args.aln: aln = read_data(args.aln) + if args.ref: + ref = read_data(args.ref) + if args.aln: + aln = read_data(args.aln) if ref and aln: try: @@ -260,25 +293,21 @@ if __name__ == "__main__": line = sys.stdin.readline() idx = 0 while line: - result = translate(mserver.proxy,args,line) - repack_result(idx,result) + result = translate(mserver.proxy, args, line) + repack_result(idx, result) line = sys.stdin.readline() idx += 1 - pass - pass else: src = read_data(args.input) for i in xrange(len(src)): - result = translate(mserver.proxy,args,src[i]) - repack_result(i,result) + result = translate(mserver.proxy, args, src[i]) + repack_result(i, result) if args.debug: print >>sys.stderr, result['text'].encode('utf-8') pass - if ref and aln: - result = mserver.proxy.updater({'source' : src[i], - 'target' : ref[i], - 'alignment' : aln[i]}) - pass - pass - pass - pass + if ref and aln: + result = mserver.proxy.updater({ + 'source': src[i], + 'target': ref[i], + 'alignment': aln[i], + }) diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py index 76736da5c..096a45dc4 100644 --- a/scripts/tokenizer/pre_tokenize_cleaning.py +++ b/scripts/tokenizer/pre_tokenize_cleaning.py @@ -2,12 +2,12 @@ """ The Gacha filter cleans out sentence pairs that have global character mean -lower than a certain threshold. - -Use this cleaner to produce low quantity of high quality sentence pairs. +lower than a certain threshold. -It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during -WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER. +Use this cleaner to produce low quantity of high quality sentence pairs. + +It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during +WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER. (see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf) This is inspired by the global character mean that is used in the Gale-Church @@ -24,17 +24,24 @@ where: (For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf) """ -import io, subprocess +import io +import subprocess + red = '\033[01;31m' native = '\033[m' + def err_msg(txt): - return red+txt+native + return red + txt + native + def num_char(filename): - return float(subprocess.Popen(["wc", "-m", filename], - stdout=subprocess.PIPE).stdout.read().split()[0]) + process = subprocess.Popen( + ["wc", "-m", filename], stdout=subprocess.PIPE) + # TODO: Was this meant to call communicate()? 
+ return float(process.stdout.read().split()[0]) + def gacha_mean(sourcefile, targetfile): """ @@ -43,36 +50,44 @@ def gacha_mean(sourcefile, targetfile): """ sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n')) c = num_char(sourcefile) / num_char(targetfile) - sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n')) + sys.stderr.write(err_msg('Gacha mean = ' + str(c) + '\n')) sys.stderr.write(err_msg('Filtering starts ...\n')) return c + +def io_open(path): + """Open file `path` for reading, as a UTF-8 text file.""" + return io.open(path, 'r', encoding='utf8') + + def main(sourcefile, targetfile, threshold=0.2): # Calculates Gacha mean. c = gacha_mean(sourcefile, targetfile) # Calculates lower and upperbound for filtering threshold = float(threshold) - lowerbound = (1-threshold) * c - upperbound = (1+threshold) * c - + lowerbound = (1 - threshold) * c + upperbound = (1 + threshold) * c + # Start filtering sentences. - with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \ - io.open(targetfile, 'r', encoding='utf8') as trgfin: + with io_open(sourcefile) as srcfin, io_open(targetfile) as trgfin: for s, t in zip(srcfin, trgfin): if lowerbound < len(s) / float(len(t)) < upperbound: - print(u"{}\t{}\n".format(s.strip(),t.strip())) + print(u"{}\t{}\n".format(s.strip(), t.strip())) + if __name__ == '__main__': import sys - if len(sys.argv) not in range(3,5): - usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n' - % sys.argv[0]) - - example_msg = err_msg('Example: gacha_cleaning.py ~/Europarl.de-en.de ' - '~/Europarl.de-en.en 0.4\n' - % sys.argv[0]) + if len(sys.argv) not in range(3, 5): + usage_msg = err_msg( + "Usage: python %s srcfile trgfile (threshold)\n" + % sys.argv[0]) + + example_msg = err_msg( + "Example: " + "gacha_cleaning.py ~/Europarl.de-en.de ~/Europarl.de-en.en 0.4\n" + % sys.argv[0]) sys.stderr.write(usage_msg) sys.stderr.write(example_msg) sys.exit(1) - + main(*sys.argv[1:]) diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py index 86c8b300e..14736fe1f 100755 --- a/scripts/training/filter-rule-table.py +++ b/scripts/training/filter-rule-table.py @@ -24,9 +24,11 @@ import optparse import sys + class NGram(tuple): pass + class Gap: def __init__(self, minSpan): self.minSpan = minSpan @@ -34,8 +36,12 @@ class Gap: def getMinSpan(self): return self.minSpan + def printUsage(): - sys.stderr.write("Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT") + sys.stderr.write( + "Usage: " + "filter-rule-table.py [--min-non-initial-rule-count=N] INPUT") + def main(): parser = optparse.OptionParser() @@ -54,14 +60,15 @@ def main(): inputSentences.append(line.split()) filterRuleTable(sys.stdin, inputSentences, N, options) + def filterRuleTable(ruleTable, inputSentences, N, options): # Map each input n-gram (n = 1..N) to a map from sentence indices to # lists of intra-sentence indices. 
occurrences = {} for i, sentence in enumerate(inputSentences): - for n in range(1, N+1): - for j in range(0, len(sentence)-n+1): - ngram = NGram(sentence[j:j+n]) + for n in range(1, N + 1): + for j in range(0, len(sentence) - n + 1): + ngram = NGram(sentence[j:j + n]) innerMap = occurrences.setdefault(ngram, {}) indices = innerMap.setdefault(i, []) indices.append(j) @@ -70,15 +77,16 @@ def filterRuleTable(ruleTable, inputSentences, N, options): prevRuleIncluded = None for line in ruleTable: rhs, count = parseRule(line) + below_threshold = (count is not None and count < options.minCount) # Prune non-initial rule if count is below threshold. - if count != None and count < options.minCount and isNonInitialRule(rhs): + if below_threshold and isNonInitialRule(rhs): if prevRHS != rhs: prevRuleIncluded = None prevRHS = rhs continue # If source RHS is same as last rule's then we already know whether to # filter or not (unless it was pruned before checking). - if rhs == prevRHS and prevRuleIncluded != None: + if rhs == prevRHS and prevRuleIncluded is not None: if prevRuleIncluded: print line, continue @@ -89,7 +97,10 @@ def filterRuleTable(ruleTable, inputSentences, N, options): prevRuleIncluded = True continue segments = segmentRHS(rhs, N) - ngramMaps = [occurrences.get(s, {}) for s in segments if isinstance(s, NGram)] + ngramMaps = [ + occurrences.get(s, {}) + for s in segments + if isinstance(s, NGram)] if len(ngramMaps) == 0: print line, prevRuleIncluded = True @@ -111,9 +122,13 @@ def filterRuleTable(ruleTable, inputSentences, N, options): break prevRuleIncluded = match -# Parse a line of the rule table and return a tuple containing two items, -# the list of RHS source symbols and the rule count (if present). + def parseRule(line): + """Parse a line of the rule table. + + :return: A tuple containing two items: the list of RHS source symbols, + and the rule count (if present). 
+ """ cols = line.split("|||") rhsSourceSymbols = cols[0].split()[:-1] ruleCount = None @@ -123,15 +138,18 @@ def parseRule(line): ruleCount = float(counts[2]) return (rhsSourceSymbols, ruleCount) + def isNT(symbol): return symbol[0] == '[' and symbol[-1] == ']' + def isNonInitialRule(rhs): for symbol in rhs: if isNT(symbol): return True return False + def segmentRHS(rhs, N): segments = [] terminals = [] @@ -159,13 +177,14 @@ def segmentRHS(rhs, N): segments.append(NGram(terminals)) return segments + def matchSegments(segments, indexSeq, sentenceLength): assert len(segments) > 0 firstSegment = segments[0] i = 0 if isinstance(firstSegment, Gap): minPos = firstSegment.getMinSpan() - maxPos = sentenceLength-1 + maxPos = sentenceLength - 1 else: minPos = indexSeq[i] + len(firstSegment) i += 1 @@ -175,7 +194,7 @@ def matchSegments(segments, indexSeq, sentenceLength): if minPos + segment.getMinSpan() > sentenceLength: return False minPos = minPos + segment.getMinSpan() - maxPos = sentenceLength-1 + maxPos = sentenceLength - 1 else: pos = indexSeq[i] i += 1 @@ -185,6 +204,7 @@ def matchSegments(segments, indexSeq, sentenceLength): maxPos = minPos return True + def enumerateIndexSeqs(ngramMaps, sentenceIndex, minFirstIndex): assert len(ngramMaps) > 0 if len(ngramMaps) == 1: @@ -195,7 +215,7 @@ def enumerateIndexSeqs(ngramMaps, sentenceIndex, minFirstIndex): for index in ngramMaps[0][sentenceIndex]: if index < minFirstIndex: continue - for seq in enumerateIndexSeqs(ngramMaps[1:], sentenceIndex, index+1): + for seq in enumerateIndexSeqs(ngramMaps[1:], sentenceIndex, index + 1): assert seq[0] > index yield [index] + seq diff --git a/scripts/training/rdlm/average_null_embedding.py b/scripts/training/rdlm/average_null_embedding.py index cb67c9d75..28abc9508 100755 --- a/scripts/training/rdlm/average_null_embedding.py +++ b/scripts/training/rdlm/average_null_embedding.py @@ -2,18 +2,23 @@ # -*- coding: utf-8 -*- # Author: Rico Sennrich -# average embeddings of special null words for RDLM. -# Usage: average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL +"""Average embeddings of special null words for RDLM. 
+ +Usage: + average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL +""" import sys import os import numpy + def load_model(model_file): return nplm.NeuralLM.from_file(model_file) + def get_weights(path, vocab, len_context): - d = [[0]*vocab for i in range(len_context)] + d = [[0] * vocab for i in range(len_context)] for line in open(path): for i, word in enumerate(line.split()[:-1]): d[i][int(word)] += 1 @@ -26,20 +31,23 @@ if __name__ == "__main__": training_instances = sys.argv[3] model_output = sys.argv[4] - sys.path.append(os.path.join(nplm_path,'python')) + sys.path.append(os.path.join(nplm_path, 'python')) import nplm model = load_model(model_input) - len_context = len(open(training_instances).readline().split())-1 + len_context = len(open(training_instances).readline().split()) - 1 sys.stderr.write('reading ngrams...') - weights = numpy.array(get_weights(training_instances, len(model.input_embeddings), len_context)) + weights = numpy.array( + get_weights( + training_instances, len(model.input_embeddings), len_context)) sys.stderr.write('done\n') for i in range(len_context): index = model.word_to_index_input[''.format(i)] - model.input_embeddings[index] = numpy.average(numpy.array(model.input_embeddings), weights=weights[i], axis=0) + model.input_embeddings[index] = numpy.average( + numpy.array(model.input_embeddings), weights=weights[i], axis=0) sys.stderr.write('writing model...') - model.to_file(open(model_output,'w')) + model.to_file(open(model_output, 'w')) sys.stderr.write('done\n') diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py index f3ce41080..c6d4b7968 100755 --- a/scripts/training/rdlm/extract_syntactic_ngrams.py +++ b/scripts/training/rdlm/extract_syntactic_ngrams.py @@ -2,17 +2,25 @@ # -*- coding: utf-8 -*- # Author: Rico Sennrich -# extract syntactic n-grams from dependency treebank in Moses XML format for training RDLM -# expected format can be produced with mosesdecoder/scripts/training/wrapper/conll2mosesxml.py -# OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped to 0 () +""" +Extract syntactic n-grams from dependency treebank in Moses XML format for +training RDLM. 
+ +Expected format can be produced with +mosesdecoder/scripts/training/wrapper/conll2mosesxml.py + +OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped +to 0 () +""" from __future__ import print_function, unicode_literals, division import sys import codecs import argparse -# hack for python2/3 compatibility +# Hack for python2/3 compatibility from io import open + argparse.open = open try: @@ -20,46 +28,84 @@ try: except ImportError: from xml.etree import cElementTree as ET -def create_parser(): - parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM") - parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH', - help='input file (default: standard input).') - parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout, metavar='PATH', - help='output file (default: standard output).') - parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)', - choices=['label', 'head'], required=True) - parser.add_argument('--vocab', metavar='PATH', type=str, required=True, - help='input layer vocabulary file (one item per line; first line \'\')') - parser.add_argument('--output_vocab', metavar='PATH', type=str, - help='output layer vocabulary file (default: use input layer vocabulary)') - parser.add_argument('--left_context', metavar='INT', type=int, - help='size of context vector for left siblings (default: %(default)s)', default=3) - parser.add_argument('--right_context', metavar='INT', type=int, - help='size of context vector for right siblings (default: %(default)s)', default=0) - parser.add_argument('--up_context', metavar='INT', type=int, - help='size of context vector for ancestors (default: %(default)s)', default=2) - parser.add_argument('--glue_symbol', metavar='STR', type=str, default='Q', - help='glue symbol. Will be skipped during extraction (default: %(default)s)') - parser.add_argument('--start_symbol', metavar='STR', type=str, default='SSTART', - help='sentence start symbol. Will be skipped during extraction (default: %(default)s)') - parser.add_argument('--end_symbol', metavar='STR', type=str, default='SEND', - help='sentence end symbol. 
Will be skipped during extraction (default: %(default)s)') - parser.add_argument('--ptkvz', action='store_true', - help='special rule for German dependency trees: concatenate separable verb prefix and verb') +def create_parser(): + parser = argparse.ArgumentParser( + description=( + "Extract syntactic n-grams from parsed corpus in " + "Moses XML format for training RDLM")) + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, + metavar='PATH', + help='Input file (default: standard input).') + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, + metavar='PATH', + help='Output file (default: standard output).') + parser.add_argument( + '--mode', type=str, choices=['label', 'head'], required=True, + help='Predict terminals (head) or dependency labels (label).') + parser.add_argument( + '--vocab', metavar='PATH', type=str, required=True, + help=( + "Input layer vocabulary file (one item per line; " + "first line '')")) + parser.add_argument( + '--output_vocab', metavar='PATH', type=str, + help=( + "Output layer vocabulary file " + "(default: use input layer vocabulary)")) + parser.add_argument( + '--left_context', metavar='INT', type=int, default=3, + help=( + "Size of context vector for left siblings " + "(default: %(default)s)")) + parser.add_argument( + '--right_context', metavar='INT', type=int, default=0, + help=( + "Size of context vector for right siblings " + "(default: %(default)s)")) + parser.add_argument( + '--up_context', metavar='INT', type=int, default=2, + help=( + "Size of context vector for ancestors " + "(default: %(default)s)")) + parser.add_argument( + '--glue_symbol', metavar='STR', type=str, default='Q', + help=( + "Glue symbol. Will be skipped during extraction " + "(default: %(default)s)")) + parser.add_argument( + '--start_symbol', metavar='STR', type=str, default='SSTART', + help=( + "Sentence start symbol. Will be skipped during extraction " + "(default: %(default)s)")) + parser.add_argument( + '--end_symbol', metavar='STR', type=str, default='SEND', + help=( + "Sentence end symbol. 
Will be skipped during extraction "
+            "(default: %(default)s)"))
+    parser.add_argument(
+        '--ptkvz', action='store_true',
+        help=(
+            "Special rule for German dependency trees: "
+            "concatenate separable verb prefix and verb."))
 
     return parser
 
+
 def escape_text(s):
-    s = s.replace('|','&#124;') # factor separator
-    s = s.replace('[','&#91;') # syntax non-terminal
-    s = s.replace(']','&#93;') # syntax non-terminal
-    s = s.replace('\'','&apos;') # xml special character
-    s = s.replace('"','&quot;') # xml special character
+    s = s.replace('|', '&#124;')  # factor separator
+    s = s.replace('[', '&#91;')  # syntax non-terminal
+    s = s.replace(']', '&#93;')  # syntax non-terminal
+    s = s.replace('\'', '&apos;')  # xml special character
+    s = s.replace('"', '&quot;')  # xml special character
     return s
 
-# deterministic heuristic to get head of subtree
+
 def get_head(xml, add_ptkvz):
+    """Deterministic heuristic to get head of subtree."""
     head = None
     preterminal = None
     for child in xml:
@@ -77,23 +123,38 @@ def get_head(xml, add_ptkvz):
 
     return head, preterminal
 
-def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, parent_labels=None):
+
+def get_syntactic_ngrams(xml, options, vocab, output_vocab,
+                         parent_heads=None, parent_labels=None):
 
     if len(xml):
 
-        # skip glue rules
-        if xml.get('label') == options.glue_symbol or xml.get('label') == options.start_symbol or xml.get('label') == options.end_symbol:
-            for child in xml:
-                get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
-            return
+        # Skip glue rules.
+        skip_glue_labels = [
+            options.glue_symbol,
+            options.start_symbol,
+            options.end_symbol,
+        ]
+        if xml.get('label') in skip_glue_labels:
+            for child in xml:
+                get_syntactic_ngrams(
+                    child, options, vocab, output_vocab, parent_heads,
+                    parent_labels)
+            return
 
-        # skip virtual nodes
-        if xml.get('label') == '' or xml.get('label') == '':
-            return
+        # Skip virtual nodes.
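+        # (These are the start/end padding nodes inserted further down for
+        # sibling context; they carry no treebank content of their own.)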
+ skip_virtual_labels = [ + '', + '', + ] + if xml.get('label') in skip_virtual_labels: + return if not parent_heads: - parent_heads = [vocab.get('', 0)] * options.up_context - parent_labels = [vocab.get('', 0)] * options.up_context + parent_heads = ( + [vocab.get('', 0)] * options.up_context) + parent_labels = ( + [vocab.get('', 0)] * options.up_context) head, preterminal = get_head(xml, options.ptkvz) if not head: @@ -119,7 +180,8 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p options.output.write(' '.join(map(str, int_list)) + '\n') elif options.mode == 'head' and not head == '': int_list.append(vocab.get(label, 0)) - int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0))) + int_list.append( + output_vocab.get(head, output_vocab.get(preterminal, 0))) options.output.write(' '.join(map(str, int_list)) + '\n') parent_heads.append(vocab.get(head, 0)) @@ -130,28 +192,29 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p if options.right_context: start = ET.Element('tree') start2 = ET.Element('tree') - start.set('label','') - start2.set('label','XY') + start.set('label', '') + start2.set('label', 'XY') start2.text = '' start.append(start2) - xml.insert(0,start) + xml.insert(0, start) if options.left_context: end = ET.Element('tree') end2 = ET.Element('tree') - end.set('label','') - end2.set('label','XY') + end.set('label', '') + end2.set('label', 'XY') end2.text = '' end.append(end2) xml.append(end) - heads = [] preterminals = [] labels = [] for child in xml: if not len(child): - # mark that the previous sibling is the head of the structure (the head/label are not repeated because they're also head/label of the parent) + # Mark that the previous sibling is the head of the + # structure (the head/label are not repeated because they're + # also head/label of the parent). head_child = '' preterminal_child = head_child child_label = '' @@ -166,37 +229,60 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p preterminals.append(preterminal_child) labels.append(child_label) - heads_idx = [vocab.get(heads[i], vocab.get(preterminals[i], 0)) for i in range(len(heads))] - labels_idx = [vocab.get(labels[i], 0) for i in range(len(labels))] + heads_idx = [ + vocab.get(heads[i], vocab.get(preterminals[i], 0)) + for i in range(len(heads))] + labels_idx = [ + vocab.get(labels[i], 0) + for i in range(len(labels))] - #ancestor context is same for all children + # Ancestor context is the same for all children. up_heads = parent_heads[-options.up_context:] up_labels = parent_labels[-options.up_context:] - for i,child in enumerate(xml): + skip_special_heads = [ + '', + '', + '', + '', + ] + for i, child in enumerate(xml): - # skip some special symbols, but recursively extract n-grams for its children - if options.mode == 'head' and (heads[i] == '' or heads[i] == '' or heads[i] == '' or heads[i] == ''): + # Skip some special symbols, but recursively extract n-grams + # for its children. 
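+            # The placeholder head itself still enters the ancestor context
+            # that is passed down to those children.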
+ if options.mode == 'head' and heads[i] in skip_special_heads: parent_heads.append(vocab.get(heads[i], 0)) parent_labels.append(vocab.get(labels[i], 0)) - get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels) + get_syntactic_ngrams( + child, options, vocab, output_vocab, parent_heads, + parent_labels) parent_heads.pop() parent_labels.pop() continue - previous_heads = heads_idx[max(0,i-options.left_context):i] - previous_labels = labels_idx[max(0,i-options.left_context):i] + previous_heads = heads_idx[max(0, i - options.left_context):i] + previous_labels = labels_idx[max(0, i - options.left_context):i] - subsequent_heads = heads_idx[i+1:i+options.right_context+1] - subsequent_labels = labels_idx[i+1:i+options.right_context+1] + subsequent_heads = heads_idx[i + 1:i + options.right_context + 1] + subsequent_labels = labels_idx[i + 1:i + options.right_context + 1] if len(previous_heads) < options.left_context: - previous_heads = [start_head_idx] * (options.left_context-len(previous_heads)) + previous_heads - previous_labels = [start_label_idx] * (options.left_context-len(previous_labels)) + previous_labels + previous_heads = ( + [start_head_idx] * + (options.left_context - len(previous_heads)) + + previous_heads) + previous_labels = ( + [start_label_idx] * + (options.left_context - len(previous_labels)) + + previous_labels) if len(subsequent_heads) < options.right_context: - subsequent_heads = subsequent_heads + [stop_head_idx] * (options.right_context-len(subsequent_heads)) - subsequent_labels = subsequent_labels + [stop_label_idx] * (options.right_context-len(subsequent_labels)) + subsequent_heads += ( + [stop_head_idx] * + (options.right_context - len(subsequent_heads))) + subsequent_labels += ( + [stop_label_idx] * + (options.right_context - len(subsequent_labels))) int_list = [] int_list.extend(previous_heads) @@ -209,14 +295,19 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p int_list.append(output_vocab.get(labels[i], 0)) elif options.mode == 'head': int_list.append(vocab.get(labels[i], 0)) - int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0))) + int_list.append( + output_vocab.get( + heads[i], output_vocab.get(preterminals[i], 0))) options.output.write(' '.join(map(str, int_list)) + '\n') - parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0))) + parent_heads.append( + vocab.get(heads[i], vocab.get(preterminals[i], 0))) parent_labels.append(vocab.get(labels[i], 0)) - get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels) + get_syntactic_ngrams( + child, options, vocab, output_vocab, parent_heads, + parent_labels) parent_heads.pop() parent_labels.pop() @@ -224,15 +315,17 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p def load_vocab(path): v = {} - for i,line in enumerate(open(path, encoding="UTF-8")): + for i, line in enumerate(open(path, encoding="UTF-8")): v[line.strip()] = i return v + def main(options): vocab = load_vocab(options.vocab) if options.output_vocab is None: - sys.stderr.write('no output vocabulary specified; using input vocabulary\n') + sys.stderr.write( + "No output vocabulary specified; using input vocabulary.\n") output_vocab = vocab else: output_vocab = load_vocab(options.output_vocab) @@ -275,4 +368,4 @@ if __name__ == '__main__': parser = create_parser() options = parser.parse_args() - main(options) \ No newline at end of file + main(options) diff --git 
a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py
index 6d017602e..ed9266fd9 100755
--- a/scripts/training/rdlm/extract_vocab.py
+++ b/scripts/training/rdlm/extract_vocab.py
@@ -9,6 +9,7 @@ import sys
 import codecs
 import argparse
 from collections import Counter
+from textwrap import dedent
 
 # hack for python2/3 compatibility
 from io import open
@@ -19,37 +20,49 @@ try:
 except ImportError:
     from xml.etree import cElementTree as ET
 
+
+HELP_TEXT = dedent("""\
+    generate 5 vocabulary files from parsed corpus in moses XML format
+    [PREFIX].special: around 40 symbols reserved for RDLM
+    [PREFIX].preterminals: preterminal symbols
+    [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)
+    [PREFIX].terminals: terminal symbols
+    [PREFIX].all: all of the above
+""")
+
+
 def create_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=HELP_TEXT)
 
-    help_text = "generate 5 vocabulary files from parsed corpus in moses XML format\n"
-    help_text += "  [PREFIX].special: around 40 symbols reserved for RDLM\n";
-    help_text += "  [PREFIX].preterminals: preterminal symbols\n";
-    help_text += "  [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)\n";
-    help_text += "  [PREFIX].terminals: terminal symbols\n";
-    help_text += "  [PREFIX].all: all of the above\n"
-
-    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text)
-
-    parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
-                        help='input text (default: standard input).')
-    parser.add_argument('--output', '-o', type=str, default='vocab', metavar='PREFIX',
-                        help='output prefix (default: "vocab")')
-    parser.add_argument('--ptkvz', action="store_true",
-                        help='special rule for German dependency trees: attach separable verb prefixes to verb')
+    parser.add_argument(
+        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
+        metavar='PATH',
+        help="Input text (default: standard input).")
+    parser.add_argument(
+        '--output', '-o', type=str, default='vocab', metavar='PREFIX',
+        help="Output prefix (default: 'vocab')")
+    parser.add_argument(
+        '--ptkvz', action="store_true",
+        help=(
+            "Special rule for German dependency trees: attach separable "
+            "verb prefixes to verb."))
 
     return parser
 
-def escape_text(s):
-    s = s.replace('|','&#124;') # factor separator
-    s = s.replace('[','&#91;') # syntax non-terminal
-    s = s.replace(']','&#93;') # syntax non-terminal
-    s = s.replace('\'','&apos;') # xml special character
-    s = s.replace('"','&quot;') # xml special character
+def escape_text(s):
+    s = s.replace('|', '&#124;')  # factor separator
+    s = s.replace('[', '&#91;')  # syntax non-terminal
+    s = s.replace(']', '&#93;')  # syntax non-terminal
+    s = s.replace('\'', '&apos;')  # xml special character
+    s = s.replace('"', '&quot;')  # xml special character
     return s
 
-# deterministic heuristic to get head of subtree
+
 def get_head(xml, args):
+    """Deterministic heuristic to get head of subtree."""
    head = None
     preterminal = None
     for child in xml:
@@ -67,6 +80,7 @@ def get_head(xml, args):
 
     return head, preterminal
 
+
 def get_vocab(xml, args):
 
     if len(xml):
@@ -88,6 +102,7 @@
                 continue
             get_vocab(child, args)
 
+
 def main(args):
 
     global heads
@@ -111,10 +126,24 @@ def main(args):
         get_vocab(xml, args)
         i += 1
 
-    special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']
+    special_tokens = [
+        '<unk>',
+        '<null>',
+        '<null_label>',
+        '<null_head>',
+        '<head_label>',
+        '<root_label>',
+        '<start_label>',
+        '<stop_label>',
+        '<head_head>',
+        '<root_head>',
+        '<start_head>',
+        '<dummy_head>',
+        '<stop_head>',
+    ]
 
     for 
i in range(30): - special_tokens.append(''.format(i)) + special_tokens.append(''.format(i)) f = open(args.output + '.special', 'w', encoding='UTF-8') for item in special_tokens: @@ -158,7 +187,6 @@ def main(args): f.close() - if __name__ == '__main__': if sys.version_info < (3, 0): diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py index 15e56c430..ae57e8dfc 100755 --- a/scripts/training/rdlm/train_rdlm.py +++ b/scripts/training/rdlm/train_rdlm.py @@ -9,7 +9,6 @@ import subprocess import sys import os import codecs -import copy # ../bilingual-lm sys.path.append(os.path.join(os.path.dirname(sys.path[0]), 'bilingual-lm')) @@ -17,143 +16,224 @@ import train_nplm import extract_vocab import extract_syntactic_ngrams -logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG) +logging.basicConfig( + format='%(asctime)s %(levelname)s: %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG) parser = argparse.ArgumentParser() -parser.add_argument("--working-dir", dest="working_dir", metavar="PATH") -parser.add_argument("--corpus", dest="corpus_stem", metavar="PATH", help="input file") -parser.add_argument("--nplm-home", dest="nplm_home", metavar="PATH", help="location of NPLM", required=True) -parser.add_argument("--epochs", dest="epochs", type=int, metavar="INT", help="number of training epochs (default: %(default)s)") -parser.add_argument("--up-context-size", dest="up_context_size", type=int, metavar="INT", help="size of ancestor context (default: %(default)s)") -parser.add_argument("--left-context-size", dest="left_context_size", type=int, metavar="INT", help="size of sibling context (left) (default: %(default)s)") -parser.add_argument("--right-context-size", dest="right_context_size", type=int, metavar="INT", help="size of sibling context (right) (default: %(default)s)") -parser.add_argument("--mode", dest="mode", choices=['head', 'label'], help="type of RDLM to train (both are required for decoding)", required=True) -parser.add_argument("--minibatch-size", dest="minibatch_size", type=int, metavar="INT", help="minibatch size (default: %(default)s)") -parser.add_argument("--noise", dest="noise", type=int, metavar="INT", help="number of noise samples for NCE (default: %(default)s)") -parser.add_argument("--hidden", dest="hidden", type=int, metavar="INT", help="size of hidden layer (0 for single hidden layer) (default: %(default)s)") -parser.add_argument("--input-embedding", dest="input_embedding", type=int, metavar="INT", help="size of input embedding layer (default: %(default)s)") -parser.add_argument("--output-embedding", dest="output_embedding", type=int, metavar="INT", help="size of output embedding layer (default: %(default)s)") -parser.add_argument("--threads", "-t", dest="threads", type=int, metavar="INT", help="number of threads (default: %(default)s)") -parser.add_argument("--output-model", dest="output_model", metavar="PATH", help="name of output model (default: %(default)s)") -parser.add_argument("--output-dir", dest="output_dir", metavar="PATH", help="output directory (default: same as working-dir)") -parser.add_argument("--config-options-file", dest="config_options_file", metavar="PATH") -parser.add_argument("--log-file", dest="log_file", metavar="PATH", help="log file to write to (default: %(default)s)") -parser.add_argument("--validation-corpus", dest="validation_corpus", metavar="PATH", help="validation file (default: %(default)s)") 
-parser.add_argument("--activation-function", dest="activation_fn", choices=['identity', 'rectifier', 'tanh', 'hardtanh'], help="activation function (default: %(default)s)") -parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar="FLOAT", help="learning rate (default: %(default)s)") -parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)") -parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)") -parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)") -parser.add_argument("--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)") +parser.add_argument( + "--working-dir", dest="working_dir", metavar="PATH") +parser.add_argument( + "--corpus", dest="corpus_stem", metavar="PATH", help="Input file.") +parser.add_argument( + "--nplm-home", dest="nplm_home", metavar="PATH", required=True, + help="Location of NPLM.") +parser.add_argument( + "--epochs", dest="epochs", type=int, metavar="INT", + help="Number of training epochs (default: %(default)s).") +parser.add_argument( + "--up-context-size", dest="up_context_size", type=int, metavar="INT", + help="Size of ancestor context (default: %(default)s).") +parser.add_argument( + "--left-context-size", dest="left_context_size", type=int, metavar="INT", + help="Size of sibling context (left) (default: %(default)s).") +parser.add_argument( + "--right-context-size", dest="right_context_size", type=int, + metavar="INT", + help="Size of sibling context (right) (default: %(default)s).") +parser.add_argument( + "--mode", dest="mode", choices=['head', 'label'], required=True, + help="Type of RDLM to train (both are required for decoding).") +parser.add_argument( + "--minibatch-size", dest="minibatch_size", type=int, metavar="INT", + help="Minibatch size (default: %(default)s).") +parser.add_argument( + "--noise", dest="noise", type=int, metavar="INT", + help="Number of noise samples for NCE (default: %(default)s).") +parser.add_argument( + "--hidden", dest="hidden", type=int, metavar="INT", + help=( + "Size of hidden layer (0 for single hidden layer) " + "(default: %(default)s)")) +parser.add_argument( + "--input-embedding", dest="input_embedding", type=int, metavar="INT", + help="Size of input embedding layer (default: %(default)s).") +parser.add_argument( + "--output-embedding", dest="output_embedding", type=int, metavar="INT", + help="Size of output embedding layer (default: %(default)s).") +parser.add_argument( + "--threads", "-t", dest="threads", type=int, metavar="INT", + help="Number of threads (default: %(default)s).") +parser.add_argument( + "--output-model", dest="output_model", metavar="PATH", + help="Name of output model (default: %(default)s).") +parser.add_argument( + "--output-dir", dest="output_dir", metavar="PATH", + help="Output directory (default: same as working-dir).") +parser.add_argument( + "--config-options-file", dest="config_options_file", metavar="PATH") +parser.add_argument( + "--log-file", dest="log_file", metavar="PATH", + help="Log file to write to (default: %(default)s).") +parser.add_argument( + "--validation-corpus", dest="validation_corpus", metavar="PATH", + help="Validation file (default: %(default)s).") +parser.add_argument( + "--activation-function", dest="activation_fn", + 
choices=['identity', 'rectifier', 'tanh', 'hardtanh'], + help="Activation function (default: %(default)s).") +parser.add_argument( + "--learning-rate", dest="learning_rate", type=float, metavar="FLOAT", + help="Learning rate (default: %(default)s).") +parser.add_argument( + "--input-words-file", dest="input_words_file", metavar="PATH", + help="Input vocabulary (default: %(default)s).") +parser.add_argument( + "--output-words-file", dest="output_words_file", metavar="PATH", + help="Output vocabulary (default: %(default)s).") +parser.add_argument( + "--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", + help="Input vocabulary size (default: %(default)s).") +parser.add_argument( + "--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", + help="Output vocabulary size (default: %(default)s).") parser.set_defaults( - working_dir = "working" - ,corpus_stem = "train" - ,nplm_home = "/home/bhaddow/tools/nplm" - ,epochs = 2 - ,up_context_size = 2 - ,left_context_size = 3 - ,right_context_size = 0 - ,minibatch_size=1000 - ,noise=100 - ,hidden=0 - ,mode='head' - ,input_embedding=150 - ,output_embedding=750 - ,threads=4 - ,output_model = "train" - ,output_dir = None - ,config_options_file = "config" - ,log_file = "log" - ,validation_corpus = None - ,activation_fn = "rectifier" - ,learning_rate = 1 - ,input_words_file = None - ,output_words_file = None - ,input_vocab_size = 500000 - ,output_vocab_size = 500000 - ) + working_dir="working", + corpus_stem="train", + nplm_home="/home/bhaddow/tools/nplm", + epochs=2, + up_context_size=2, + left_context_size=3, + right_context_size=0, + minibatch_size=1000, + noise=100, + hidden=0, + mode='head', + input_embedding=150, + output_embedding=750, + threads=4, + output_model="train", + output_dir=None, + config_options_file="config", + log_file="log", + validation_corpus=None, + activation_fn="rectifier", + learning_rate=1, + input_words_file=None, + output_words_file=None, + input_vocab_size=500000, + output_vocab_size=500000) + def prepare_vocabulary(options): - vocab_prefix = os.path.join(options.working_dir, 'vocab') - extract_vocab_options = extract_vocab.create_parser().parse_args(['--input', options.corpus_stem, '--output', vocab_prefix]) - extract_vocab.main(extract_vocab_options) + vocab_prefix = os.path.join(options.working_dir, 'vocab') + extract_vocab_options = extract_vocab.create_parser().parse_args( + ['--input', options.corpus_stem, '--output', vocab_prefix]) + extract_vocab.main(extract_vocab_options) - if options.input_words_file is None: - options.input_words_file = vocab_prefix + '.input' - orig = vocab_prefix + '.all' - filtered_vocab = open(orig).readlines() - if options.input_vocab_size: - filtered_vocab = filtered_vocab[:options.input_vocab_size] - open(options.input_words_file,'w').writelines(filtered_vocab) + if options.input_words_file is None: + options.input_words_file = vocab_prefix + '.input' + orig = vocab_prefix + '.all' + filtered_vocab = open(orig).readlines() + if options.input_vocab_size: + filtered_vocab = filtered_vocab[:options.input_vocab_size] + open(options.input_words_file, 'w').writelines(filtered_vocab) + + if options.output_words_file is None: + options.output_words_file = vocab_prefix + '.output' + if options.mode == 'label': + blacklist = [ + ' output_file +Note that the structure is built based on fields 9 and 10 (projective HEAD +and RELATION), which not all parsers produce. 
+ +Usage: conll2mosesxml.py [--brackets] < input_file > output_file +""" from __future__ import print_function, unicode_literals import sys import re import codecs -from collections import namedtuple,defaultdict +from collections import ( + namedtuple, + defaultdict, + ) from lxml import etree as ET -Word = namedtuple('Word', ['pos','word','lemma','tag','head','func', 'proj_head', 'proj_func']) +Word = namedtuple( + 'Word', + ['pos', 'word', 'lemma', 'tag', 'head', 'func', 'proj_head', 'proj_func']) + def main(output_format='xml'): sentence = [] for line in sys.stdin: - # process sentence + # Process sentence. if line == "\n": - sentence.insert(0,[]) + sentence.insert(0, []) if is_projective(sentence): - write(sentence,output_format) + write(sentence, output_format) else: - sys.stderr.write(' '.join(w.word for w in sentence[1:]) + '\n') + sys.stderr.write( + ' '.join(w.word for w in sentence[1:]) + '\n') sys.stdout.write('\n') sentence = [] continue try: - pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = line.split() - except ValueError: # word may be unicode whitespace - pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = re.split(' *\t*',line.strip()) + ( + pos, + word, + lemma, + tag, + tag2, + morph, + head, + func, + proj_head, + proj_func, + ) = line.split() + except ValueError: # Word may be unicode whitespace. + ( + pos, + word, + lemma, + tag, + tag2, + morph, + head, + func, + proj_head, + proj_func, + ) = re.split(' *\t*', line.strip()) word = escape_special_chars(word) lemma = escape_special_chars(lemma) @@ -46,17 +80,20 @@ def main(output_format='xml'): proj_head = head proj_func = func - sentence.append(Word(int(pos), word, lemma, tag2,int(head), func, int(proj_head), proj_func)) + sentence.append( + Word( + int(pos), word, lemma, tag2, int(head), func, int(proj_head), + proj_func)) -# this script performs the same escaping as escape-special-chars.perl in Moses. -# most of it is done in function write(), but quotation marks need to be processed first +# This script performs the same escaping as escape-special-chars.perl in +# Moses. Most of it is done in function write(), but quotation marks need +# to be processed first. 
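+# For example, an apostrophe in the input ("don't") is written into the XML
+# as "don&apos;t".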
 def escape_special_chars(line):
-
-    line = line.replace('\'','&apos;') # xml
-    line = line.replace('"','&quot;') # xml
-    line = line.replace('[','&#91;') # syntax non-terminal
-    line = line.replace(']','&#93;') # syntax non-terminal
+    line = line.replace('\'', '&apos;')  # xml
+    line = line.replace('"', '&quot;')  # xml
+    line = line.replace('[', '&#91;')  # syntax non-terminal
+    line = line.replace(']', '&#93;')  # syntax non-terminal
 
     return line
 
@@ -64,7 +101,7 @@ def escape_special_chars(line):
 # make a check if structure is projective
 def is_projective(sentence):
     dominates = defaultdict(set)
-    for i,w in enumerate(sentence):
+    for i, w in enumerate(sentence):
         dominates[i].add(i)
         if not i:
             continue
@@ -77,7 +114,7 @@
     for i in dominates:
         dependents = dominates[i]
-        if max(dependents) - min(dependents) != len(dependents)-1:
+        if max(dependents) - min(dependents) != len(dependents) - 1:
             sys.stderr.write("error: non-projective structure.\n")
             return False
     return True
@@ -86,24 +123,28 @@ def write(sentence, output_format='xml'):
 
     if output_format == 'xml':
-        tree = create_subtree(0,sentence)
-        out = ET.tostring(tree, encoding = 'UTF-8').decode('UTF-8')
+        tree = create_subtree(0, sentence)
+        out = ET.tostring(tree, encoding='UTF-8').decode('UTF-8')
 
     if output_format == 'brackets':
-        out = create_brackets(0,sentence)
+        out = create_brackets(0, sentence)
 
-    out = out.replace('|','&#124;') # factor separator
+    out = out.replace('|', '&#124;')  # factor separator
 
-    out = out.replace('&amp;apos;','&apos;') # lxml is buggy if input is escaped
-    out = out.replace('&amp;quot;','&quot;') # lxml is buggy if input is escaped
-    out = out.replace('&amp;#91;','&#91;') # lxml is buggy if input is escaped
-    out = out.replace('&amp;#93;','&#93;') # lxml is buggy if input is escaped
+    # lxml is buggy if input is escaped:
+    out = out.replace('&amp;apos;', '&apos;')
+    # lxml is buggy if input is escaped:
+    out = out.replace('&amp;quot;', '&quot;')
+    # lxml is buggy if input is escaped:
+    out = out.replace('&amp;#91;', '&#91;')
+    # lxml is buggy if input is escaped:
+    out = out.replace('&amp;#93;', '&#93;')
 
     print(out)
 
-# write node in Moses XML format
-def create_subtree(position, sentence):
+def create_subtree(position, sentence):
+    """Write node in Moses XML format."""
     element = ET.Element('tree')
 
     if position:
@@ -111,7 +152,7 @@
     else:
         element.set('label', 'sent')
 
-    for i in range(1,position):
+    for i in range(1, position):
         if sentence[i].proj_head == position:
             element.append(create_subtree(i, sentence))
 
@@ -144,7 +185,7 @@ def create_brackets(position, sentence):
     else:
         element = "[ sent "
 
-    for i in range(1,position):
+    for i in range(1, position):
         if sentence[i].proj_head == position:
             element += create_brackets(i, sentence)
 
@@ -167,7 +208,7 @@ def create_brackets(position, sentence):
     return element
 
 if __name__ == '__main__':
-    if sys.version_info < (3,0,0):
+    if sys.version_info < (3, 0, 0):
         sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
         sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
         sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
diff --git a/scripts/training/wrappers/mosesxml2brackets.py b/scripts/training/wrappers/mosesxml2brackets.py
index bd876f087..6ff1d20c9 100755
--- a/scripts/training/wrappers/mosesxml2brackets.py
+++ b/scripts/training/wrappers/mosesxml2brackets.py
@@ -10,17 +10,21 @@ import codecs
 
 from lxml import etree as ET
 
+
 def escape(word):
-    word = word.replace('|','&#124;') # factor separator
-    word = word.replace('[','&#91;') # syntax non-terminal
-    word = word.replace(']','&#93;') # syntax non-terminal
-    word = word.replace('\'','&apos;')
-    word = word.replace('\"','&quot;')
+    # Factor separator:
+    word = word.replace('|', '&#124;')
+    # Syntax non-terminal:
+    word = word.replace('[', '&#91;')
+    # Syntax non-terminal:
+    word = word.replace(']', '&#93;')
+    word = word.replace('\'', '&apos;')
+    word = word.replace('\"', '&quot;')
 
     return word
 
 
-def make_brackets(xml):
+def make_brackets(xml):
 
     out = ' [' + xml.get('label')
 
     if xml.text and xml.text.strip():