Fix more Python lint.

Most of the complaints fixed here were from Pocketlint, but many were also
from Syntastic, the Vim plugin.
Jeroen Vermeulen 2015-05-16 17:26:56 +07:00
parent c07ade8142
commit 61162dd242
13 changed files with 1186 additions and 875 deletions

View File

@ -1,53 +1,48 @@
#!/usr/bin/env python2
#
# Version of ConfigParser which accepts default values
#
"""Version of ConfigParser which accepts default values."""
import ConfigParser
class Config:
def __init__(self,filename):
self.config = ConfigParser.SafeConfigParser()
cfh = open(filename)
self.config.readfp(cfh)
cfh.close()
"""Version of ConfigParser which accepts default values."""
def get(self,section,name,default=None):
if default == None or self.config.has_option(section,name):
return self.config.get(section,name)
else:
return default
def __init__(self, filename):
self.config = ConfigParser.SafeConfigParser()
cfh = open(filename)
self.config.readfp(cfh)
cfh.close()
def getint(self,section,name,default=None):
if default == None or self.config.has_option(section,name):
return self.config.getint(section,name)
else:
return default
def get(self, section, name, default=None):
if default is None or self.config.has_option(section, name):
return self.config.get(section, name)
else:
return default
def getint(self, section, name, default=None):
if default is None or self.config.has_option(section, name):
return self.config.getint(section, name)
else:
return default
def getboolean(self,section,name,default=None):
if default == None or self.config.has_option(section,name):
return self.config.getboolean(section,name)
else:
return default
def getfloat(self,section,name,default=None):
if default == None or self.config.has_option(section,name):
return self.config.getfloat(section,name)
else:
return default
def __str__(self):
ret = ""
for section in self.config.sections():
for option in self.config.options(section):
ret = ret + "%s:%s = %s\n" % (section,option,self.config.get(section,option))
return ret
def getboolean(self, section, name, default=None):
if default is None or self.config.has_option(section, name):
return self.config.getboolean(section, name)
else:
return default
def getfloat(self, section, name, default=None):
if default is None or self.config.has_option(section, name):
return self.config.getfloat(section, name)
else:
return default
def __str__(self):
ret = ""
for section in self.config.sections():
for option in self.config.options(section):
ret = ret + "%s:%s = %s\n" % (
section, option, self.config.get(section, option))
return ret
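
For illustration, a minimal usage sketch of the wrapper above (the file name, section, and option names are hypothetical):

    # Hypothetical: read filter.cfg, falling back to defaults for missing options.
    config = Config("filter.cfg")
    threshold = config.getfloat("score", "threshold", 0.5)  # 0.5 if option absent
    strategy = config.get("general", "strategy", "Score")   # "Score" if option absent
    print config
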

View File

@ -1,156 +1,171 @@
#!/usr/bin/env python2
#
# Filter a parallel corpus
#
"""Filter a parallel corpus."""
import heapq
import logging
import math
import optparse
import random
import sys
from defaultconfig import Config
logging.basicConfig(format = "%(asctime)-15s %(message)s")
logging.basicConfig(format="%(asctime)-15s %(message)s")
log = logging.getLogger("filter")
log.setLevel(logging.DEBUG)
class FilterStrategy(object):
def __init__(self,config):
pass
def filter(self,source,target):
return True
class FilterStrategy(object):
def __init__(self, config):
pass
def filter(self, source, target):
return True
class RandomFilterStrategy(FilterStrategy):
def __init__(self,config):
self.threshold = config.getfloat("random", "threshold", 0.1)
random.seed()
def __init__(self, config):
self.threshold = config.getfloat("random", "threshold", 0.1)
random.seed()
def filter(self, source, target):
return random.random() < self.threshold
def filter(self, source, target):
return random.random() < self.threshold
class ScoreFilterStrategy(FilterStrategy):
"""Filter strategy that is based on a file with sentence scores. There are three
possible ways of specifying how to filter:
i) threshold - filter all sentence pairs whose score is less than the threshold
ii) proportion - filter all but a certain proportion (eg a tenth) of the sentences
"""Filter strategy that is based on a file with sentence scores.
There are three possible ways of specifying how to filter:
i) threshold - filter all sentence pairs whose score is less than the
threshold.
ii) proportion - filter all but a certain proportion (eg a tenth) of the
sentences.
iii) count - filter all but a given count of the sentences.
"""
def __init__(self,config):
section = "score"
self.score_file = config.get(section,"score_file")
self.ignore_score = config.get(section, "ignore_score", "99999")
option_names = ("threshold", "proportion", "count")
options = [config.config.has_option(section,o) for o in option_names]
if sum(options) != 1:
raise RuntimeError("Must specify exactly one of %s for score filter" % str(option_names))
if options[0]:
# threshold
self.threshold = config.getfloat(section,option_names[0])
else:
# proportion or count
if options[2]:
count = config.getint(section,option_names[2])
else:
# need to count entries
count = 0
ignore_count = 0
for line in open(self.score_file):
if line[:-1] != self.ignore_score:
count = count + 1
else:
ignore_count = ignore_count + 1
count = int(count * config.getfloat(section,option_names[1]))
log.info("Retaining at least %d entries and ignoring %d" % (count, ignore_count))
# Find the threshold
self.threshold = sorted(\
[float(line[:-1]) for line in open(self.score_file)], reverse=True)[ignore_count + count]
#self.threshold = heapq.nlargest(count, \
# [float(line[:-1]) for line in open(self.score_file)])[-1]
def __init__(self, config):
section = "score"
self.score_file = config.get(section, "score_file")
self.ignore_score = config.get(section, "ignore_score", "99999")
option_names = ("threshold", "proportion", "count")
options = [config.config.has_option(section, o) for o in option_names]
if sum(options) != 1:
raise RuntimeError(
"Must specify exactly one of %s for score filter"
% str(option_names))
if options[0]:
# Threshold.
self.threshold = config.getfloat(section, option_names[0])
else:
# proportion or count
if options[2]:
count = config.getint(section, option_names[2])
else:
# Need to count entries.
count = 0
ignore_count = 0
for line in open(self.score_file):
if line[:-1] != self.ignore_score:
count += 1
else:
ignore_count = ignore_count + 1
count = int(count * config.getfloat(section, option_names[1]))
log.info(
"Retaining at least %d entries and ignoring %d"
% (count, ignore_count))
# Find the threshold.
self.threshold = sorted([
float(line[:-1])
for line in open(self.score_file)],
reverse=True)[ignore_count + count]
# import heapq
# self.threshold = heapq.nlargest(
# count,
# [float(line[:-1]) for line in open(self.score_file)])[-1]
self.sfh = open(self.score_file)
log.info("Thresholding scores at " + str(self.threshold))
self.sfh = open(self.score_file)
log.info("Thresholding scores at " + str(self.threshold))
def filter(self, source, target):
score = self.sfh.readline()
if not score:
raise RuntimeError("score file truncated")
return (
score[:-1] == self.ignore_score or
float(score[:-1]) >= self.threshold
)
def filter(self,source,target):
score = self.sfh.readline()
if not score:
raise RuntimeError("score file truncated")
return score[:-1] == self.ignore_score or float(score[:-1]) >= self.threshold
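
As a sketch, a configuration for the count variant of this filter might look like the following (paths and numbers are invented; exactly one of threshold, proportion, or count may be given):

    [general]
    source_language = fr
    target_language = en
    input_stem = corpus/train
    output_stem = corpus/train.filtered
    strategy = Score

    [score]
    score_file = corpus/train.scores
    ignore_score = 99999
    count = 100000
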
def main():
parser = optparse.OptionParser(usage = "Usage: %prog [options] config-file")
(options,args) = parser.parse_args()
if len(args) < 1:
parser.error("No configuration file specified")
parser = optparse.OptionParser(usage="Usage: %prog [options] config-file")
(options, args) = parser.parse_args()
if len(args) < 1:
parser.error("No configuration file specified")
log.info("Loading configuration from " + args[0])
config = Config(args[0])
log.debug("Configuration:\n" + str(config))
log.info("Loading configuration from " + args[0])
config = Config(args[0])
log.debug("Configuration:\n" + str(config))
# Required general parameters
source_lang = config.get("general", "source_language")
target_lang = config.get("general", "target_language")
input_stem = config.get("general", "input_stem")
output_stem = config.get("general", "output_stem")
strategy = config.get("general", "strategy", "")
# Required general parameters
source_lang = config.get("general", "source_language")
target_lang = config.get("general", "target_language")
input_stem = config.get("general", "input_stem")
output_stem = config.get("general", "output_stem")
strategy = config.get("general", "strategy", "")
# Optional general parameters
alignment_stem = config.get("general", "alignment_stem", "")
alignment_type = config.get("general", "alignment_type", "grow-diag-final-and")
domain_file_in = config.get("general", "domain_file", "")
domain_file_out = config.get("general", "domain_file_out", "")
# Optional general parameters
alignment_stem = config.get("general", "alignment_stem", "")
alignment_type = config.get(
"general", "alignment_type", "grow-diag-final-and")
domain_file_in = config.get("general", "domain_file", "")
domain_file_out = config.get("general", "domain_file_out", "")
strategy_class = globals()[strategy + "FilterStrategy"]
strategy = strategy_class(config)
strategy_class = globals()[strategy + "FilterStrategy"]
strategy = strategy_class(config)
source_input_fh = open(input_stem + "." + source_lang)
target_input_fh = open(input_stem + "." + target_lang)
source_output_fh = open(output_stem + "." + source_lang, "w")
target_output_fh = open(output_stem + "." + target_lang, "w")
source_input_fh = open(input_stem + "." + source_lang)
target_input_fh = open(input_stem + "." + target_lang)
source_output_fh = open(output_stem + "." + source_lang, "w")
target_output_fh = open(output_stem + "." + target_lang, "w")
alignment_input_fh = None
alignment_output_fh = None
if alignment_stem:
alignment_input_fh = open(alignment_stem + "." + alignment_type)
alignment_output_fh = open(output_stem + "." + alignment_type,"w")
alignment_input_fh = None
alignment_output_fh = None
if alignment_stem:
alignment_input_fh = open(alignment_stem + "." + alignment_type)
alignment_output_fh = open(output_stem + "." + alignment_type, "w")
domain_boundaries = {}
if domain_file_in:
dfh = open(domain_file_in)
for line in dfh:
line_no,name = line[:-1].split()
domain_boundaries[int(line_no)] = name
domain_output_fh = None
if domain_file_out:
domain_output_fh = open(domain_file_out, "w")
domain_boundaries = {}
if domain_file_in:
dfh = open(domain_file_in)
for line in dfh:
line_no, name = line[:-1].split()
domain_boundaries[int(line_no)] = name
#log.info(str(domain_boundaries))
domain_output_fh = None
if domain_file_out:
domain_output_fh = open(domain_file_out, "w")
# log.info(str(domain_boundaries))
retained = 0
line_no = 0
for source_line in source_input_fh:
target_line = target_input_fh.readline()
if alignment_input_fh:
align_line = alignment_input_fh.readline()
if strategy.filter(source_line, target_line):
retained = retained + 1
print>>source_output_fh, source_line,
print>>target_output_fh, target_line,
if alignment_input_fh:
print>>alignment_output_fh, align_line,
line_no = line_no + 1
# Check if this is a domain boundary.
if domain_boundaries and line_no in domain_boundaries:
print >>domain_output_fh, (
"%d %s" % (retained, domain_boundaries[line_no]))
log.info("Lines retained: %d", retained)
retained = 0
line_no = 0
for source_line in source_input_fh:
target_line = target_input_fh.readline()
if alignment_input_fh:
align_line = alignment_input_fh.readline()
if strategy.filter(source_line,target_line):
retained = retained + 1
print>>source_output_fh, source_line,
print>>target_output_fh, target_line,
if alignment_input_fh:
print>>alignment_output_fh, align_line,
line_no = line_no + 1
# check if this is a domain boundary
if domain_boundaries and domain_boundaries.has_key(line_no):
print>>domain_output_fh,"%d %s" % (retained,domain_boundaries[line_no])
log.info("Lines retained: %d" % retained)
if __name__ == "__main__":
main()
main()

View File

@ -2,73 +2,73 @@
# compute BLEU scores with confidence intervals via bootstrap resampling
# written by Ulrich Germann
import math,sys,os
from argparse import ArgumentParser
from operator import itemgetter
from random import randint
from operator import itemgetter
import math
import os
from random import randint
import sys
def count_ngrams(snt,max_n):
def count_ngrams(snt, max_n):
"""
Return a dictionary of ngram counts (up to length /max_n/)
for sentence (list of words) /snt/.
Return a dictionary of ngram counts (up to length /max_n/)
for sentence (list of words) /snt/.
"""
ret = {}
for i in xrange(len(snt)):
for k in xrange(i+1,min(i+max_n+1,len(snt)+1)):
for k in xrange(i + 1, min(i + max_n + 1, len(snt) + 1)):
key = tuple(snt[i:k])
ret[key] = ret.get(key,0) + 1
pass
pass
ret[key] = ret.get(key, 0) + 1
return ret
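
A quick sketch of the return value for a toy sentence:

    # count_ngrams(["a", "b", "a"], 2) would return
    # {("a",): 2, ("b",): 1, ("a", "b"): 1, ("b", "a"): 1}
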
def max_counts(ng1,ng2):
def max_counts(ng1, ng2):
"""
Return a dictionary of ngram counts such that
Return a dictionary of ngram counts such that
each count is the greater of the two individual counts
for each ngram in the input ngram count dictionaries
for each ngram in the input ngram count dictionaries
/ng1/ and /ng2/.
"""
ret = ng1.copy()
for k,v in ng2.items():
ret[k] = max(ret.get(k,0),v)
pass
for k, v in ng2.items():
ret[k] = max(ret.get(k, 0), v)
return ret
def ng_hits(hyp,ref,max_n):
def ng_hits(hyp, ref, max_n):
"""
return a list of ngram counts such that each ngram count
is the minimum of the counts in hyp and ref, up to ngram
length /max_n/
Return a list of ngram counts such that each ngram count
is the minimum of the counts in hyp and ref, up to ngram
length /max_n/.
"""
ret = [0 for i in xrange(max_n)]
for ng,cnt in hyp.items():
for ng, cnt in hyp.items():
k = ng
if len(k) <= max_n:
ret[len(k)-1] += min(cnt,ref.get(ng,0))
pass
pass
ret[len(k) - 1] += min(cnt, ref.get(ng, 0))
return ret
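
And a sketch of the clipped counts this produces:

    # hyp = count_ngrams("the the cat".split(), 2)
    # ref = count_ngrams("the cat".split(), 2)
    # ng_hits(hyp, ref, 2) -> [2, 1]
    # ("the" is clipped to its single occurrence in the reference.)
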
class BleuScore:
def __init__(self,hyp,ref,max_n=4,bootstrap=1000):
# print len(hyp.ngrams),len(ref.ngrams),"X"
self.hits = [ng_hits(hyp.ngrams[i],ref.ngrams[i],max_n)
for i in xrange(len(hyp.ngrams))]
self.max_n = max_n
self.hyp = hyp
self.ref = ref
self.lower = None
self.upper = None
def __init__(self, hyp, ref, max_n=4, bootstrap=1000):
# print len(hyp.ngrams), len(ref.ngrams), "X"
self.hits = [
ng_hits(hyp.ngrams[i], ref.ngrams[i], max_n)
for i in xrange(len(hyp.ngrams))]
self.max_n = max_n
self.hyp = hyp
self.ref = ref
self.lower = None
self.upper = None
self.median = None
self.bootstrap = [self.score([randint(0,len(hyp.snt)-1) for s in hyp.snt])
for i in xrange(1000)]
self.bootstrap = [
self.score([randint(0, len(hyp.snt) - 1) for s in hyp.snt])
for i in xrange(1000)]
self.bootstrap.sort()
self.actual = self.score([i for i in xrange(len(hyp.snt))])
return
def score(self,sample):
hits = [0 for i in xrange(self.max_n)]
def score(self, sample):
hits = [0 for i in xrange(self.max_n)]
self.hyplen = 0
self.reflen = 0
for i in sample:
@ -76,94 +76,89 @@ class BleuScore:
self.reflen += len(self.ref.snt[i])
for n in xrange(self.max_n):
hits[n] += self.hits[i][n]
pass
pass
self.prec = [float(hits[n])/(self.hyplen-n*len(sample))
self.prec = [float(hits[n]) / (self.hyplen - n * len(sample))
for n in xrange(self.max_n)]
ret = sum([math.log(x) for x in self.prec])/self.max_n
self.BP = min(1,math.exp(1.-float(self.reflen)/float(self.hyplen)))
ret = sum([math.log(x) for x in self.prec]) / self.max_n
self.BP = min(
1, math.exp(1. - float(self.reflen) / float(self.hyplen)))
ret += math.log(self.BP)
return math.exp(ret)
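
In other words, score() computes standard BLEU over the sampled sentence indices:

    BLEU = BP * exp((1/N) * sum(log p_n, n = 1..N))
    BP   = min(1, exp(1 - reflen / hyplen))

where p_n is the clipped n-gram precision; the denominator hyplen - n * len(sample) is the number of (n+1)-grams in the sample, since n is zero-based in the code.
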
class Document:
def __init__(self,fname=None):
def __init__(self, fname=None):
self.fname = fname
if fname:
self.snt = [line.strip().split() for line in open(fname)]
self.ngrams = [count_ngrams(snt,4) for snt in self.snt]
self.ngrams = [count_ngrams(snt, 4) for snt in self.snt]
else:
self.snt = None
self.ngrams = None
pass
return
def merge(self,R):
def merge(self, R):
self.fname = "multi-ref"
self.ngrams = [x for x in R[0].ngrams]
self.snt = [x for x in R[0].snt]
for i in xrange(len(R[0].ngrams)):
for k in xrange(1,len(R)):
self.ngrams[i] = max_counts(self.ngrams[i],R[k].ngrams[i])
pass
pass
return
for k in xrange(1, len(R)):
self.ngrams[i] = max_counts(self.ngrams[i], R[k].ngrams[i])
def update(self,hyp,R):
for i in xrange(len(hyp.snt)):
clen = len(hyp.snt[i])
def update(self, hyp, R):
for i, hyp_snt in enumerate(hyp.snt):
clen = len(hyp_snt)
K = 0
for k in xrange(1,len(R)):
assert len(R[k].snt) == len(hyp.snt),\
"Mismatch in numer of sentences " +\
"between reference and candidate"
if abs(len(R[k].snt[i]) - clen) == abs(len(R[K].snt[i]) - clen):
if len(R[k].snt[i]) < len(R[K].snt[i]):
for k in xrange(1, len(R)):
k_snt = R[k].snt[i]
assert len(R[k].snt) == len(hyp.snt), (
"Mismatch in number of sentences " +
"between reference and candidate")
if abs(len(k_snt) - clen) == abs(len(R[K].snt[i]) - clen):
if len(k_snt) < len(R[K].snt[i]):
K = k
pass
pass
elif abs(len(R[k].snt[i]) - clen) < abs(len(R[K].snt[i]) - clen):
elif abs(len(k_snt) - clen) < abs(len(R[K].snt[i]) - clen):
K = k
pass
pass
self.snt[i] = R[K].snt[i]
pass
return
pass
if __name__ == "__main__":
argparser = ArgumentParser()
argparser.add_argument("-r","--ref",nargs='+',help="reference translation(s)")
argparser.add_argument("-c","--cand",nargs='+',help="candidate translations")
argparser.add_argument("-i","--individual",action='store_true',
help="compute BLEU scores for individual references")
argparser.add_argument("-b","--bootstrap",type=int,default=1000,
help="sample size for bootstrap resampling")
argparser.add_argument("-a","--alpha",help="1-alpha = confidence interval",type=float,default=.05)
argparser.add_argument(
"-r", "--ref", nargs='+', help="Reference translation(s).")
argparser.add_argument(
"-c", "--cand", nargs='+', help="Candidate translations.")
argparser.add_argument(
"-i", "--individual", action='store_true',
help="Compute BLEU scores for individual references.")
argparser.add_argument(
"-b", "--bootstrap", type=int, default=1000,
help="Sample size for bootstrap resampling.")
argparser.add_argument(
"-a", "--alpha", type=float, default=.05,
help="1-alpha = confidence interval.")
args = argparser.parse_args(sys.argv[1:])
R = [ Document(fname) for fname in args.ref]
C = [ Document(fname) for fname in args.cand]
Rx = Document() # for multi-reference BLEU
R = [Document(fname) for fname in args.ref]
C = [Document(fname) for fname in args.cand]
Rx = Document() # for multi-reference BLEU
Rx.merge(R)
for c in C:
# compute multi-reference BLEU
Rx.update(c,R)
bleu = BleuScore(c,Rx,bootstrap=args.bootstrap)
print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s"%\
(100*bleu.actual,
os.path.basename(Rx.fname),
100*bleu.bootstrap[int((args.alpha/2)*args.bootstrap)],
100*bleu.bootstrap[int((1-(args.alpha/2))*args.bootstrap)],
100*bleu.bootstrap[int(.5*args.bootstrap)],
c.fname) # os.path.basename(c.fname))
Rx.update(c, R)
bleu = BleuScore(c, Rx, bootstrap=args.bootstrap)
print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % (
100 * bleu.actual,
os.path.basename(Rx.fname),
100 * bleu.bootstrap[int((args.alpha / 2) * args.bootstrap)],
100 * bleu.bootstrap[int((1 - (args.alpha / 2)) * args.bootstrap)],
100 * bleu.bootstrap[int(.5 * args.bootstrap)],
c.fname) # os.path.basename(c.fname))
if args.individual:
for r in R:
bleu = BleuScore(c,r,bootstrap=args.bootstrap)
print " %5.2f %s"%(100*bleu.actual,os.path.basename(r.fname))
# print bleu.prec,bleu.hyplen,bleu.reflen,bleu.BP
pass
pass
bleu = BleuScore(c, r, bootstrap=args.bootstrap)
print " %5.2f %s" % (
100 * bleu.actual, os.path.basename(r.fname))
# print bleu.prec, bleu.hyplen, bleu.reflen, bleu.BP
# print [sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))]) for n in xrange(4)]
pass
# print [
# sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))])
# for n in xrange(4)]
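
A quick check of the interval indices printed above, using the default settings:

    # With --bootstrap 1000 and --alpha 0.05, the printed range uses the sorted
    # bootstrap scores near indices (0.05 / 2) * 1000 = 25 and
    # (1 - 0.05 / 2) * 1000 = 975, with the median near index 500,
    # giving roughly a 95% confidence interval.
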

View File

@ -1,237 +1,225 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Python utilities for moses
#
# This package mostly wraps standard Moses utilities into pipes.
#
# Written by Ulrich Germann
#
# This package borrows from scripts written by Christian Buck
#
# The package assumes that there is a complete moses installation
# (including scripts) under one root directory,
# e.g., via
# bjam --with-xmlrpc-c=... [...] --install-scripts --prefix=${HOME}/moses
# By default, this root directory is "${HOME}/moses".
"""
Python utilities for moses
This package mostly wraps standard Moses utilities into pipes.
Written by Ulrich Germann
This package borrows from scripts written by Christian Buck
The package assumes that there is a complete moses installation
(including scripts) under one root directory,
e.g., via ::
bjam --with-xmlrpc-c=... [...] --install-scripts --prefix=${HOME}/moses
By default, this root directory is "${HOME}/moses".
"""
import os
import sys
import time
import xmlrpclib
from subprocess import (
PIPE,
Popen,
)
moses_root = os.environ.get('MOSES_ROOT', os.environ.get('HOME') + "/moses")
import xmlrpclib,datetime,argparse,time,os,sys
from subprocess import *
from unicodedata import normalize
moses_root = os.environ.get('MOSES_ROOT',os.environ.get('HOME')+"/moses")
class ProcessWrapper:
def __init__(self,cmd=[]):
self.process = None
self.cmd = cmd
return
def __init__(self, cmd=[]):
self.process = None
self.cmd = cmd
def start(self, stdin=PIPE, stdout=PIPE):
if self.process:
raise Exception("Process is already running")
self.process = Popen(self.cmd, stdin = stdin, stdout = stdout)
return
def start(self, stdin=PIPE, stdout=PIPE):
if self.process:
raise Exception("Process is already running")
self.process = Popen(self.cmd, stdin=stdin, stdout=stdout)
def __del__(self):
if self.process:
self.process.terminate()
def __del__(self):
if self.process:
self.process.terminate()
pass
return
pass
class LineProcessor(ProcessWrapper):
def __call__(self,input):
if not self.process: self.start()
self.process.stdin.write("%s\n"%input.strip())
self.process.stdin.flush()
return self.process.stdout.readline().strip()
pass
def __call__(self, input):
if not self.process:
self.start()
self.process.stdin.write("%s\n" % input.strip())
self.process.stdin.flush()
return self.process.stdout.readline().strip()
class SentenceSplitter(ProcessWrapper):
"""
Wrapper for standard Moses sentence splitter
"""
def __init__(self,lang):
ssplit_cmd = moses_root+"/scripts/ems/support/split-sentences.perl"
self.cmd = [ssplit_cmd, "-b", "-q", "-l",lang]
self.process = None
return
"""Wrapper for standard Moses sentence splitter."""
def __init__(self, lang):
ssplit_cmd = moses_root + "/scripts/ems/support/split-sentences.perl"
self.cmd = [ssplit_cmd, "-b", "-q", "-l", lang]
self.process = None
def __call__(self, input):
if not self.process:
self.start()
self.process.stdin.write(input.strip() + "\n<P>\n")
self.process.stdin.flush()
x = self.process.stdout.readline().strip()
ret = []
while x != '<P>' and x != '':
ret.append(x)
x = self.process.stdout.readline().strip()
return ret
def __call__(self,input):
if not self.process:
self.start()
pass
self.process.stdin.write(input.strip() + "\n<P>\n")
self.process.stdin.flush()
x = self.process.stdout.readline().strip()
ret = []
while x != '<P>' and x != '':
ret.append(x)
x = self.process.stdout.readline().strip()
pass
return ret
class Pretokenizer(LineProcessor):
"""
Pretokenizer wrapper; the pretokenizer fixes known issues with the input.
"""
def __init__(self,lang):
pretok_cmd = moses_root+"/scripts/tokenizer/pre-tokenizer.perl"
self.cmd = [pretok_cmd,"-b", "-q", "-l",lang]
self.process = None
return
pass
"""Pretokenizer wrapper.
The pretokenizer fixes known issues with the input.
"""
def __init__(self, lang):
pretok_cmd = moses_root + "/scripts/tokenizer/pre-tokenizer.perl"
self.cmd = [pretok_cmd, "-b", "-q", "-l", lang]
self.process = None
class Tokenizer(LineProcessor):
"""
Tokenizer wrapper; the pretokenizer fixes known issues with the input.
"""
def __init__(self,lang,args=["-a","-no-escape"]):
tok_cmd = moses_root+"/scripts/tokenizer/tokenizer.perl"
self.cmd = [tok_cmd,"-b", "-q", "-l", lang] + args
self.process = None
return
"""Tokenizer wrapper.
Wraps the standard Moses tokenizer script.
"""
def __init__(self, lang, args=["-a", "-no-escape"]):
tok_cmd = moses_root + "/scripts/tokenizer/tokenizer.perl"
self.cmd = [tok_cmd, "-b", "-q", "-l", lang] + args
self.process = None
class Truecaser(LineProcessor):
"""
Truecaser wrapper.
"""
def __init__(self,model):
truecase_cmd = moses_root+"/scripts/recaser/truecase.perl"
self.cmd = [truecase_cmd,"-b", "--model",model]
self.process = None
return
pass
"""Truecaser wrapper."""
def __init__(self, model):
truecase_cmd = moses_root + "/scripts/recaser/truecase.perl"
self.cmd = [truecase_cmd, "-b", "--model", model]
self.process = None
class LineProcessorPipeline:
"""
Line processor: one line in, one line out
"""
def __init__(self,parts=[]):
self.chain = [LineProcessor(p.cmd) for p in parts]
return
def start(self):
if len(self.chain) == 0:
return
if self.chain[0].process:
return
self.chain[0].start()
for i in xrange(1,len(self.chain)):
self.chain[i].start(stdin = self.chain[i-1].process.stdout)
pass
return
"""Line processor: one line in, one line out."""
def __init__(self, parts=[]):
self.chain = [LineProcessor(p.cmd) for p in parts]
def __call__(self,input):
if len(self.chain) == 0:
return input
self.start()
self.chain[0].process.stdin.write("%s\n"%input.strip())
self.chain[0].process.stdin.flush()
return self.chain[0].process.stdout.readline().strip()
def start(self):
if len(self.chain) == 0:
return
if self.chain[0].process:
return
self.chain[0].start()
for i in xrange(1, len(self.chain)):
self.chain[i].start(stdin=self.chain[i - 1].process.stdout)
def __call__(self, input):
if len(self.chain) == 0:
return input
self.start()
self.chain[0].process.stdin.write("%s\n" % input.strip())
self.chain[0].process.stdin.flush()
return self.chain[0].process.stdout.readline().strip()
pass
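
A hedged sketch of how these wrappers might be combined, assuming a Moses installation under moses_root and a truecasing model at a hypothetical path:

    pipeline = LineProcessorPipeline([
        Pretokenizer("en"),
        Tokenizer("en"),
        Truecaser("/path/to/truecase-model.en"),  # model path is hypothetical
    ])
    print pipeline("This is an example sentence.")
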
def find_free_port(p):
"""
Find a free port, starting at /p/.
Return the free port, or False if none found.
"""
ret = p
while ret - p < 20:
devnull = open(os.devnull,"w")
n = Popen(["netstat","-tnp"],stdout=PIPE,stderr=devnull)
if n.communicate()[0].find(":%d "%ret) < 0:
return p
ret += 1
pass
return False
"""Find a free port, starting at /p/.
:return: The free port, or False if none found.
"""
ret = p
while ret - p < 20:
devnull = open(os.devnull, "w")
n = Popen(["netstat", "-tnp"], stdout=PIPE, stderr=devnull)
if n.communicate()[0].find(":%d " % ret) < 0:
return ret
ret += 1
return False
class MosesServer(ProcessWrapper):
def __init__(self,args=[]):
self.process = None
mserver_cmd = moses_root+"/bin/mosesserver"
self.cmd = [mserver_cmd] + args
self.url = None
self.proxy = None
return
def start(self,config=None,args=[],port=7447,debug=False):
self.cmd.extend(args)
if config:
if "-f" in args:
raise Exception("Config file specified twice")
else:
self.cmd.extend(["-f",config])
pass
pass
self.port = port # find_free_port(port)
if not self.port:
raise Excpetion("Cannot find free port for moses server!")
self.cmd.extend(["--server-port", "%d"%self.port])
if debug:
print >>sys.stderr,self.cmd
# self.stderr = open("mserver.%d.stderr"%self.port,'w')
# self.stdout = open("mserver.%d.stdout"%self.port,'w')
# self.process = Popen(self.cmd,stderr = self.stderr,stdout = self.stdout)
self.process = Popen(self.cmd)
else:
devnull = open(os.devnull,"w")
self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
pass
if self.process.poll():
raise Exception("FATAL ERROR: Could not launch moses server!")
if debug:
print >>sys.stderr,"MOSES port is %d."%self.port
print >>sys.stderr,"Moses poll status is", self.process.poll()
pass
self.url = "http://localhost:%d/RPC2"%self.port
self.connect(self.url)
return True
def connect(self,url):
if url[:4] != "http": url = "http://%s"%url
if url[-5:] != "/RPC2": url += "/RPC2"
self.url = url
self.proxy = xmlrpclib.ServerProxy(self.url)
return
def translate(self,input):
attempts = 0
while attempts < 100:
try:
if type(input) is unicode:
# if the server does not expect unicode, provide a
# properly encoded string!
param = {'text': input.strip().encode('utf8')}
return self.proxy.translate(param)['text'].decode('utf8')
elif type(input) is str:
param = {'text': input.strip()}
return self.proxy.translate(param)['text']
elif type(input) is list:
return [self.translate(x) for x in input]
elif type(input) is dict:
return self.proxy.translate(input)
def __init__(self, args=[]):
self.process = None
mserver_cmd = moses_root + "/bin/mosesserver"
self.cmd = [mserver_cmd] + args
self.url = None
self.proxy = None
def start(self, config=None, args=[], port=7447, debug=False):
self.cmd.extend(args)
if config:
if "-f" in args:
raise Exception("Config file specified twice")
else:
self.cmd.extend(["-f", config])
self.port = port # find_free_port(port)
if not self.port:
raise Exception("Cannot find free port for moses server!")
self.cmd.extend(["--server-port", "%d" % self.port])
if debug:
print >>sys.stderr, self.cmd
# self.stderr = open("mserver.%d.stderr"%self.port,'w')
# self.stdout = open("mserver.%d.stdout"%self.port,'w')
# self.process = Popen(
# self.cmd, stderr=self.stderr, stdout=self.stdout)
self.process = Popen(self.cmd)
else:
raise Exception("Can't handle input of this type!")
devnull = open(os.devnull, "w")
self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
except:
attempts += 1
print >>sys.stderr, "WAITING", attempts
time.sleep(1)
pass
pass
raise Exception("Translation request failed")
pass
if self.process.poll():
raise Exception("FATAL ERROR: Could not launch moses server!")
if debug:
print >>sys.stderr, "MOSES port is %d." % self.port
print >>sys.stderr, "Moses poll status is", self.process.poll()
self.url = "http://localhost:%d/RPC2" % self.port
self.connect(self.url)
return True
def connect(self, url):
if url[:4] != "http":
url = "http://%s" % url
if url[-5:] != "/RPC2":
url += "/RPC2"
self.url = url
self.proxy = xmlrpclib.ServerProxy(self.url)
def translate(self, input):
attempts = 0
while attempts < 100:
try:
if type(input) is unicode:
# If the server does not expect unicode, provide a
# properly encoded string!
param = {'text': input.strip().encode('utf8')}
return self.proxy.translate(param)['text'].decode('utf8')
elif type(input) is str:
param = {'text': input.strip()}
return self.proxy.translate(param)['text']
elif type(input) is list:
return [self.translate(x) for x in input]
elif type(input) is dict:
return self.proxy.translate(input)
else:
raise Exception("Can't handle input of this type!")
except:
attempts += 1
print >>sys.stderr, "WAITING", attempts
time.sleep(1)
raise Exception("Translation request failed")

View File

@ -5,29 +5,39 @@
# This script simulates post-editing of MT output and incrementally
# updates the dynamic phrase tables in the moses server.
import xmlrpclib,datetime,argparse,sys,os,time
import argparse
import os
import sys
import time
import xmlrpclib
import moses
from moses import MosesServer
from subprocess import *
from subprocess import (
PIPE,
Popen,
)
mserver = moses.MosesServer()
# We must perform some custom argument processing, as moses parameter
# specifications do not comply with the conventions used by standard
# argument parsing packages; an isolated double dash separates script
# arguments from moses arguments.
def split_args(all_args):
"""
Split argument list all_args into arguments specific to this script and
arguments relating to the moses server. An isolated double dash acts as
the separator between the two types of arguments.
arguments relating to the moses server. An isolated double dash acts as
the separator between the two types of arguments.
"""
my_args = []
mo_args = []
arglist = mo_args
i = 0
# IMPORTANT: the code below must be coordinated with
# IMPORTANT: the code below must be coordinated with
# - the evolution of moses command line arguments
# - mert-moses.pl
# - mert-moses.pl
while i < len(all_args):
# print i,"MY_ARGS", my_args
# print i,"MO_ARGS", mo_args
@ -36,14 +46,16 @@ def split_args(all_args):
elif all_args[i] == "--]":
arglist = mo_args
elif all_args[i] == "-i" or all_args[i] == "-input-file":
my_args.extend(["-i",all_args[i+1]])
my_args.extend(["-i", all_args[i + 1]])
i += 1
elif all_args[i] == "-inputtype":
if all_args[i+1] != "0":
# not yet supported! Therefore:
errmsg = "FATAL ERROR: %s "%sys.argv[0]
errmsg += "only supports plain text input at this point."
raise Exception(errsmg)
if all_args[i + 1] != "0":
# Not yet supported! Therefore:
errmsg = (
"FATAL ERROR: "
"%s only supports plain text input at this point."
% sys.argv[0])
raise Exception(errmsg)
# my_args.extend(["--input-type",all_args[i+1]])
i += 1
elif all_args[i] == "-lattice-samples":
@ -52,13 +64,14 @@ def split_args(all_args):
# mo_args[i:i+3] = []
# i += 2
# This is not yet supported! Therefore:
errmsg = "FATAL ERROR: %s "%sys.argv[0]
errmsg += "does not yet support lattice sampling."
raise Exception(errsmg)
errmsg = (
"FATAL ERROR: %s does not yet support lattice sampling."
% sys.argv[0])
raise Exception(errmsg)
elif all_args[i] == "-n-best-list":
my_args.extend(["--nbest",all_args[i+2]])
my_args.extend(["--nbest-file",all_args[i+1]])
my_args.extend(["--nbest", all_args[i + 2]])
my_args.extend(["--nbest-file", all_args[i + 1]])
i += 2
elif all_args[i] == "-n-best-distinct":
@ -70,128 +83,148 @@ def split_args(all_args):
i += 1
pass
return my_args,mo_args
return my_args, mo_args
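
A rough sketch of the intended split, based on the branches shown above (the exact routing of each option depends on the elided cases):

    # split_args(["-i", "input.txt", "-f", "model/moses.ini"])
    # routes "-i input.txt" into my_args for this script, while options it
    # does not recognise, such as "-f model/moses.ini", remain in mo_args
    # for the moses server command line.
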
def interpret_args(my_args):
"""
Parse script-specific argument list.
"""
aparser = argparse.ArgumentParser()
aparser.add_argument("-s","--server-cmd",default="mosesserver",
dest="servercmd", help="path to moses server command")
aparser.add_argument("--url",help="URL of external moses server.")
aparser.add_argument("-p","--port", type=int, default=7447,
help="port number to be used for server")
# input / output
aparser.add_argument("-i","--input",help="source file",default="-")
aparser.add_argument("-r","--ref",help="reference translation",default=None)
aparser.add_argument("-a","--aln",help="alignment",default=None)
aparser.add_argument("-o","--output",default="-",help="output file")
aparser.add_argument("-d","--debug",action="store_true",help="debug mode")
# moses reporting options
aparser.add_argument("-A","--with-alignment", dest="A",
help="include alignment in output", action="store_true")
aparser.add_argument("-G","--with-graph",type=bool, default=False, dest="G",
help="include search graph info in output")
aparser.add_argument("-T","--with-transopt",type=bool, default=False, dest = "T",
help="include translation options info in output")
aparser.add_argument("-F","--report-all-factors", action="store_true",dest="F",
help="report all factors")
aparser.add_argument("-n","--nbest",type=int,dest="nbest",default=0,
help="size of nbest list")
aparser.add_argument("-N","--nbest-file",dest="nbestFile",default=0,
help="output file for nbest list")
aparser.add_argument("-u","--nbest-distinct",type=bool,dest="U",default=False,
help="report all factors")
aparser.add_argument(
"-s", "--server-cmd", default="mosesserver", dest="servercmd",
help="Path to moses server command.")
aparser.add_argument(
"--url", help="URL of external moses server.")
aparser.add_argument(
"-p", "--port", type=int, default=7447,
help="Port number to be used for server.")
# Input / output.
aparser.add_argument(
"-i", "--input", default='-', help="source file")
aparser.add_argument(
"-r", "--ref", default=None, help="Reference translation.")
aparser.add_argument(
"-a", "--aln", default=None, help="Alignment.")
aparser.add_argument(
"-o", "--output", default="-", help="Output file.")
aparser.add_argument(
"-d", "--debug", action='store_true', help="Debug mode.")
# Moses reporting options.
aparser.add_argument(
"-A", "--with-alignment", dest="A", action='store_true',
help="Include alignment in output.")
aparser.add_argument(
"-G", "--with-graph", type=bool, default=False, dest="G",
help="Include search graph info in output.")
aparser.add_argument(
"-T", "--with-transopt", type=bool, default=False, dest="T",
help="Include translation options info in output.")
aparser.add_argument(
"-F", "--report-all-factors", action="store_true", dest="F",
help="Report all factors.")
aparser.add_argument(
"-n", "--nbest", type=int, dest="nbest", default=0,
help="Size of nbest list.")
aparser.add_argument(
"-N", "--nbest-file", dest="nbestFile", default=0,
help="Output file for nbest list.")
aparser.add_argument(
"-u", "--nbest-distinct", type=bool, dest="U", default=False,
help="Report all factors.")
return aparser.parse_args(my_args)
def translate(proxy, args, line):
if type(line) is unicode:
param = { 'text' : line.strip().encode('utf8') }
param = {'text': line.strip().encode('utf8')}
elif type(line) is str:
param = { 'text' : line.strip() }
param = {'text': line.strip()}
else:
raise Exception("Can't handle input")
if args.A: param['align'] = True
if args.T: param['topt'] = True
if args.F: param['report-all-factors'] = True
if args.nbest:
if args.A:
param['align'] = True
if args.T:
param['topt'] = True
if args.F:
param['report-all-factors'] = True
if args.nbest:
param['nbest'] = int(args.nbest)
param['add-score-breakdown'] = True
pass
if args.U:
if args.U:
param['nbest-distinct'] = True
pass
attempts = 0
while attempts < 20:
t1 = time.time()
try:
return proxy.translate(param)
return proxy.translate(param)
# except xmlrpclib.Fault as e:
# except xmlrpclib.ProtocolError as e:
# except xmlrpclib.ResponseError as e:
except xmlrpclib.Error as e:
time.sleep(2) # give all the stderr stuff a chance to be flushed
print >>sys.stderr," XMLRPC error:",e
sys.stderr.flush()
print >>sys.stderr, " XMLRPC error:", e
print >>sys.stderr, "Input was"
print >>sys.stderr, param
sys.exit(1)
except IOError as e:
print >>sys.stderr,"I/O error({0}): {1}".format(e.errno, e.strerror)
print >>sys.stderr, (
"I/O error({0}): {1}".format(e.errno, e.strerror))
time.sleep(5)
except:
serverstatus = mserver.process.poll()
if serverstatus == None:
print >>sys.stderr, "Connection failed after %f seconds"%(time.time()-t1)
if serverstatus is None:
print >>sys.stderr, (
"Connection failed after %f seconds" % (time.time() - t1))
attempts += 1
if attempts > 10:
time.sleep(10)
else:
time.sleep(5)
pass
else:
print >>sys.stderr, "Oopsidaisy, server exited with code %d (signal %d)"\
%(serverstatus/256,serverstatus%256)
print >>sys.stderr, (
"Oopsidaisy, server exited with code %d (signal %d)"
% (serverstatus / 256, serverstatus % 256))
pass
pass
pass
raise Exception("Exception: could not reach translation server.")
def read_data(fname):
"""
Read and return data (source, target or alignment) from file fname.
"""
if fname[-3:] == ".gz":
foo = Popen(["zcat",fname],stdout=PIPE)\
.communicate()[0]\
.strip().split('\n')
process = Popen(["zcat", fname], stdout=PIPE)
stdout, _ = process.communicate()
foo = stdout.strip().split('\n')
else:
foo = [x.strip() for x in open(fname).readlines()]
pass
return foo
def repack_result(idx,result):
def repack_result(idx, result):
global args
if args.nbest:
for h in result['nbest']:
fields = [idx,h['hyp'],h['fvals'],h['totalScore']]
fields = [idx, h['hyp'], h['fvals'], h['totalScore']]
for i in xrange(len(fields)):
if type(fields[i]) is unicode:
fields[i] = fields[i].encode('utf-8')
pass
pass
# print fields
print >>NBestFile,"%d ||| %s ||| %s ||| %f"%tuple(fields)
pass
# Print fields.
print >>NBestFile, "%d ||| %s ||| %s ||| %f" % tuple(fields)
pass
if 'align' in result:
t = result['text'].split()
@ -200,16 +233,14 @@ def repack_result(idx,result):
k = 0
for a in result['align']:
k = a['tgt-start']
if k: print " ".join(t[i:k]).encode('utf8'),span,
if k:
print " ".join(t[i:k]).encode('utf8'), span,
i = k
span = "|%d %d|"%(a['src-start'],a['src-end'])
pass
print " ".join(t[k:]).encode('utf8'),span
pass
span = "|%d %d|" % (a['src-start'], a['src-end'])
print " ".join(t[k:]).encode('utf8'), span
else:
print result['text'].encode('utf8')
pass
return
if __name__ == "__main__":
my_args, mo_args = split_args(sys.argv[1:])
@ -221,17 +252,17 @@ if __name__ == "__main__":
args = interpret_args(my_args)
if "-show-weights" in mo_args:
# this is for use during tuning, where moses is called to get a list of
# feature names
devnull = open(os.devnull,"w")
mo = Popen(mserver.cmd + mo_args,stdout=PIPE,stderr=devnull)
# This is for use during tuning, where moses is called to get a list
# of feature names.
devnull = open(os.devnull, "w")
mo = Popen(mserver.cmd + mo_args, stdout=PIPE, stderr=devnull)
print mo.communicate()[0].strip()
sys.exit(0)
pass
if args.nbest:
if args.nbestFile:
NBestFile = open(args.nbestFile,"w")
NBestFile = open(args.nbestFile, "w")
else:
NBestFile = sys.stdout
pass
@ -239,8 +270,10 @@ if __name__ == "__main__":
ref = None
aln = None
if args.ref: ref = read_data(args.ref)
if args.aln: aln = read_data(args.aln)
if args.ref:
ref = read_data(args.ref)
if args.aln:
aln = read_data(args.aln)
if ref and aln:
try:
@ -260,25 +293,21 @@ if __name__ == "__main__":
line = sys.stdin.readline()
idx = 0
while line:
result = translate(mserver.proxy,args,line)
repack_result(idx,result)
result = translate(mserver.proxy, args, line)
repack_result(idx, result)
line = sys.stdin.readline()
idx += 1
pass
pass
else:
src = read_data(args.input)
for i in xrange(len(src)):
result = translate(mserver.proxy,args,src[i])
repack_result(i,result)
result = translate(mserver.proxy, args, src[i])
repack_result(i, result)
if args.debug:
print >>sys.stderr, result['text'].encode('utf-8')
pass
if ref and aln:
result = mserver.proxy.updater({'source' : src[i],
'target' : ref[i],
'alignment' : aln[i]})
pass
pass
pass
pass
if ref and aln:
result = mserver.proxy.updater({
'source': src[i],
'target': ref[i],
'alignment': aln[i],
})

View File

@ -2,12 +2,12 @@
"""
The Gacha filter cleans out sentence pairs that have global character mean
lower than a certain threshold.
Use this cleaner to produce low quantity of high quality sentence pairs.
lower than a certain threshold.
It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
Use this cleaner to produce low quantity of high quality sentence pairs.
It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)
This is inspired by the global character mean that is used in the Gale-Church
@ -24,17 +24,24 @@ where:
(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf)
"""
import io, subprocess
import io
import subprocess
red = '\033[01;31m'
native = '\033[m'
def err_msg(txt):
return red+txt+native
return red + txt + native
def num_char(filename):
return float(subprocess.Popen(["wc", "-m", filename],
stdout=subprocess.PIPE).stdout.read().split()[0])
process = subprocess.Popen(
["wc", "-m", filename], stdout=subprocess.PIPE)
# TODO: Was this meant to call communicate()?
return float(process.stdout.read().split()[0])
def gacha_mean(sourcefile, targetfile):
"""
@ -43,36 +50,44 @@ def gacha_mean(sourcefile, targetfile):
"""
sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
c = num_char(sourcefile) / num_char(targetfile)
sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n'))
sys.stderr.write(err_msg('Gacha mean = ' + str(c) + '\n'))
sys.stderr.write(err_msg('Filtering starts ...\n'))
return c
def io_open(path):
"""Open file `path` for reading, as a UTF-8 text file."""
return io.open(path, 'r', encoding='utf8')
def main(sourcefile, targetfile, threshold=0.2):
# Calculates Gacha mean.
c = gacha_mean(sourcefile, targetfile)
# Calculates lower and upperbound for filtering
threshold = float(threshold)
lowerbound = (1-threshold) * c
upperbound = (1+threshold) * c
lowerbound = (1 - threshold) * c
upperbound = (1 + threshold) * c
# Start filtering sentences.
with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \
io.open(targetfile, 'r', encoding='utf8') as trgfin:
with io_open(sourcefile) as srcfin, io_open(targetfile) as trgfin:
for s, t in zip(srcfin, trgfin):
if lowerbound < len(s) / float(len(t)) < upperbound:
print(u"{}\t{}\n".format(s.strip(),t.strip()))
print(u"{}\t{}\n".format(s.strip(), t.strip()))
if __name__ == '__main__':
import sys
if len(sys.argv) not in range(3,5):
usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n'
% sys.argv[0])
example_msg = err_msg('Example: gacha_cleaning.py ~/Europarl.de-en.de '
'~/Europarl.de-en.en 0.4\n'
% sys.argv[0])
if len(sys.argv) not in range(3, 5):
usage_msg = err_msg(
"Usage: python %s srcfile trgfile (threshold)\n"
% sys.argv[0])
example_msg = err_msg(
"Example: "
"gacha_cleaning.py ~/Europarl.de-en.de ~/Europarl.de-en.en 0.4\n"
% sys.argv[0])
sys.stderr.write(usage_msg)
sys.stderr.write(example_msg)
sys.exit(1)
main(*sys.argv[1:])

View File

@ -24,9 +24,11 @@
import optparse
import sys
class NGram(tuple):
pass
class Gap:
def __init__(self, minSpan):
self.minSpan = minSpan
@ -34,8 +36,12 @@ class Gap:
def getMinSpan(self):
return self.minSpan
def printUsage():
sys.stderr.write("Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT")
sys.stderr.write(
"Usage: "
"filter-rule-table.py [--min-non-initial-rule-count=N] INPUT")
def main():
parser = optparse.OptionParser()
@ -54,14 +60,15 @@ def main():
inputSentences.append(line.split())
filterRuleTable(sys.stdin, inputSentences, N, options)
def filterRuleTable(ruleTable, inputSentences, N, options):
# Map each input n-gram (n = 1..N) to a map from sentence indices to
# lists of intra-sentence indices.
occurrences = {}
for i, sentence in enumerate(inputSentences):
for n in range(1, N+1):
for j in range(0, len(sentence)-n+1):
ngram = NGram(sentence[j:j+n])
for n in range(1, N + 1):
for j in range(0, len(sentence) - n + 1):
ngram = NGram(sentence[j:j + n])
innerMap = occurrences.setdefault(ngram, {})
indices = innerMap.setdefault(i, [])
indices.append(j)
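
For a toy input, the resulting index looks roughly like this (N = 2):

    # inputSentences = [["the", "cat"]]
    # occurrences == {
    #     NGram(("the",)):       {0: [0]},
    #     NGram(("cat",)):       {0: [1]},
    #     NGram(("the", "cat")): {0: [0]},
    # }
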
@ -70,15 +77,16 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
prevRuleIncluded = None
for line in ruleTable:
rhs, count = parseRule(line)
below_threshold = (count is not None and count < options.minCount)
# Prune non-initial rule if count is below threshold.
if count != None and count < options.minCount and isNonInitialRule(rhs):
if below_threshold and isNonInitialRule(rhs):
if prevRHS != rhs:
prevRuleIncluded = None
prevRHS = rhs
continue
# If source RHS is same as last rule's then we already know whether to
# filter or not (unless it was pruned before checking).
if rhs == prevRHS and prevRuleIncluded != None:
if rhs == prevRHS and prevRuleIncluded is not None:
if prevRuleIncluded:
print line,
continue
@ -89,7 +97,10 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
prevRuleIncluded = True
continue
segments = segmentRHS(rhs, N)
ngramMaps = [occurrences.get(s, {}) for s in segments if isinstance(s, NGram)]
ngramMaps = [
occurrences.get(s, {})
for s in segments
if isinstance(s, NGram)]
if len(ngramMaps) == 0:
print line,
prevRuleIncluded = True
@ -111,9 +122,13 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
break
prevRuleIncluded = match
# Parse a line of the rule table and return a tuple containing two items,
# the list of RHS source symbols and the rule count (if present).
def parseRule(line):
"""Parse a line of the rule table.
:return: A tuple containing two items: the list of RHS source symbols,
and the rule count (if present).
"""
cols = line.split("|||")
rhsSourceSymbols = cols[0].split()[:-1]
ruleCount = None
@ -123,15 +138,18 @@ def parseRule(line):
ruleCount = float(counts[2])
return (rhsSourceSymbols, ruleCount)
def isNT(symbol):
return symbol[0] == '[' and symbol[-1] == ']'
def isNonInitialRule(rhs):
for symbol in rhs:
if isNT(symbol):
return True
return False
def segmentRHS(rhs, N):
segments = []
terminals = []
@ -159,13 +177,14 @@ def segmentRHS(rhs, N):
segments.append(NGram(terminals))
return segments
def matchSegments(segments, indexSeq, sentenceLength):
assert len(segments) > 0
firstSegment = segments[0]
i = 0
if isinstance(firstSegment, Gap):
minPos = firstSegment.getMinSpan()
maxPos = sentenceLength-1
maxPos = sentenceLength - 1
else:
minPos = indexSeq[i] + len(firstSegment)
i += 1
@ -175,7 +194,7 @@ def matchSegments(segments, indexSeq, sentenceLength):
if minPos + segment.getMinSpan() > sentenceLength:
return False
minPos = minPos + segment.getMinSpan()
maxPos = sentenceLength-1
maxPos = sentenceLength - 1
else:
pos = indexSeq[i]
i += 1
@ -185,6 +204,7 @@ def matchSegments(segments, indexSeq, sentenceLength):
maxPos = minPos
return True
def enumerateIndexSeqs(ngramMaps, sentenceIndex, minFirstIndex):
assert len(ngramMaps) > 0
if len(ngramMaps) == 1:
@ -195,7 +215,7 @@ def enumerateIndexSeqs(ngramMaps, sentenceIndex, minFirstIndex):
for index in ngramMaps[0][sentenceIndex]:
if index < minFirstIndex:
continue
for seq in enumerateIndexSeqs(ngramMaps[1:], sentenceIndex, index+1):
for seq in enumerateIndexSeqs(ngramMaps[1:], sentenceIndex, index + 1):
assert seq[0] > index
yield [index] + seq

View File

@ -2,18 +2,23 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
# average embeddings of special null words for RDLM.
# Usage: average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL
"""Average embeddings of special null words for RDLM.
Usage:
average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL
"""
import sys
import os
import numpy
def load_model(model_file):
return nplm.NeuralLM.from_file(model_file)
def get_weights(path, vocab, len_context):
d = [[0]*vocab for i in range(len_context)]
d = [[0] * vocab for i in range(len_context)]
for line in open(path):
for i, word in enumerate(line.split()[:-1]):
d[i][int(word)] += 1
@ -26,20 +31,23 @@ if __name__ == "__main__":
training_instances = sys.argv[3]
model_output = sys.argv[4]
sys.path.append(os.path.join(nplm_path,'python'))
sys.path.append(os.path.join(nplm_path, 'python'))
import nplm
model = load_model(model_input)
len_context = len(open(training_instances).readline().split())-1
len_context = len(open(training_instances).readline().split()) - 1
sys.stderr.write('reading ngrams...')
weights = numpy.array(get_weights(training_instances, len(model.input_embeddings), len_context))
weights = numpy.array(
get_weights(
training_instances, len(model.input_embeddings), len_context))
sys.stderr.write('done\n')
for i in range(len_context):
index = model.word_to_index_input['<null_{0}>'.format(i)]
model.input_embeddings[index] = numpy.average(numpy.array(model.input_embeddings), weights=weights[i], axis=0)
model.input_embeddings[index] = numpy.average(
numpy.array(model.input_embeddings), weights=weights[i], axis=0)
sys.stderr.write('writing model...')
model.to_file(open(model_output,'w'))
model.to_file(open(model_output, 'w'))
sys.stderr.write('done\n')

View File

@ -2,17 +2,25 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
# extract syntactic n-grams from dependency treebank in Moses XML format for training RDLM
# expected format can be produced with mosesdecoder/scripts/training/wrapper/conll2mosesxml.py
# OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped to 0 (<unk>)
"""
Extract syntactic n-grams from dependency treebank in Moses XML format for
training RDLM.
Expected format can be produced with
mosesdecoder/scripts/training/wrapper/conll2mosesxml.py
OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped
to 0 (<unk>)
"""
from __future__ import print_function, unicode_literals, division
import sys
import codecs
import argparse
# hack for python2/3 compatibility
# Hack for python2/3 compatibility
from io import open
argparse.open = open
try:
@ -20,46 +28,84 @@ try:
except ImportError:
from xml.etree import cElementTree as ET
def create_parser():
parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM")
parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
help='input file (default: standard input).')
parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout, metavar='PATH',
help='output file (default: standard output).')
parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)',
choices=['label', 'head'], required=True)
parser.add_argument('--vocab', metavar='PATH', type=str, required=True,
help='input layer vocabulary file (one item per line; first line \'<unk>\')')
parser.add_argument('--output_vocab', metavar='PATH', type=str,
help='output layer vocabulary file (default: use input layer vocabulary)')
parser.add_argument('--left_context', metavar='INT', type=int,
help='size of context vector for left siblings (default: %(default)s)', default=3)
parser.add_argument('--right_context', metavar='INT', type=int,
help='size of context vector for right siblings (default: %(default)s)', default=0)
parser.add_argument('--up_context', metavar='INT', type=int,
help='size of context vector for ancestors (default: %(default)s)', default=2)
parser.add_argument('--glue_symbol', metavar='STR', type=str, default='Q',
help='glue symbol. Will be skipped during extraction (default: %(default)s)')
parser.add_argument('--start_symbol', metavar='STR', type=str, default='SSTART',
help='sentence start symbol. Will be skipped during extraction (default: %(default)s)')
parser.add_argument('--end_symbol', metavar='STR', type=str, default='SEND',
help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
parser.add_argument('--ptkvz', action='store_true',
help='special rule for German dependency trees: concatenate separable verb prefix and verb')
def create_parser():
parser = argparse.ArgumentParser(
description=(
"Extract syntactic n-grams from parsed corpus in "
"Moses XML format for training RDLM"))
parser.add_argument(
'--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
metavar='PATH',
help='Input file (default: standard input).')
parser.add_argument(
'--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
metavar='PATH',
help='Output file (default: standard output).')
parser.add_argument(
'--mode', type=str, choices=['label', 'head'], required=True,
help='Predict terminals (head) or dependency labels (label).')
parser.add_argument(
'--vocab', metavar='PATH', type=str, required=True,
help=(
"Input layer vocabulary file (one item per line; "
"first line '<unk>')"))
parser.add_argument(
'--output_vocab', metavar='PATH', type=str,
help=(
"Output layer vocabulary file "
"(default: use input layer vocabulary)"))
parser.add_argument(
'--left_context', metavar='INT', type=int, default=3,
help=(
"Size of context vector for left siblings "
"(default: %(default)s)"))
parser.add_argument(
'--right_context', metavar='INT', type=int, default=0,
help=(
"Size of context vector for right siblings "
"(default: %(default)s)"))
parser.add_argument(
'--up_context', metavar='INT', type=int, default=2,
help=(
"Size of context vector for ancestors "
"(default: %(default)s)"))
parser.add_argument(
'--glue_symbol', metavar='STR', type=str, default='Q',
help=(
"Glue symbol. Will be skipped during extraction "
"(default: %(default)s)"))
parser.add_argument(
'--start_symbol', metavar='STR', type=str, default='SSTART',
help=(
"Sentence start symbol. Will be skipped during extraction "
"(default: %(default)s)"))
parser.add_argument(
'--end_symbol', metavar='STR', type=str, default='SEND',
help=(
"Sentence end symbol. Will be skipped during extraction "
"(default: %(default)s)"))
parser.add_argument(
'--ptkvz', action='store_true',
help=(
"Special rule for German dependency trees: "
"concatenate separable verb prefix and verb."))
return parser
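
For illustration, a hypothetical invocation using the options defined above (the script and file names are invented):

    # extract_syntactic_ngrams.py --mode head \
    #     --vocab vocab.input --output_vocab vocab.output \
    #     --left_context 3 --right_context 0 --up_context 2 \
    #     -i corpus.moses.xml -o train.head.ngrams
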
def escape_text(s):
s = s.replace('|','&#124;') # factor separator
s = s.replace('[','&#91;') # syntax non-terminal
s = s.replace(']','&#93;') # syntax non-terminal
s = s.replace('\'','&apos;') # xml special character
s = s.replace('"','&quot;') # xml special character
s = s.replace('|', '&#124;') # factor separator
s = s.replace('[', '&#91;') # syntax non-terminal
s = s.replace(']', '&#93;') # syntax non-terminal
s = s.replace('\'', '&apos;') # xml special character
s = s.replace('"', '&quot;') # xml special character
return s
# deterministic heuristic to get head of subtree
def get_head(xml, add_ptkvz):
"""Deterministic heuristic to get head of subtree."""
head = None
preterminal = None
for child in xml:
@ -77,23 +123,38 @@ def get_head(xml, add_ptkvz):
return head, preterminal
def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, parent_labels=None):
def get_syntactic_ngrams(xml, options, vocab, output_vocab,
parent_heads=None, parent_labels=None):
if len(xml):
# skip glue rules
if xml.get('label') == options.glue_symbol or xml.get('label') == options.start_symbol or xml.get('label') == options.end_symbol:
for child in xml:
get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
return
# Skip glue rules.
skip_glue_labels = [
options.glue_symbol,
options.start_symbol,
options.end_symbol,
]
if xml.get('label') in skip_glue_labels:
for child in xml:
get_syntactic_ngrams(
child, options, vocab, output_vocab, parent_heads,
parent_labels)
return
# skip virtual nodes
if xml.get('label') == '<stop_label>' or xml.get('label') == '<start_label>':
return
# Skip virtual nodes.
skip_virtual_labels = [
'<stop_label>',
'<start_label>',
]
if xml.get('label') in skip_virtual_labels:
return
if not parent_heads:
parent_heads = [vocab.get('<root_head>', 0)] * options.up_context
parent_labels = [vocab.get('<root_label>', 0)] * options.up_context
parent_heads = (
[vocab.get('<root_head>', 0)] * options.up_context)
parent_labels = (
[vocab.get('<root_label>', 0)] * options.up_context)
head, preterminal = get_head(xml, options.ptkvz)
if not head:
@ -119,7 +180,8 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
options.output.write(' '.join(map(str, int_list)) + '\n')
elif options.mode == 'head' and not head == '<dummy_head>':
int_list.append(vocab.get(label, 0))
int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0)))
int_list.append(
output_vocab.get(head, output_vocab.get(preterminal, 0)))
options.output.write(' '.join(map(str, int_list)) + '\n')
parent_heads.append(vocab.get(head, 0))
@ -130,28 +192,29 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
if options.right_context:
start = ET.Element('tree')
start2 = ET.Element('tree')
start.set('label','<start_label>')
start2.set('label','XY')
start.set('label', '<start_label>')
start2.set('label', 'XY')
start2.text = '<start_head>'
start.append(start2)
xml.insert(0,start)
xml.insert(0, start)
if options.left_context:
end = ET.Element('tree')
end2 = ET.Element('tree')
end.set('label','<stop_label>')
end2.set('label','XY')
end.set('label', '<stop_label>')
end2.set('label', 'XY')
end2.text = '<stop_head>'
end.append(end2)
xml.append(end)
heads = []
preterminals = []
labels = []
for child in xml:
if not len(child):
# mark that the previous sibling is the head of the structure (the head/label are not repeated because they're also head/label of the parent)
# Mark that the previous sibling is the head of the
# structure (the head/label are not repeated because they're
# also head/label of the parent).
head_child = '<head_head>'
preterminal_child = head_child
child_label = '<head_label>'
@ -166,37 +229,60 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
preterminals.append(preterminal_child)
labels.append(child_label)
heads_idx = [vocab.get(heads[i], vocab.get(preterminals[i], 0)) for i in range(len(heads))]
labels_idx = [vocab.get(labels[i], 0) for i in range(len(labels))]
heads_idx = [
vocab.get(heads[i], vocab.get(preterminals[i], 0))
for i in range(len(heads))]
labels_idx = [
vocab.get(labels[i], 0)
for i in range(len(labels))]
#ancestor context is same for all children
# Ancestor context is the same for all children.
up_heads = parent_heads[-options.up_context:]
up_labels = parent_labels[-options.up_context:]
for i,child in enumerate(xml):
skip_special_heads = [
'<dummy_head>',
'<head_head>',
'<stop_head>',
'<start_head>',
]
for i, child in enumerate(xml):
# skip some special symbols, but recursively extract n-grams for its children
if options.mode == 'head' and (heads[i] == '<dummy_head>' or heads[i] == '<head_head>' or heads[i] == '<stop_head>' or heads[i] == '<start_head>'):
# Skip some special symbols, but recursively extract n-grams
# for its children.
if options.mode == 'head' and heads[i] in skip_special_heads:
parent_heads.append(vocab.get(heads[i], 0))
parent_labels.append(vocab.get(labels[i], 0))
get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
get_syntactic_ngrams(
child, options, vocab, output_vocab, parent_heads,
parent_labels)
parent_heads.pop()
parent_labels.pop()
continue
previous_heads = heads_idx[max(0,i-options.left_context):i]
previous_labels = labels_idx[max(0,i-options.left_context):i]
previous_heads = heads_idx[max(0, i - options.left_context):i]
previous_labels = labels_idx[max(0, i - options.left_context):i]
subsequent_heads = heads_idx[i+1:i+options.right_context+1]
subsequent_labels = labels_idx[i+1:i+options.right_context+1]
subsequent_heads = heads_idx[i + 1:i + options.right_context + 1]
subsequent_labels = labels_idx[i + 1:i + options.right_context + 1]
if len(previous_heads) < options.left_context:
previous_heads = [start_head_idx] * (options.left_context-len(previous_heads)) + previous_heads
previous_labels = [start_label_idx] * (options.left_context-len(previous_labels)) + previous_labels
previous_heads = (
[start_head_idx] *
(options.left_context - len(previous_heads)) +
previous_heads)
previous_labels = (
[start_label_idx] *
(options.left_context - len(previous_labels)) +
previous_labels)
if len(subsequent_heads) < options.right_context:
subsequent_heads = subsequent_heads + [stop_head_idx] * (options.right_context-len(subsequent_heads))
subsequent_labels = subsequent_labels + [stop_label_idx] * (options.right_context-len(subsequent_labels))
subsequent_heads += (
[stop_head_idx] *
(options.right_context - len(subsequent_heads)))
subsequent_labels += (
[stop_label_idx] *
(options.right_context - len(subsequent_labels)))
int_list = []
int_list.extend(previous_heads)
@ -209,14 +295,19 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
int_list.append(output_vocab.get(labels[i], 0))
elif options.mode == 'head':
int_list.append(vocab.get(labels[i], 0))
int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0)))
int_list.append(
output_vocab.get(
heads[i], output_vocab.get(preterminals[i], 0)))
options.output.write(' '.join(map(str, int_list)) + '\n')
parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0)))
parent_heads.append(
vocab.get(heads[i], vocab.get(preterminals[i], 0)))
parent_labels.append(vocab.get(labels[i], 0))
get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
get_syntactic_ngrams(
child, options, vocab, output_vocab, parent_heads,
parent_labels)
parent_heads.pop()
parent_labels.pop()
@ -224,15 +315,17 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
def load_vocab(path):
v = {}
for i,line in enumerate(open(path, encoding="UTF-8")):
for i, line in enumerate(open(path, encoding="UTF-8")):
v[line.strip()] = i
return v
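As a small worked example of the vocabulary format this expects (one item per line, '<unk>' on the first line, as stated in the --vocab help; the file content below is invented):

# Hypothetical vocabulary file:
#   <unk>
#   <null>
#   NP
# load_vocab() maps each line to its zero-based line number:
#   {'<unk>': 0, '<null>': 1, 'NP': 2}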
def main(options):
vocab = load_vocab(options.vocab)
if options.output_vocab is None:
sys.stderr.write('no output vocabulary specified; using input vocabulary\n')
sys.stderr.write(
"No output vocabulary specified; using input vocabulary.\n")
output_vocab = vocab
else:
output_vocab = load_vocab(options.output_vocab)
@ -275,4 +368,4 @@ if __name__ == '__main__':
parser = create_parser()
options = parser.parse_args()
main(options)
main(options)

View File

@ -9,6 +9,7 @@ import sys
import codecs
import argparse
from collections import Counter
from textwrap import dedent
# hack for python2/3 compatibility
from io import open
@ -19,37 +20,49 @@ try:
except ImportError:
from xml.etree import cElementTree as ET
HELP_TEXT = dedent("""\
generate 5 vocabulary files from parsed corpus in moses XML format
[PREFIX].special: around 40 symbols reserved for RDLM
[PREFIX].preterminals: preterminal symbols
[PREFIX].nonterminals: nonterminal symbols (which are not preterminal)
[PREFIX].terminals: terminal symbols
[PREFIX].all: all of the above
""")
def create_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description=HELP_TEXT)
help_text = "generate 5 vocabulary files from parsed corpus in moses XML format\n"
help_text += " [PREFIX].special: around 40 symbols reserved for RDLM\n";
help_text += " [PREFIX].preterminals: preterminal symbols\n";
help_text += " [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)\n";
help_text += " [PREFIX].terminals: terminal symbols\n";
help_text += " [PREFIX].all: all of the above\n"
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text)
parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
help='input text (default: standard input).')
parser.add_argument('--output', '-o', type=str, default='vocab', metavar='PREFIX',
help='output prefix (default: "vocab")')
parser.add_argument('--ptkvz', action="store_true",
help='special rule for German dependency trees: attach separable verb prefixes to verb')
parser.add_argument(
'--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
metavar='PATH',
help="Input text (default: standard input).")
parser.add_argument(
'--output', '-o', type=str, default='vocab', metavar='PREFIX',
help="Output prefix (default: 'vocab')")
parser.add_argument(
'--ptkvz', action="store_true",
help=(
"Special rule for German dependency trees: attach separable "
"verb prefixes to verb."))
return parser
def escape_text(s):
s = s.replace('|','&#124;') # factor separator
s = s.replace('[','&#91;') # syntax non-terminal
s = s.replace(']','&#93;') # syntax non-terminal
s = s.replace('\'','&apos;') # xml special character
s = s.replace('"','&quot;') # xml special character
def escape_text(s):
s = s.replace('|', '&#124;') # factor separator
s = s.replace('[', '&#91;') # syntax non-terminal
s = s.replace(']', '&#93;') # syntax non-terminal
s = s.replace('\'', '&apos;') # xml special character
s = s.replace('"', '&quot;') # xml special character
return s
# deterministic heuristic to get head of subtree
def get_head(xml, args):
"""Deterministic heuristic to get head of subtree."""
head = None
preterminal = None
for child in xml:
@ -67,6 +80,7 @@ def get_head(xml, args):
return head, preterminal
def get_vocab(xml, args):
if len(xml):
@ -88,6 +102,7 @@ def get_vocab(xml, args):
continue
get_vocab(child, args)
def main(args):
global heads
@ -111,10 +126,24 @@ def main(args):
get_vocab(xml, args)
i += 1
special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']
special_tokens = [
'<unk>',
'<null>',
'<null_label>',
'<null_head>',
'<head_label>',
'<root_label>',
'<start_label>',
'<stop_label>',
'<head_head>',
'<root_head>',
'<start_head>',
'<dummy_head>',
'<stop_head>',
]
for i in range(30):
special_tokens.append('<null_{0}>'.format(i))
special_tokens.append('<null_{0}>'.format(i))
f = open(args.output + '.special', 'w', encoding='UTF-8')
for item in special_tokens:
@ -158,7 +187,6 @@ def main(args):
f.close()
if __name__ == '__main__':
if sys.version_info < (3, 0):

View File

@ -9,7 +9,6 @@ import subprocess
import sys
import os
import codecs
import copy
# ../bilingual-lm
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), 'bilingual-lm'))
@ -17,143 +16,224 @@ import train_nplm
import extract_vocab
import extract_syntactic_ngrams
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
logging.basicConfig(
format='%(asctime)s %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = argparse.ArgumentParser()
parser.add_argument("--working-dir", dest="working_dir", metavar="PATH")
parser.add_argument("--corpus", dest="corpus_stem", metavar="PATH", help="input file")
parser.add_argument("--nplm-home", dest="nplm_home", metavar="PATH", help="location of NPLM", required=True)
parser.add_argument("--epochs", dest="epochs", type=int, metavar="INT", help="number of training epochs (default: %(default)s)")
parser.add_argument("--up-context-size", dest="up_context_size", type=int, metavar="INT", help="size of ancestor context (default: %(default)s)")
parser.add_argument("--left-context-size", dest="left_context_size", type=int, metavar="INT", help="size of sibling context (left) (default: %(default)s)")
parser.add_argument("--right-context-size", dest="right_context_size", type=int, metavar="INT", help="size of sibling context (right) (default: %(default)s)")
parser.add_argument("--mode", dest="mode", choices=['head', 'label'], help="type of RDLM to train (both are required for decoding)", required=True)
parser.add_argument("--minibatch-size", dest="minibatch_size", type=int, metavar="INT", help="minibatch size (default: %(default)s)")
parser.add_argument("--noise", dest="noise", type=int, metavar="INT", help="number of noise samples for NCE (default: %(default)s)")
parser.add_argument("--hidden", dest="hidden", type=int, metavar="INT", help="size of hidden layer (0 for single hidden layer) (default: %(default)s)")
parser.add_argument("--input-embedding", dest="input_embedding", type=int, metavar="INT", help="size of input embedding layer (default: %(default)s)")
parser.add_argument("--output-embedding", dest="output_embedding", type=int, metavar="INT", help="size of output embedding layer (default: %(default)s)")
parser.add_argument("--threads", "-t", dest="threads", type=int, metavar="INT", help="number of threads (default: %(default)s)")
parser.add_argument("--output-model", dest="output_model", metavar="PATH", help="name of output model (default: %(default)s)")
parser.add_argument("--output-dir", dest="output_dir", metavar="PATH", help="output directory (default: same as working-dir)")
parser.add_argument("--config-options-file", dest="config_options_file", metavar="PATH")
parser.add_argument("--log-file", dest="log_file", metavar="PATH", help="log file to write to (default: %(default)s)")
parser.add_argument("--validation-corpus", dest="validation_corpus", metavar="PATH", help="validation file (default: %(default)s)")
parser.add_argument("--activation-function", dest="activation_fn", choices=['identity', 'rectifier', 'tanh', 'hardtanh'], help="activation function (default: %(default)s)")
parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar="FLOAT", help="learning rate (default: %(default)s)")
parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)")
parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)")
parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)")
parser.add_argument("--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)")
parser.add_argument(
"--working-dir", dest="working_dir", metavar="PATH")
parser.add_argument(
"--corpus", dest="corpus_stem", metavar="PATH", help="Input file.")
parser.add_argument(
"--nplm-home", dest="nplm_home", metavar="PATH", required=True,
help="Location of NPLM.")
parser.add_argument(
"--epochs", dest="epochs", type=int, metavar="INT",
help="Number of training epochs (default: %(default)s).")
parser.add_argument(
"--up-context-size", dest="up_context_size", type=int, metavar="INT",
help="Size of ancestor context (default: %(default)s).")
parser.add_argument(
"--left-context-size", dest="left_context_size", type=int, metavar="INT",
help="Size of sibling context (left) (default: %(default)s).")
parser.add_argument(
"--right-context-size", dest="right_context_size", type=int,
metavar="INT",
help="Size of sibling context (right) (default: %(default)s).")
parser.add_argument(
"--mode", dest="mode", choices=['head', 'label'], required=True,
help="Type of RDLM to train (both are required for decoding).")
parser.add_argument(
"--minibatch-size", dest="minibatch_size", type=int, metavar="INT",
help="Minibatch size (default: %(default)s).")
parser.add_argument(
"--noise", dest="noise", type=int, metavar="INT",
help="Number of noise samples for NCE (default: %(default)s).")
parser.add_argument(
"--hidden", dest="hidden", type=int, metavar="INT",
help=(
"Size of hidden layer (0 for single hidden layer) "
"(default: %(default)s)"))
parser.add_argument(
"--input-embedding", dest="input_embedding", type=int, metavar="INT",
help="Size of input embedding layer (default: %(default)s).")
parser.add_argument(
"--output-embedding", dest="output_embedding", type=int, metavar="INT",
help="Size of output embedding layer (default: %(default)s).")
parser.add_argument(
"--threads", "-t", dest="threads", type=int, metavar="INT",
help="Number of threads (default: %(default)s).")
parser.add_argument(
"--output-model", dest="output_model", metavar="PATH",
help="Name of output model (default: %(default)s).")
parser.add_argument(
"--output-dir", dest="output_dir", metavar="PATH",
help="Output directory (default: same as working-dir).")
parser.add_argument(
"--config-options-file", dest="config_options_file", metavar="PATH")
parser.add_argument(
"--log-file", dest="log_file", metavar="PATH",
help="Log file to write to (default: %(default)s).")
parser.add_argument(
"--validation-corpus", dest="validation_corpus", metavar="PATH",
help="Validation file (default: %(default)s).")
parser.add_argument(
"--activation-function", dest="activation_fn",
choices=['identity', 'rectifier', 'tanh', 'hardtanh'],
help="Activation function (default: %(default)s).")
parser.add_argument(
"--learning-rate", dest="learning_rate", type=float, metavar="FLOAT",
help="Learning rate (default: %(default)s).")
parser.add_argument(
"--input-words-file", dest="input_words_file", metavar="PATH",
help="Input vocabulary (default: %(default)s).")
parser.add_argument(
"--output-words-file", dest="output_words_file", metavar="PATH",
help="Output vocabulary (default: %(default)s).")
parser.add_argument(
"--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT",
help="Input vocabulary size (default: %(default)s).")
parser.add_argument(
"--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT",
help="Output vocabulary size (default: %(default)s).")
parser.set_defaults(
working_dir = "working"
,corpus_stem = "train"
,nplm_home = "/home/bhaddow/tools/nplm"
,epochs = 2
,up_context_size = 2
,left_context_size = 3
,right_context_size = 0
,minibatch_size=1000
,noise=100
,hidden=0
,mode='head'
,input_embedding=150
,output_embedding=750
,threads=4
,output_model = "train"
,output_dir = None
,config_options_file = "config"
,log_file = "log"
,validation_corpus = None
,activation_fn = "rectifier"
,learning_rate = 1
,input_words_file = None
,output_words_file = None
,input_vocab_size = 500000
,output_vocab_size = 500000
)
working_dir="working",
corpus_stem="train",
nplm_home="/home/bhaddow/tools/nplm",
epochs=2,
up_context_size=2,
left_context_size=3,
right_context_size=0,
minibatch_size=1000,
noise=100,
hidden=0,
mode='head',
input_embedding=150,
output_embedding=750,
threads=4,
output_model="train",
output_dir=None,
config_options_file="config",
log_file="log",
validation_corpus=None,
activation_fn="rectifier",
learning_rate=1,
input_words_file=None,
output_words_file=None,
input_vocab_size=500000,
output_vocab_size=500000)
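For illustration, only the two required options (--nplm-home and --mode) have to be given on the command line; everything else falls back to the defaults above (the path below is hypothetical):

# Hypothetical invocation; '/opt/nplm' is a made-up path.
opts = parser.parse_args(['--nplm-home', '/opt/nplm', '--mode', 'head'])
assert opts.epochs == 2 and opts.left_context_size == 3
assert opts.output_dir is None  # per the help text, falls back to working-dir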
def prepare_vocabulary(options):
vocab_prefix = os.path.join(options.working_dir, 'vocab')
extract_vocab_options = extract_vocab.create_parser().parse_args(['--input', options.corpus_stem, '--output', vocab_prefix])
extract_vocab.main(extract_vocab_options)
vocab_prefix = os.path.join(options.working_dir, 'vocab')
extract_vocab_options = extract_vocab.create_parser().parse_args(
['--input', options.corpus_stem, '--output', vocab_prefix])
extract_vocab.main(extract_vocab_options)
if options.input_words_file is None:
options.input_words_file = vocab_prefix + '.input'
orig = vocab_prefix + '.all'
filtered_vocab = open(orig).readlines()
if options.input_vocab_size:
filtered_vocab = filtered_vocab[:options.input_vocab_size]
open(options.input_words_file,'w').writelines(filtered_vocab)
if options.input_words_file is None:
options.input_words_file = vocab_prefix + '.input'
orig = vocab_prefix + '.all'
filtered_vocab = open(orig).readlines()
if options.input_vocab_size:
filtered_vocab = filtered_vocab[:options.input_vocab_size]
open(options.input_words_file, 'w').writelines(filtered_vocab)
if options.output_words_file is None:
options.output_words_file = vocab_prefix + '.output'
if options.mode == 'label':
blacklist = [
'<null',
'<root',
'<start_head',
'<dummy',
'<head_head',
'<stop_head',
]
orig = vocab_prefix + '.special'
filtered_vocab = open(orig).readlines()
orig = vocab_prefix + '.nonterminals'
filtered_vocab += open(orig).readlines()
filtered_vocab = [
word
for word in filtered_vocab
if not any(word.startswith(prefix) for prefix in blacklist)]
if options.output_vocab_size:
filtered_vocab = filtered_vocab[:options.output_vocab_size]
else:
orig = vocab_prefix + '.all'
filtered_vocab = open(orig).readlines()[:options.output_vocab_size]
open(options.output_words_file, 'w').writelines(filtered_vocab)
if options.output_words_file is None:
options.output_words_file = vocab_prefix + '.output'
if options.mode == 'label':
blacklist = ['<null', '<root', '<start_head', '<dummy', '<head_head', '<stop_head']
orig = vocab_prefix + '.special'
filtered_vocab = open(orig).readlines()
orig = vocab_prefix + '.nonterminals'
filtered_vocab += open(orig).readlines()
filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)]
if options.output_vocab_size:
filtered_vocab = filtered_vocab[:options.output_vocab_size]
else:
orig = vocab_prefix + '.all'
filtered_vocab = open(orig).readlines()[:options.output_vocab_size]
open(options.output_words_file,'w').writelines(filtered_vocab)
def main(options):
options.ngram_size = 2*options.up_context_size + 2*options.left_context_size + 2*options.right_context_size
if options.mode == 'head':
options.ngram_size += 2
elif options.mode == 'label':
options.ngram_size += 1
options.ngram_size = (
2 * options.up_context_size +
2 * options.left_context_size +
2 * options.right_context_size
)
if options.mode == 'head':
options.ngram_size += 2
elif options.mode == 'label':
options.ngram_size += 1
if options.input_words_file is None or options.output_words_file is None:
sys.stderr.write('either input vocabulary or output vocabulary not specified: extracting vocabulary from training text\n')
prepare_vocabulary(options)
if options.input_words_file is None or options.output_words_file is None:
sys.stderr.write(
"Either input vocabulary or output vocabulary not specified: "
"extracting vocabulary from training text.\n")
prepare_vocabulary(options)
extract_options = extract_syntactic_ngrams.create_parser().parse_args(['--input', options.corpus_stem,
'--output', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
'--vocab', options.input_words_file,
'--output_vocab', options.output_words_file,
'--right_context', str(options.right_context_size),
'--left_context', str(options.left_context_size),
'--up_context', str(options.up_context_size),
'--mode', options.mode
])
sys.stderr.write('extracting syntactic n-grams\n')
extract_syntactic_ngrams.main(extract_options)
if options.validation_corpus:
extract_options.input = open(options.validation_corpus)
options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus))
extract_options.output = open(options.validation_file + '.numberized', 'w')
sys.stderr.write('extracting syntactic n-grams (validation file)\n')
extract_options = extract_syntactic_ngrams.create_parser().parse_args([
'--input', options.corpus_stem,
'--output', os.path.join(
options.working_dir,
os.path.basename(options.corpus_stem) + '.numberized'),
'--vocab', options.input_words_file,
'--output_vocab', options.output_words_file,
'--right_context', str(options.right_context_size),
'--left_context', str(options.left_context_size),
'--up_context', str(options.up_context_size),
'--mode', options.mode
])
sys.stderr.write('extracting syntactic n-grams\n')
extract_syntactic_ngrams.main(extract_options)
extract_options.output.close()
sys.stderr.write('training neural network\n')
train_nplm.main(options)
if options.validation_corpus:
extract_options.input = open(options.validation_corpus)
options.validation_file = os.path.join(
options.working_dir, os.path.basename(options.validation_corpus))
extract_options.output = open(
options.validation_file + '.numberized', 'w')
sys.stderr.write('extracting syntactic n-grams (validation file)\n')
extract_syntactic_ngrams.main(extract_options)
extract_options.output.close()
sys.stderr.write('training neural network\n')
train_nplm.main(options)
sys.stderr.write('averaging null words\n')
ret = subprocess.call([
os.path.join(sys.path[0], 'average_null_embedding.py'),
options.nplm_home,
os.path.join(
options.output_dir,
options.output_model + '.model.nplm.' + str(options.epochs)),
os.path.join(
options.working_dir,
os.path.basename(options.corpus_stem) + '.numberized'),
os.path.join(options.output_dir, options.output_model + '.model.nplm')
])
if ret:
raise Exception("averaging null words failed")
sys.stderr.write('averaging null words\n')
ret = subprocess.call([os.path.join(sys.path[0], 'average_null_embedding.py'),
options.nplm_home,
os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
os.path.join(options.output_dir, options.output_model + '.model.nplm')
])
if ret:
raise Exception("averaging null words failed")
if __name__ == "__main__":
if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
options = parser.parse_args()
main(options)
if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
options = parser.parse_args()
main(options)

View File

@ -2,42 +2,76 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
# takes a file in the CoNLL dependency format (from the CoNLL-X shared task on dependency parsing; http://ilk.uvt.nl/conll/#dataformat )
# and produces Moses XML format. Note that the structure is built based on fields 9 and 10 (projective HEAD and RELATION),
# which not all parsers produce.
"""
Takes a file in the CoNLL dependency format (from the CoNLL-X shared task on
dependency parsing; http://ilk.uvt.nl/conll/#dataformat ) and produces
Moses XML format.
# usage: conll2mosesxml.py [--brackets] < input_file > output_file
Note that the structure is built based on fields 9 and 10 (projective HEAD
and RELATION), which not all parsers produce.
Usage: conll2mosesxml.py [--brackets] < input_file > output_file
"""
from __future__ import print_function, unicode_literals
import sys
import re
import codecs
from collections import namedtuple,defaultdict
from collections import (
namedtuple,
defaultdict,
)
from lxml import etree as ET
Word = namedtuple('Word', ['pos','word','lemma','tag','head','func', 'proj_head', 'proj_func'])
Word = namedtuple(
'Word',
['pos', 'word', 'lemma', 'tag', 'head', 'func', 'proj_head', 'proj_func'])
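As a worked example of the field layout (the token line is invented; main() below keeps field 5 as the tag and fields 9/10 as the projective head and relation):

# Hypothetical CoNLL-X token line (ten tab-separated fields):
#   1  Das  das  ART  ART  _  2  det  2  det
# main() below stores this as:
#   Word(pos=1, word='Das', lemma='das', tag='ART', head=2,
#        func='det', proj_head=2, proj_func='det')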
def main(output_format='xml'):
sentence = []
for line in sys.stdin:
# process sentence
# Process sentence.
if line == "\n":
sentence.insert(0,[])
sentence.insert(0, [])
if is_projective(sentence):
write(sentence,output_format)
write(sentence, output_format)
else:
sys.stderr.write(' '.join(w.word for w in sentence[1:]) + '\n')
sys.stderr.write(
' '.join(w.word for w in sentence[1:]) + '\n')
sys.stdout.write('\n')
sentence = []
continue
try:
pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = line.split()
except ValueError: # word may be unicode whitespace
pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = re.split(' *\t*',line.strip())
(
pos,
word,
lemma,
tag,
tag2,
morph,
head,
func,
proj_head,
proj_func,
) = line.split()
except ValueError: # Word may be unicode whitespace.
(
pos,
word,
lemma,
tag,
tag2,
morph,
head,
func,
proj_head,
proj_func,
) = re.split(' *\t*', line.strip())
word = escape_special_chars(word)
lemma = escape_special_chars(lemma)
@ -46,17 +80,20 @@ def main(output_format='xml'):
proj_head = head
proj_func = func
sentence.append(Word(int(pos), word, lemma, tag2,int(head), func, int(proj_head), proj_func))
sentence.append(
Word(
int(pos), word, lemma, tag2, int(head), func, int(proj_head),
proj_func))
# this script performs the same escaping as escape-special-chars.perl in Moses.
# most of it is done in function write(), but quotation marks need to be processed first
# This script performs the same escaping as escape-special-chars.perl in
# Moses. Most of it is done in function write(), but quotation marks need
# to be processed first.
def escape_special_chars(line):
line = line.replace('\'','&apos;') # xml
line = line.replace('"','&quot;') # xml
line = line.replace('[','&#91;') # syntax non-terminal
line = line.replace(']','&#93;') # syntax non-terminal
line = line.replace('\'', '&apos;') # xml
line = line.replace('"', '&quot;') # xml
line = line.replace('[', '&#91;') # syntax non-terminal
line = line.replace(']', '&#93;') # syntax non-terminal
return line
@ -64,7 +101,7 @@ def escape_special_chars(line):
# make a check if structure is projective
def is_projective(sentence):
dominates = defaultdict(set)
for i,w in enumerate(sentence):
for i, w in enumerate(sentence):
dominates[i].add(i)
if not i:
continue
@ -77,7 +114,7 @@ def is_projective(sentence):
for i in dominates:
dependents = dominates[i]
if max(dependents) - min(dependents) != len(dependents)-1:
if max(dependents) - min(dependents) != len(dependents) - 1:
sys.stderr.write("error: non-projective structure.\n")
return False
return True
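For intuition, a self-contained sketch of the same contiguity criterion (not the function above; here each token's head is given directly as a 1-based index, 0 for the root):

# Minimal, illustrative projectivity check: a structure is projective
# iff every token's yield (itself plus all descendants) is a
# contiguous span of positions.
def is_projective_sketch(heads):
    yields = [{i + 1} for i in range(len(heads))]
    changed = True
    while changed:
        changed = False
        for i, h in enumerate(heads):
            if h and not yields[i] <= yields[h - 1]:
                yields[h - 1] |= yields[i]
                changed = True
    return all(max(y) - min(y) == len(y) - 1 for y in yields)

# 1->2, 2->root, 3->2: projective.
assert is_projective_sketch([2, 0, 2])
# Arcs 1->3 and 2->4 cross: non-projective.
assert not is_projective_sketch([3, 4, 0, 3])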
@ -86,24 +123,28 @@ def is_projective(sentence):
def write(sentence, output_format='xml'):
if output_format == 'xml':
tree = create_subtree(0,sentence)
out = ET.tostring(tree, encoding = 'UTF-8').decode('UTF-8')
tree = create_subtree(0, sentence)
out = ET.tostring(tree, encoding='UTF-8').decode('UTF-8')
if output_format == 'brackets':
out = create_brackets(0,sentence)
out = create_brackets(0, sentence)
out = out.replace('|','&#124;') # factor separator
out = out.replace('|', '&#124;') # factor separator
out = out.replace('&amp;apos;','&apos;') # lxml is buggy if input is escaped
out = out.replace('&amp;quot;','&quot;') # lxml is buggy if input is escaped
out = out.replace('&amp;#91;','&#91;') # lxml is buggy if input is escaped
out = out.replace('&amp;#93;','&#93;') # lxml is buggy if input is escaped
# lxml is buggy if input is escaped:
out = out.replace('&amp;apos;', '&apos;')
# lxml is buggy if input is escaped:
out = out.replace('&amp;quot;', '&quot;')
# lxml is buggy if input is escaped:
out = out.replace('&amp;#91;', '&#91;')
# lxml is buggy if input is escaped:
out = out.replace('&amp;#93;', '&#93;')
print(out)
# write node in Moses XML format
def create_subtree(position, sentence):
def create_subtree(position, sentence):
""""Write node in Moses XML format."""
element = ET.Element('tree')
if position:
@ -111,7 +152,7 @@ def create_subtree(position, sentence):
else:
element.set('label', 'sent')
for i in range(1,position):
for i in range(1, position):
if sentence[i].proj_head == position:
element.append(create_subtree(i, sentence))
@ -144,7 +185,7 @@ def create_brackets(position, sentence):
else:
element = "[ sent "
for i in range(1,position):
for i in range(1, position):
if sentence[i].proj_head == position:
element += create_brackets(i, sentence)
@ -167,7 +208,7 @@ def create_brackets(position, sentence):
return element
if __name__ == '__main__':
if sys.version_info < (3,0,0):
if sys.version_info < (3, 0, 0):
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)

View File

@ -10,17 +10,21 @@ import codecs
from lxml import etree as ET
def escape(word):
word = word.replace('|','&#124;') # factor separator
word = word.replace('[','&#91;') # syntax non-terminal
word = word.replace(']','&#93;') # syntax non-terminal
word = word.replace('\'','&apos;')
word = word.replace('\"','&quot;')
# Factor separator:
word = word.replace('|', '&#124;')
# Syntax non-terminal:
word = word.replace('[', '&#91;')
# Syntax non-terminal:
word = word.replace(']', '&#93;')
word = word.replace('\'', '&apos;')
word = word.replace('\"', '&quot;')
return word
def make_brackets(xml):
def make_brackets(xml):
out = ' [' + xml.get('label')
if xml.text and xml.text.strip():