Mirror of https://github.com/moses-smt/mosesdecoder.git (synced 2024-12-26 13:23:25 +03:00)
Fix more Python lint.
Most of the complaints fixed here were from Pocketlint, but many were also from Syntastic, the vim plugin.
This commit is contained in:
parent c07ade8142
commit 61162dd242
@ -1,53 +1,48 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
#
|
||||
# Version of ConfigParser which accepts default values
|
||||
#
|
||||
"""Version of ConfigParser which accepts default values."""
|
||||
|
||||
|
||||
import ConfigParser
|
||||
|
||||
|
||||
class Config:
|
||||
def __init__(self,filename):
|
||||
self.config = ConfigParser.SafeConfigParser()
|
||||
cfh = open(filename)
|
||||
self.config.readfp(cfh)
|
||||
cfh.close()
|
||||
"""Version of ConfigParser which accepts default values."""
|
||||
|
||||
def get(self,section,name,default=None):
|
||||
if default == None or self.config.has_option(section,name):
|
||||
return self.config.get(section,name)
|
||||
else:
|
||||
return default
|
||||
def __init__(self, filename):
|
||||
self.config = ConfigParser.SafeConfigParser()
|
||||
cfh = open(filename)
|
||||
self.config.readfp(cfh)
|
||||
cfh.close()
|
||||
|
||||
def getint(self,section,name,default=None):
|
||||
if default == None or self.config.has_option(section,name):
|
||||
return self.config.getint(section,name)
|
||||
else:
|
||||
return default
|
||||
def get(self, section, name, default=None):
|
||||
if default is None or self.config.has_option(section, name):
|
||||
return self.config.get(section, name)
|
||||
else:
|
||||
return default
|
||||
|
||||
def getint(self, section, name, default=None):
|
||||
if default is None or self.config.has_option(section, name):
|
||||
return self.config.getint(section, name)
|
||||
else:
|
||||
return default
|
||||
|
||||
def getboolean(self,section,name,default=None):
|
||||
if default == None or self.config.has_option(section,name):
|
||||
return self.config.getboolean(section,name)
|
||||
else:
|
||||
return default
|
||||
|
||||
|
||||
def getfloat(self,section,name,default=None):
|
||||
if default == None or self.config.has_option(section,name):
|
||||
return self.config.getfloat(section,name)
|
||||
else:
|
||||
return default
|
||||
|
||||
|
||||
def __str__(self):
|
||||
ret = ""
|
||||
for section in self.config.sections():
|
||||
for option in self.config.options(section):
|
||||
ret = ret + "%s:%s = %s\n" % (section,option,self.config.get(section,option))
|
||||
return ret
|
||||
|
||||
def getboolean(self, section, name, default=None):
|
||||
if default is None or self.config.has_option(section, name):
|
||||
return self.config.getboolean(section, name)
|
||||
else:
|
||||
return default
|
||||
|
||||
def getfloat(self, section, name, default=None):
|
||||
if default is None or self.config.has_option(section, name):
|
||||
return self.config.getfloat(section, name)
|
||||
else:
|
||||
return default
|
||||
|
||||
def __str__(self):
|
||||
ret = ""
|
||||
for section in self.config.sections():
|
||||
for option in self.config.options(section):
|
||||
ret = ret + "%s:%s = %s\n" % (
|
||||
section, option, self.config.get(section, option))
|
||||
return ret
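As a usage sketch of this Config wrapper (Python 2; the config file name and the "general"/"verbose" option are invented for illustration):

# Hypothetical usage of defaultconfig.Config.
from defaultconfig import Config

config = Config("filter.cfg")  # invented file name
# Falls back to the supplied default when the option is absent.
threshold = config.getfloat("random", "threshold", 0.1)
verbose = config.getboolean("general", "verbose", False)
print "threshold=%s verbose=%s" % (threshold, verbose)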
|
||||
|
@ -1,156 +1,171 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
#
|
||||
# Filter a parallel corpus
|
||||
#
|
||||
"""Filter a parallel corpus."""
|
||||
|
||||
|
||||
import heapq
|
||||
import logging
|
||||
import math
|
||||
import optparse
|
||||
import random
|
||||
import sys
|
||||
|
||||
from defaultconfig import Config
|
||||
|
||||
logging.basicConfig(format = "%(asctime)-15s %(message)s")
|
||||
|
||||
logging.basicConfig(format="%(asctime)-15s %(message)s")
|
||||
log = logging.getLogger("filter")
|
||||
log.setLevel(logging.DEBUG)
|
||||
|
||||
class FilterStrategy(object):
|
||||
def __init__(self,config):
|
||||
pass
|
||||
|
||||
def filter(self,source,target):
|
||||
return True
|
||||
class FilterStrategy(object):
|
||||
def __init__(self, config):
|
||||
pass
|
||||
|
||||
def filter(self, source, target):
|
||||
return True
|
||||
|
||||
|
||||
class RandomFilterStrategy(FilterStrategy):
|
||||
def __init__(self,config):
|
||||
self.threshold = config.getfloat("random", "threshold", 0.1)
|
||||
random.seed()
|
||||
def __init__(self, config):
|
||||
self.threshold = config.getfloat("random", "threshold", 0.1)
|
||||
random.seed()
|
||||
|
||||
def filter(self, source, target):
|
||||
return random.random() < self.threshold
|
||||
def filter(self, source, target):
|
||||
return random.random() < self.threshold
|
||||
|
||||
|
||||
class ScoreFilterStrategy(FilterStrategy):
|
||||
"""Filter strategy that is based on a file with sentence scores. There are three
|
||||
possible ways of specifying how to filter:
|
||||
i) threshold - filter all sentence pairs whose score is less than the threshold
|
||||
ii) proportion - filter all but a certain proportion (eg a tenth) of the sentences
|
||||
"""Filter strategy that is based on a file with sentence scores.
|
||||
|
||||
There are three possible ways of specifying how to filter:
|
||||
i) threshold - filter all sentence pairs whose score is less than the
|
||||
threshold.
|
||||
ii) proportion - filter all but a certain proportion (eg a tenth) of the
|
||||
sentences.
|
||||
iii) count - filter all but a given count of the sentences.
|
||||
"""
|
||||
def __init__(self,config):
|
||||
section = "score"
|
||||
self.score_file = config.get(section,"score_file")
|
||||
self.ignore_score = config.get(section, "ignore_score", "99999")
|
||||
option_names = ("threshold", "proportion", "count")
|
||||
options = [config.config.has_option(section,o) for o in option_names]
|
||||
if sum(options) != 1:
|
||||
raise RuntimeError("Must specify exactly one of %s for score filter" % str(option_names))
|
||||
if options[0]:
|
||||
# threshold
|
||||
self.threshold = config.getfloat(section,option_names[0])
|
||||
else:
|
||||
# proportion or count
|
||||
if options[2]:
|
||||
count = config.getint(section,option_names[2])
|
||||
else:
|
||||
# need to count entries
|
||||
count = 0
|
||||
ignore_count = 0
|
||||
for line in open(self.score_file):
|
||||
if line[:-1] != self.ignore_score:
|
||||
count = count + 1
|
||||
else:
|
||||
ignore_count = ignore_count + 1
|
||||
count = int(count * config.getfloat(section,option_names[1]))
|
||||
log.info("Retaining at least %d entries and ignoring %d" % (count, ignore_count))
|
||||
# Find the threshold
|
||||
self.threshold = sorted(\
|
||||
[float(line[:-1]) for line in open(self.score_file)], reverse=True)[ignore_count + count]
|
||||
#self.threshold = heapq.nlargest(count, \
|
||||
# [float(line[:-1]) for line in open(self.score_file)])[-1]
|
||||
|
||||
def __init__(self, config):
|
||||
section = "score"
|
||||
self.score_file = config.get(section, "score_file")
|
||||
self.ignore_score = config.get(section, "ignore_score", "99999")
|
||||
option_names = ("threshold", "proportion", "count")
|
||||
options = [config.config.has_option(section, o) for o in option_names]
|
||||
if sum(options) != 1:
|
||||
raise RuntimeError(
|
||||
"Must specify exactly one of %s for score filter"
|
||||
% str(option_names))
|
||||
if options[0]:
|
||||
# Threshold.
|
||||
self.threshold = config.getfloat(section, option_names[0])
|
||||
else:
|
||||
# proportion or count
|
||||
if options[2]:
|
||||
count = config.getint(section, option_names[2])
|
||||
else:
|
||||
# Need to count entries.
|
||||
count = 0
|
||||
ignore_count = 0
|
||||
for line in open(self.score_file):
|
||||
if line[:-1] != self.ignore_score:
|
||||
count += 1
|
||||
else:
|
||||
ignore_count = ignore_count + 1
|
||||
count = int(count * config.getfloat(section, option_names[1]))
|
||||
log.info(
|
||||
"Retaining at least %d entries and ignoring %d"
|
||||
% (count, ignore_count))
|
||||
# Find the threshold.
|
||||
self.threshold = sorted([
|
||||
float(line[:-1])
|
||||
for line in open(self.score_file)],
|
||||
reverse=True)[ignore_count + count]
|
||||
# import heapq
|
||||
# self.threshold = heapq.nlargest(
|
||||
# count,
|
||||
# [float(line[:-1]) for line in open(self.score_file)])[-1]
|
||||
|
||||
self.sfh = open(self.score_file)
|
||||
log.info("Thresholding scores at " + str(self.threshold))
|
||||
self.sfh = open(self.score_file)
|
||||
log.info("Thresholding scores at " + str(self.threshold))
|
||||
|
||||
def filter(self, source, target):
|
||||
score = self.sfh.readline()
|
||||
if not score:
|
||||
raise RuntimeError("score file truncated")
|
||||
return (
|
||||
score[:-1] == self.ignore_score or
|
||||
float(score[:-1]) >= self.threshold
|
||||
)
|
||||
|
||||
def filter(self,source,target):
|
||||
score = self.sfh.readline()
|
||||
if not score:
|
||||
raise RuntimeError("score file truncated")
|
||||
return score[:-1] == self.ignore_score or float(score[:-1]) >= self.threshold
|
||||
|
||||
|
||||
def main():
|
||||
parser = optparse.OptionParser(usage = "Usage: %prog [options] config-file")
|
||||
(options,args) = parser.parse_args()
|
||||
if len(args) < 1:
|
||||
parser.error("No configuration file specified")
|
||||
parser = optparse.OptionParser(usage="Usage: %prog [options] config-file")
|
||||
(options, args) = parser.parse_args()
|
||||
if len(args) < 1:
|
||||
parser.error("No configuration file specified")
|
||||
|
||||
log.info("Loading configuration from " + args[0])
|
||||
config = Config(args[0])
|
||||
log.debug("Configuration:\n" + str(config))
|
||||
log.info("Loading configuration from " + args[0])
|
||||
config = Config(args[0])
|
||||
log.debug("Configuration:\n" + str(config))
|
||||
|
||||
# Required general parameters
|
||||
source_lang = config.get("general", "source_language")
|
||||
target_lang = config.get("general", "target_language")
|
||||
input_stem = config.get("general", "input_stem")
|
||||
output_stem = config.get("general", "output_stem")
|
||||
strategy = config.get("general", "strategy", "")
|
||||
# Required general parameters
|
||||
source_lang = config.get("general", "source_language")
|
||||
target_lang = config.get("general", "target_language")
|
||||
input_stem = config.get("general", "input_stem")
|
||||
output_stem = config.get("general", "output_stem")
|
||||
strategy = config.get("general", "strategy", "")
|
||||
|
||||
# Optional general parameters
|
||||
alignment_stem = config.get("general", "alignment_stem", "")
|
||||
alignment_type = config.get("general", "alignment_type", "grow-diag-final-and")
|
||||
domain_file_in = config.get("general", "domain_file", "")
|
||||
domain_file_out = config.get("general", "domain_file_out", "")
|
||||
# Optional general parameters
|
||||
alignment_stem = config.get("general", "alignment_stem", "")
|
||||
alignment_type = config.get(
|
||||
"general", "alignment_type", "grow-diag-final-and")
|
||||
domain_file_in = config.get("general", "domain_file", "")
|
||||
domain_file_out = config.get("general", "domain_file_out", "")
|
||||
|
||||
strategy_class = globals()[strategy + "FilterStrategy"]
|
||||
strategy = strategy_class(config)
|
||||
strategy_class = globals()[strategy + "FilterStrategy"]
|
||||
strategy = strategy_class(config)
|
||||
|
||||
source_input_fh = open(input_stem + "." + source_lang)
|
||||
target_input_fh = open(input_stem + "." + target_lang)
|
||||
source_output_fh = open(output_stem + "." + source_lang, "w")
|
||||
target_output_fh = open(output_stem + "." + target_lang, "w")
|
||||
source_input_fh = open(input_stem + "." + source_lang)
|
||||
target_input_fh = open(input_stem + "." + target_lang)
|
||||
source_output_fh = open(output_stem + "." + source_lang, "w")
|
||||
target_output_fh = open(output_stem + "." + target_lang, "w")
|
||||
|
||||
alignment_input_fh = None
|
||||
alignment_output_fh = None
|
||||
if alignment_stem:
|
||||
alignment_input_fh = open(alignment_stem + "." + alignment_type)
|
||||
alignment_output_fh = open(output_stem + "." + alignment_type,"w")
|
||||
alignment_input_fh = None
|
||||
alignment_output_fh = None
|
||||
if alignment_stem:
|
||||
alignment_input_fh = open(alignment_stem + "." + alignment_type)
|
||||
alignment_output_fh = open(output_stem + "." + alignment_type, "w")
|
||||
|
||||
domain_boundaries = {}
|
||||
if domain_file_in:
|
||||
dfh = open(domain_file_in)
|
||||
for line in dfh:
|
||||
line_no,name = line[:-1].split()
|
||||
domain_boundaries[int(line_no)] = name
|
||||
|
||||
domain_output_fh = None
|
||||
if domain_file_out:
|
||||
domain_output_fh = open(domain_file_out, "w")
|
||||
domain_boundaries = {}
|
||||
if domain_file_in:
|
||||
dfh = open(domain_file_in)
|
||||
for line in dfh:
|
||||
line_no, name = line[:-1].split()
|
||||
domain_boundaries[int(line_no)] = name
|
||||
|
||||
#log.info(str(domain_boundaries))
|
||||
domain_output_fh = None
|
||||
if domain_file_out:
|
||||
domain_output_fh = open(domain_file_out, "w")
|
||||
|
||||
# log.info(str(domain_boundaries))
|
||||
|
||||
retained = 0
|
||||
line_no = 0
|
||||
for source_line in source_input_fh:
|
||||
target_line = target_input_fh.readline()
|
||||
if alignment_input_fh:
|
||||
align_line = alignment_input_fh.readline()
|
||||
if strategy.filter(source_line, target_line):
|
||||
retained = retained + 1
|
||||
print>>source_output_fh, source_line,
|
||||
print>>target_output_fh, target_line,
|
||||
if alignment_input_fh:
|
||||
print>>alignment_output_fh, align_line,
|
||||
line_no = line_no + 1
|
||||
# Check if this is a domain boundary.
|
||||
if domain_boundaries and line_no in domain_boundaries:
|
||||
print >>domain_output_fh, (
|
||||
"%d %s" % (retained, domain_boundaries[line_no]))
|
||||
log.info("Lines retained: %d", retained)
|
||||
|
||||
retained = 0
|
||||
line_no = 0
|
||||
for source_line in source_input_fh:
|
||||
target_line = target_input_fh.readline()
|
||||
if alignment_input_fh:
|
||||
align_line = alignment_input_fh.readline()
|
||||
if strategy.filter(source_line,target_line):
|
||||
retained = retained + 1
|
||||
print>>source_output_fh, source_line,
|
||||
print>>target_output_fh, target_line,
|
||||
if alignment_input_fh:
|
||||
print>>alignment_output_fh, align_line,
|
||||
line_no = line_no + 1
|
||||
# check if this is a domain boundary
|
||||
if domain_boundaries and domain_boundaries.has_key(line_no):
|
||||
print>>domain_output_fh,"%d %s" % (retained,domain_boundaries[line_no])
|
||||
log.info("Lines retained: %d" % retained)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
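For orientation, a sketch of a configuration file this filter script could read, using the section and option names referenced in the code above (the strategy name selects RandomFilterStrategy or ScoreFilterStrategy); the languages, paths, and values are invented:

# Illustrative config for the corpus filter; all values are invented.
[general]
source_language = fr
target_language = en
input_stem = corpus/train
output_stem = corpus/train.filtered
strategy = Score

[score]
score_file = corpus/train.scores
ignore_score = 99999
# Exactly one of threshold, proportion or count must be set.
proportion = 0.9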
|
||||
|
@ -2,73 +2,73 @@
|
||||
# compute BLEU scores with confidence intervals via bootstrap resampling
|
||||
# written by Ulrich Germann
|
||||
|
||||
import math,sys,os
|
||||
from argparse import ArgumentParser
|
||||
from operator import itemgetter
|
||||
from random import randint
|
||||
from operator import itemgetter
|
||||
import math
|
||||
import os
|
||||
from random import randint
|
||||
import sys
|
||||
|
||||
def count_ngrams(snt,max_n):
|
||||
|
||||
def count_ngrams(snt, max_n):
|
||||
"""
|
||||
Return a dictionary of ngram counts (up to length /max_n/)
|
||||
for sentence (list of words) /snt/.
|
||||
Return a dictionary of ngram counts (up to length /max_n/)
|
||||
for sentence (list of words) /snt/.
|
||||
"""
|
||||
ret = {}
|
||||
for i in xrange(len(snt)):
|
||||
for k in xrange(i+1,min(i+max_n+1,len(snt)+1)):
|
||||
for k in xrange(i + 1, min(i + max_n + 1, len(snt) + 1)):
|
||||
key = tuple(snt[i:k])
|
||||
ret[key] = ret.get(key,0) + 1
|
||||
pass
|
||||
pass
|
||||
ret[key] = ret.get(key, 0) + 1
|
||||
return ret
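A tiny worked example of count_ngrams on an invented three-word sentence:

# count_ngrams("the cat sat".split(), 2) returns
#   {('the',): 1, ('cat',): 1, ('sat',): 1, ('the', 'cat'): 1, ('cat', 'sat'): 1}
# i.e. every unigram and bigram occurs exactly once.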
|
||||
|
||||
def max_counts(ng1,ng2):
|
||||
|
||||
def max_counts(ng1, ng2):
|
||||
"""
|
||||
Return a dictionary of ngram counts such that
|
||||
Return a dictionary of ngram counts such that
|
||||
each count is the greater of the two individual counts
|
||||
for each ngram in the input ngram count dictionaries
|
||||
for each ngram in the input ngram count dictionaries
|
||||
/ng1/ and /ng2/.
|
||||
"""
|
||||
ret = ng1.copy()
|
||||
for k,v in ng2.items():
|
||||
ret[k] = max(ret.get(k,0),v)
|
||||
pass
|
||||
for k, v in ng2.items():
|
||||
ret[k] = max(ret.get(k, 0), v)
|
||||
return ret
|
||||
|
||||
def ng_hits(hyp,ref,max_n):
|
||||
|
||||
def ng_hits(hyp, ref, max_n):
|
||||
"""
|
||||
return a list of ngram counts such that each ngram count
|
||||
is the minimum of the counts in hyp and ref, up to ngram
|
||||
length /max_n/
|
||||
Return a list of ngram counts such that each ngram count
|
||||
is the minimum of the counts in hyp and ref, up to ngram
|
||||
length /max_n/.
|
||||
"""
|
||||
ret = [0 for i in xrange(max_n)]
|
||||
for ng,cnt in hyp.items():
|
||||
for ng, cnt in hyp.items():
|
||||
k = ng
|
||||
if len(k) <= max_n:
|
||||
ret[len(k)-1] += min(cnt,ref.get(ng,0))
|
||||
pass
|
||||
pass
|
||||
ret[len(k) - 1] += min(cnt, ref.get(ng, 0))
|
||||
return ret
|
||||
|
||||
|
||||
class BleuScore:
|
||||
def __init__(self,hyp,ref,max_n=4,bootstrap=1000):
|
||||
# print len(hyp.ngrams),len(ref.ngrams),"X"
|
||||
self.hits = [ng_hits(hyp.ngrams[i],ref.ngrams[i],max_n)
|
||||
for i in xrange(len(hyp.ngrams))]
|
||||
self.max_n = max_n
|
||||
self.hyp = hyp
|
||||
self.ref = ref
|
||||
self.lower = None
|
||||
self.upper = None
|
||||
def __init__(self, hyp, ref, max_n=4, bootstrap=1000):
|
||||
# print len(hyp.ngrams), len(ref.ngrams), "X"
|
||||
self.hits = [
|
||||
ng_hits(hyp.ngrams[i], ref.ngrams[i], max_n)
|
||||
for i in xrange(len(hyp.ngrams))]
|
||||
self.max_n = max_n
|
||||
self.hyp = hyp
|
||||
self.ref = ref
|
||||
self.lower = None
|
||||
self.upper = None
|
||||
self.median = None
|
||||
self.bootstrap = [self.score([randint(0,len(hyp.snt)-1) for s in hyp.snt])
|
||||
for i in xrange(1000)]
|
||||
self.bootstrap = [
|
||||
self.score([randint(0, len(hyp.snt) - 1) for s in hyp.snt])
|
||||
for i in xrange(1000)]
|
||||
self.bootstrap.sort()
|
||||
self.actual = self.score([i for i in xrange(len(hyp.snt))])
|
||||
return
|
||||
|
||||
def score(self,sample):
|
||||
hits = [0 for i in xrange(self.max_n)]
|
||||
|
||||
def score(self, sample):
|
||||
hits = [0 for i in xrange(self.max_n)]
|
||||
self.hyplen = 0
|
||||
self.reflen = 0
|
||||
for i in sample:
|
||||
@ -76,94 +76,89 @@ class BleuScore:
|
||||
self.reflen += len(self.ref.snt[i])
|
||||
for n in xrange(self.max_n):
|
||||
hits[n] += self.hits[i][n]
|
||||
pass
|
||||
pass
|
||||
self.prec = [float(hits[n])/(self.hyplen-n*len(sample))
|
||||
self.prec = [float(hits[n]) / (self.hyplen - n * len(sample))
|
||||
for n in xrange(self.max_n)]
|
||||
ret = sum([math.log(x) for x in self.prec])/self.max_n
|
||||
self.BP = min(1,math.exp(1.-float(self.reflen)/float(self.hyplen)))
|
||||
ret = sum([math.log(x) for x in self.prec]) / self.max_n
|
||||
self.BP = min(
|
||||
1, math.exp(1. - float(self.reflen) / float(self.hyplen)))
|
||||
ret += math.log(self.BP)
|
||||
return math.exp(ret)
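In formula form, the value returned by score() above is standard BLEU with a brevity penalty, where p_n are the modified n-gram precisions stored in self.prec:

BLEU = BP * exp( (1 / max_n) * sum_{n=1..max_n} log p_n )
BP   = min(1, exp(1 - reflen / hyplen))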
|
||||
|
||||
|
||||
|
||||
class Document:
|
||||
def __init__(self,fname=None):
|
||||
def __init__(self, fname=None):
|
||||
self.fname = fname
|
||||
if fname:
|
||||
self.snt = [line.strip().split() for line in open(fname)]
|
||||
self.ngrams = [count_ngrams(snt,4) for snt in self.snt]
|
||||
self.ngrams = [count_ngrams(snt, 4) for snt in self.snt]
|
||||
else:
|
||||
self.snt = None
|
||||
self.ngrams = None
|
||||
pass
|
||||
return
|
||||
|
||||
def merge(self,R):
|
||||
def merge(self, R):
|
||||
self.fname = "multi-ref"
|
||||
self.ngrams = [x for x in R[0].ngrams]
|
||||
self.snt = [x for x in R[0].snt]
|
||||
for i in xrange(len(R[0].ngrams)):
|
||||
for k in xrange(1,len(R)):
|
||||
self.ngrams[i] = max_counts(self.ngrams[i],R[k].ngrams[i])
|
||||
pass
|
||||
pass
|
||||
return
|
||||
for k in xrange(1, len(R)):
|
||||
self.ngrams[i] = max_counts(self.ngrams[i], R[k].ngrams[i])
|
||||
|
||||
def update(self,hyp,R):
|
||||
for i in xrange(len(hyp.snt)):
|
||||
clen = len(hyp.snt[i])
|
||||
def update(self, hyp, R):
|
||||
for i, hyp_snt in enumerate(hyp.snt):
|
||||
clen = len(hyp_snt)
|
||||
K = 0
|
||||
for k in xrange(1,len(R)):
|
||||
assert len(R[k].snt) == len(hyp.snt),\
|
||||
"Mismatch in numer of sentences " +\
|
||||
"between reference and candidate"
|
||||
if abs(len(R[k].snt[i]) - clen) == abs(len(R[K].snt[i]) - clen):
|
||||
if len(R[k].snt[i]) < len(R[K].snt[i]):
|
||||
for k in xrange(1, len(R)):
|
||||
k_snt = R[k].snt[i]
|
||||
assert len(R[k].snt) == len(hyp.snt), (
|
||||
"Mismatch in number of sentences " +
|
||||
"between reference and candidate")
|
||||
if abs(len(k_snt) - clen) == abs(len(R[K].snt[i]) - clen):
|
||||
if len(k_snt) < len(R[K].snt[i]):
|
||||
K = k
|
||||
pass
|
||||
pass
|
||||
elif abs(len(R[k].snt[i]) - clen) < abs(len(R[K].snt[i]) - clen):
|
||||
elif abs(len(k_snt) - clen) < abs(len(R[K].snt[i]) - clen):
|
||||
K = k
|
||||
pass
|
||||
pass
|
||||
self.snt[i] = R[K].snt[i]
|
||||
pass
|
||||
return
|
||||
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
argparser = ArgumentParser()
|
||||
argparser.add_argument("-r","--ref",nargs='+',help="reference translation(s)")
|
||||
argparser.add_argument("-c","--cand",nargs='+',help="candidate translations")
|
||||
argparser.add_argument("-i","--individual",action='store_true',
|
||||
help="compute BLEU scores for individual references")
|
||||
argparser.add_argument("-b","--bootstrap",type=int,default=1000,
|
||||
help="sample size for bootstrap resampling")
|
||||
argparser.add_argument("-a","--alpha",help="1-alpha = confidence interval",type=float,default=.05)
|
||||
argparser.add_argument(
|
||||
"-r", "--ref", nargs='+', help="Reference translation(s).")
|
||||
argparser.add_argument(
|
||||
"-c", "--cand", nargs='+', help="Candidate translations.")
|
||||
argparser.add_argument(
|
||||
"-i", "--individual", action='store_true',
|
||||
help="Compute BLEU scores for individual references.")
|
||||
argparser.add_argument(
|
||||
"-b", "--bootstrap", type=int, default=1000,
|
||||
help="Sample size for bootstrap resampling.")
|
||||
argparser.add_argument(
|
||||
"-a", "--alpha", type=float, default=.05,
|
||||
help="1-alpha = confidence interval.")
|
||||
args = argparser.parse_args(sys.argv[1:])
|
||||
R = [ Document(fname) for fname in args.ref]
|
||||
C = [ Document(fname) for fname in args.cand]
|
||||
Rx = Document() # for multi-reference BLEU
|
||||
R = [Document(fname) for fname in args.ref]
|
||||
C = [Document(fname) for fname in args.cand]
|
||||
Rx = Document() # for multi-reference BLEU
|
||||
Rx.merge(R)
|
||||
for c in C:
|
||||
# compute multi-reference BLEU
|
||||
Rx.update(c,R)
|
||||
bleu = BleuScore(c,Rx,bootstrap=args.bootstrap)
|
||||
print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s"%\
|
||||
(100*bleu.actual,
|
||||
os.path.basename(Rx.fname),
|
||||
100*bleu.bootstrap[int((args.alpha/2)*args.bootstrap)],
|
||||
100*bleu.bootstrap[int((1-(args.alpha/2))*args.bootstrap)],
|
||||
100*bleu.bootstrap[int(.5*args.bootstrap)],
|
||||
c.fname) # os.path.basename(c.fname))
|
||||
Rx.update(c, R)
|
||||
bleu = BleuScore(c, Rx, bootstrap=args.bootstrap)
|
||||
print "%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % (
|
||||
100 * bleu.actual,
|
||||
os.path.basename(Rx.fname),
|
||||
100 * bleu.bootstrap[int((args.alpha / 2) * args.bootstrap)],
|
||||
100 * bleu.bootstrap[int((1 - (args.alpha / 2)) * args.bootstrap)],
|
||||
100 * bleu.bootstrap[int(.5 * args.bootstrap)],
|
||||
c.fname) # os.path.basename(c.fname))
|
||||
|
||||
if args.individual:
|
||||
for r in R:
|
||||
bleu = BleuScore(c,r,bootstrap=args.bootstrap)
|
||||
print " %5.2f %s"%(100*bleu.actual,os.path.basename(r.fname))
|
||||
# print bleu.prec,bleu.hyplen,bleu.reflen,bleu.BP
|
||||
pass
|
||||
pass
|
||||
bleu = BleuScore(c, r, bootstrap=args.bootstrap)
|
||||
print " %5.2f %s" % (
|
||||
100 * bleu.actual, os.path.basename(r.fname))
|
||||
# print bleu.prec, bleu.hyplen, bleu.reflen, bleu.BP
|
||||
|
||||
# print [sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))]) for n in xrange(4)]
|
||||
pass
|
||||
# print [
|
||||
# sum([bleu.hits[i][n] for i in xrange(len(bleu.hits))])
|
||||
# for n in xrange(4)]
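As a sketch of how the confidence interval is read off the sorted bootstrap scores above (the numbers generated here are random stand-ins, not real BLEU values):

# Percentile read-off from sorted bootstrap scores (Python 2 sketch).
import random

random.seed(0)
bootstrap = sorted(random.gauss(0.30, 0.01) for _ in xrange(1000))
alpha = 0.05
lower = bootstrap[int((alpha / 2) * len(bootstrap))]          # 2.5th percentile
upper = bootstrap[int((1 - (alpha / 2)) * len(bootstrap))]    # 97.5th percentile
median = bootstrap[int(.5 * len(bootstrap))]
print "%5.2f [%5.2f-%5.2f]" % (100 * median, 100 * lower, 100 * upper)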
|
||||
|
@ -1,237 +1,225 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Python utilities for moses
|
||||
#
|
||||
# This package mostly wraps standard Moses utilities into pipes.
|
||||
#
|
||||
# Written by Ulrich Germann
|
||||
#
|
||||
# This package borrows from scripts written by Christian Buck
|
||||
#
|
||||
# The package assumes that there is a complete moses installation
|
||||
# (including scripts) under one root directory,
|
||||
# e.g., via
|
||||
# bjam --with-xmlrpc-c=... [...] --install-scripts --prefix=${HOME}/moses
|
||||
# By default, this root directory is "${HOME}/moses".
|
||||
"""
|
||||
Python utilities for moses
|
||||
|
||||
This package mostly wraps standard Moses utilities into pipes.
|
||||
|
||||
Written by Ulrich Germann
|
||||
|
||||
This package borrows from scripts written by Christian Buck
|
||||
|
||||
The package assumes that there is a complete moses installation
|
||||
(including scripts) under one root directory,
|
||||
e.g., via ::
|
||||
bjam --with-xmlrpc-c=... [...] --install-scripts --prefix=${HOME}/moses
|
||||
By default, this root directory is "${HOME}/moses".
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import xmlrpclib
|
||||
from subprocess import (
|
||||
PIPE,
|
||||
Popen,
|
||||
)
|
||||
|
||||
|
||||
moses_root = os.environ.get('MOSES_ROOT', os.environ.get('HOME') + "/moses")
|
||||
|
||||
import xmlrpclib,datetime,argparse,time,os,sys
|
||||
from subprocess import *
|
||||
from unicodedata import normalize
|
||||
|
||||
moses_root = os.environ.get('MOSES_ROOT',os.environ.get('HOME')+"/moses")
|
||||
|
||||
class ProcessWrapper:
|
||||
|
||||
def __init__(self,cmd=[]):
|
||||
self.process = None
|
||||
self.cmd = cmd
|
||||
return
|
||||
def __init__(self, cmd=[]):
|
||||
self.process = None
|
||||
self.cmd = cmd
|
||||
|
||||
def start(self, stdin=PIPE, stdout=PIPE):
|
||||
if self.process:
|
||||
raise Exception("Process is already running")
|
||||
self.process = Popen(self.cmd, stdin = stdin, stdout = stdout)
|
||||
return
|
||||
def start(self, stdin=PIPE, stdout=PIPE):
|
||||
if self.process:
|
||||
raise Exception("Process is already running")
|
||||
self.process = Popen(self.cmd, stdin=stdin, stdout=stdout)
|
||||
|
||||
def __del__(self):
|
||||
if self.process:
|
||||
self.process.terminate()
|
||||
|
||||
def __del__(self):
|
||||
if self.process:
|
||||
self.process.terminate()
|
||||
pass
|
||||
return
|
||||
pass
|
||||
|
||||
class LineProcessor(ProcessWrapper):
|
||||
|
||||
def __call__(self,input):
|
||||
if not self.process: self.start()
|
||||
self.process.stdin.write("%s\n"%input.strip())
|
||||
self.process.stdin.flush()
|
||||
return self.process.stdout.readline().strip()
|
||||
pass
|
||||
def __call__(self, input):
|
||||
if not self.process:
|
||||
self.start()
|
||||
self.process.stdin.write("%s\n" % input.strip())
|
||||
self.process.stdin.flush()
|
||||
return self.process.stdout.readline().strip()
|
||||
|
||||
|
||||
class SentenceSplitter(ProcessWrapper):
|
||||
"""
|
||||
Wrapper for standard Moses sentence splitter
|
||||
"""
|
||||
def __init__(self,lang):
|
||||
ssplit_cmd = moses_root+"/scripts/ems/support/split-sentences.perl"
|
||||
self.cmd = [ssplit_cmd, "-b", "-q", "-l",lang]
|
||||
self.process = None
|
||||
return
|
||||
"""Wrapper for standard Moses sentence splitter."""
|
||||
|
||||
def __init__(self, lang):
|
||||
ssplit_cmd = moses_root + "/scripts/ems/support/split-sentences.perl"
|
||||
self.cmd = [ssplit_cmd, "-b", "-q", "-l", lang]
|
||||
self.process = None
|
||||
|
||||
def __call__(self, input):
|
||||
if not self.process:
|
||||
self.start()
|
||||
self.process.stdin.write(input.strip() + "\n<P>\n")
|
||||
self.process.stdin.flush()
|
||||
x = self.process.stdout.readline().strip()
|
||||
ret = []
|
||||
while x != '<P>' and x != '':
|
||||
ret.append(x)
|
||||
x = self.process.stdout.readline().strip()
|
||||
return ret
|
||||
|
||||
def __call__(self,input):
|
||||
if not self.process:
|
||||
self.start()
|
||||
pass
|
||||
self.process.stdin.write(input.strip() + "\n<P>\n")
|
||||
self.process.stdin.flush()
|
||||
x = self.process.stdout.readline().strip()
|
||||
ret = []
|
||||
while x != '<P>' and x != '':
|
||||
ret.append(x)
|
||||
x = self.process.stdout.readline().strip()
|
||||
pass
|
||||
return ret
|
||||
|
||||
class Pretokenizer(LineProcessor):
|
||||
"""
|
||||
Pretokenizer wrapper; the pretokenizer fixes known issues with the input.
|
||||
"""
|
||||
def __init__(self,lang):
|
||||
pretok_cmd = moses_root+"/scripts/tokenizer/pre-tokenizer.perl"
|
||||
self.cmd = [pretok_cmd,"-b", "-q", "-l",lang]
|
||||
self.process = None
|
||||
return
|
||||
pass
|
||||
"""Pretokenizer wrapper.
|
||||
|
||||
The pretokenizer fixes known issues with the input.
|
||||
"""
|
||||
def __init__(self, lang):
|
||||
pretok_cmd = moses_root + "/scripts/tokenizer/pre-tokenizer.perl"
|
||||
self.cmd = [pretok_cmd, "-b", "-q", "-l", lang]
|
||||
self.process = None
|
||||
|
||||
|
||||
class Tokenizer(LineProcessor):
|
||||
"""
|
||||
Tokenizer wrapper; the pretokenizer fixes known issues with the input.
|
||||
"""
|
||||
def __init__(self,lang,args=["-a","-no-escape"]):
|
||||
tok_cmd = moses_root+"/scripts/tokenizer/tokenizer.perl"
|
||||
self.cmd = [tok_cmd,"-b", "-q", "-l", lang] + args
|
||||
self.process = None
|
||||
return
|
||||
|
||||
"""Tokenizer wrapper.
|
||||
|
||||
The pretokenizer fixes known issues with the input.
|
||||
"""
|
||||
def __init__(self, lang, args=["-a", "-no-escape"]):
|
||||
tok_cmd = moses_root + "/scripts/tokenizer/tokenizer.perl"
|
||||
self.cmd = [tok_cmd, "-b", "-q", "-l", lang] + args
|
||||
self.process = None
|
||||
|
||||
|
||||
class Truecaser(LineProcessor):
|
||||
"""
|
||||
Truecaser wrapper.
|
||||
"""
|
||||
def __init__(self,model):
|
||||
truecase_cmd = moses_root+"/scripts/recaser/truecase.perl"
|
||||
self.cmd = [truecase_cmd,"-b", "--model",model]
|
||||
self.process = None
|
||||
return
|
||||
pass
|
||||
"""Truecaser wrapper."""
|
||||
def __init__(self, model):
|
||||
truecase_cmd = moses_root + "/scripts/recaser/truecase.perl"
|
||||
self.cmd = [truecase_cmd, "-b", "--model", model]
|
||||
self.process = None
|
||||
|
||||
|
||||
class LineProcessorPipeline:
|
||||
"""
|
||||
Line processor: one line in, one line out
|
||||
"""
|
||||
def __init__(self,parts=[]):
|
||||
self.chain = [LineProcessor(p.cmd) for p in parts]
|
||||
return
|
||||
|
||||
def start(self):
|
||||
if len(self.chain) == 0:
|
||||
return
|
||||
if self.chain[0].process:
|
||||
return
|
||||
self.chain[0].start()
|
||||
for i in xrange(1,len(self.chain)):
|
||||
self.chain[i].start(stdin = self.chain[i-1].process.stdout)
|
||||
pass
|
||||
return
|
||||
"""Line processor: one line in, one line out."""
|
||||
def __init__(self, parts=[]):
|
||||
self.chain = [LineProcessor(p.cmd) for p in parts]
|
||||
|
||||
def __call__(self,input):
|
||||
if len(self.chain) == 0:
|
||||
return input
|
||||
self.start()
|
||||
self.chain[0].process.stdin.write("%s\n"%input.strip())
|
||||
self.chain[0].process.stdin.flush()
|
||||
return self.chain[0].process.stdout.readline().strip()
|
||||
def start(self):
|
||||
if len(self.chain) == 0:
|
||||
return
|
||||
if self.chain[0].process:
|
||||
return
|
||||
self.chain[0].start()
|
||||
for i in xrange(1, len(self.chain)):
|
||||
self.chain[i].start(stdin=self.chain[i - 1].process.stdout)
|
||||
|
||||
def __call__(self, input):
|
||||
if len(self.chain) == 0:
|
||||
return input
|
||||
self.start()
|
||||
self.chain[0].process.stdin.write("%s\n" % input.strip())
|
||||
self.chain[0].process.stdin.flush()
|
||||
return self.chain[0].process.stdout.readline().strip()
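As a usage sketch of the wrappers above (assumes a complete Moses installation under MOSES_ROOT; the truecasing model path is invented):

# Usage sketch, Python 2; requires the Moses scripts under MOSES_ROOT.
from moses import SentenceSplitter, Tokenizer, Truecaser, LineProcessorPipeline

split = SentenceSplitter("en")
pipeline = LineProcessorPipeline(
    [Tokenizer("en"), Truecaser("/path/to/truecase-model.en")])  # invented path
for sentence in split("This is one sentence. Here is another."):
    print pipeline(sentence)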
|
||||
|
||||
pass
|
||||
|
||||
def find_free_port(p):
|
||||
"""
|
||||
Find a free port, starting at /p/.
|
||||
Return the free port, or False if none found.
|
||||
"""
|
||||
ret = p
|
||||
while ret - p < 20:
|
||||
devnull = open(os.devnull,"w")
|
||||
n = Popen(["netstat","-tnp"],stdout=PIPE,stderr=devnull)
|
||||
if n.communicate()[0].find(":%d "%ret) < 0:
|
||||
return p
|
||||
ret += 1
|
||||
pass
|
||||
return False
|
||||
"""Find a free port, starting at /p/.
|
||||
|
||||
:return: The free port, or False if none found.
|
||||
"""
|
||||
ret = p
|
||||
while ret - p < 20:
|
||||
devnull = open(os.devnull, "w")
|
||||
n = Popen(["netstat", "-tnp"], stdout=PIPE, stderr=devnull)
|
||||
if n.communicate()[0].find(":%d " % ret) < 0:
|
||||
return p
|
||||
ret += 1
|
||||
return False
|
||||
|
||||
|
||||
class MosesServer(ProcessWrapper):
|
||||
|
||||
def __init__(self,args=[]):
|
||||
self.process = None
|
||||
mserver_cmd = moses_root+"/bin/mosesserver"
|
||||
self.cmd = [mserver_cmd] + args
|
||||
self.url = None
|
||||
self.proxy = None
|
||||
return
|
||||
|
||||
def start(self,config=None,args=[],port=7447,debug=False):
|
||||
self.cmd.extend(args)
|
||||
if config:
|
||||
if "-f" in args:
|
||||
raise Exception("Config file specified twice")
|
||||
else:
|
||||
self.cmd.extend(["-f",config])
|
||||
pass
|
||||
pass
|
||||
self.port = port # find_free_port(port)
|
||||
if not self.port:
|
||||
raise Excpetion("Cannot find free port for moses server!")
|
||||
self.cmd.extend(["--server-port", "%d"%self.port])
|
||||
if debug:
|
||||
print >>sys.stderr,self.cmd
|
||||
# self.stderr = open("mserver.%d.stderr"%self.port,'w')
|
||||
# self.stdout = open("mserver.%d.stdout"%self.port,'w')
|
||||
# self.process = Popen(self.cmd,stderr = self.stderr,stdout = self.stdout)
|
||||
self.process = Popen(self.cmd)
|
||||
else:
|
||||
devnull = open(os.devnull,"w")
|
||||
self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
|
||||
pass
|
||||
|
||||
if self.process.poll():
|
||||
raise Exception("FATAL ERROR: Could not launch moses server!")
|
||||
if debug:
|
||||
print >>sys.stderr,"MOSES port is %d."%self.port
|
||||
print >>sys.stderr,"Moses poll status is", self.process.poll()
|
||||
pass
|
||||
|
||||
self.url = "http://localhost:%d/RPC2"%self.port
|
||||
self.connect(self.url)
|
||||
|
||||
return True
|
||||
|
||||
def connect(self,url):
|
||||
if url[:4] != "http": url = "http://%s"%url
|
||||
if url[-5:] != "/RPC2": url += "/RPC2"
|
||||
self.url = url
|
||||
self.proxy = xmlrpclib.ServerProxy(self.url)
|
||||
return
|
||||
|
||||
def translate(self,input):
|
||||
attempts = 0
|
||||
while attempts < 100:
|
||||
try:
|
||||
if type(input) is unicode:
|
||||
# if the server does not expect unicode, provide a
|
||||
# properly encoded string!
|
||||
param = {'text': input.strip().encode('utf8')}
|
||||
return self.proxy.translate(param)['text'].decode('utf8')
|
||||
|
||||
elif type(input) is str:
|
||||
param = {'text': input.strip()}
|
||||
return self.proxy.translate(param)['text']
|
||||
|
||||
elif type(input) is list:
|
||||
return [self.translate(x) for x in input]
|
||||
|
||||
elif type(input) is dict:
|
||||
return self.proxy.translate(input)
|
||||
def __init__(self, args=[]):
|
||||
self.process = None
|
||||
mserver_cmd = moses_root + "/bin/mosesserver"
|
||||
self.cmd = [mserver_cmd] + args
|
||||
self.url = None
|
||||
self.proxy = None
|
||||
|
||||
def start(self, config=None, args=[], port=7447, debug=False):
|
||||
self.cmd.extend(args)
|
||||
if config:
|
||||
if "-f" in args:
|
||||
raise Exception("Config file specified twice")
|
||||
else:
|
||||
self.cmd.extend(["-f", config])
|
||||
self.port = port # find_free_port(port)
|
||||
if not self.port:
|
||||
raise Exception("Cannot find free port for moses server!")
|
||||
self.cmd.extend(["--server-port", "%d" % self.port])
|
||||
if debug:
|
||||
print >>sys.stderr, self.cmd
|
||||
# self.stderr = open("mserver.%d.stderr"%self.port,'w')
|
||||
# self.stdout = open("mserver.%d.stdout"%self.port,'w')
|
||||
# self.process = Popen(
|
||||
# self.cmd, stderr=self.stderr, stdout=self.stdout)
|
||||
self.process = Popen(self.cmd)
|
||||
else:
|
||||
raise Exception("Can't handle input of this type!")
|
||||
devnull = open(os.devnull, "w")
|
||||
self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
|
||||
|
||||
except:
|
||||
attempts += 1
|
||||
print >>sys.stderr, "WAITING", attempts
|
||||
time.sleep(1)
|
||||
pass
|
||||
pass
|
||||
raise Exception("Translation request failed")
|
||||
pass
|
||||
if self.process.poll():
|
||||
raise Exception("FATAL ERROR: Could not launch moses server!")
|
||||
if debug:
|
||||
print >>sys.stderr, "MOSES port is %d." % self.port
|
||||
print >>sys.stderr, "Moses poll status is", self.process.poll()
|
||||
|
||||
self.url = "http://localhost:%d/RPC2" % self.port
|
||||
self.connect(self.url)
|
||||
|
||||
return True
|
||||
|
||||
def connect(self, url):
|
||||
if url[:4] != "http":
|
||||
url = "http://%s" % url
|
||||
if url[-5:] != "/RPC2":
|
||||
url += "/RPC2"
|
||||
self.url = url
|
||||
self.proxy = xmlrpclib.ServerProxy(self.url)
|
||||
|
||||
def translate(self, input):
|
||||
attempts = 0
|
||||
while attempts < 100:
|
||||
try:
|
||||
if type(input) is unicode:
|
||||
# If the server does not expect unicode, provide a
|
||||
# properly encoded string!
|
||||
param = {'text': input.strip().encode('utf8')}
|
||||
return self.proxy.translate(param)['text'].decode('utf8')
|
||||
|
||||
elif type(input) is str:
|
||||
param = {'text': input.strip()}
|
||||
return self.proxy.translate(param)['text']
|
||||
|
||||
elif type(input) is list:
|
||||
return [self.translate(x) for x in input]
|
||||
|
||||
elif type(input) is dict:
|
||||
return self.proxy.translate(input)
|
||||
|
||||
else:
|
||||
raise Exception("Can't handle input of this type!")
|
||||
|
||||
except:
|
||||
attempts += 1
|
||||
print >>sys.stderr, "WAITING", attempts
|
||||
time.sleep(1)
|
||||
raise Exception("Translation request failed")
|
||||
|
@ -5,29 +5,39 @@
|
||||
# This script simulates post-editing of MT output and incrementally
|
||||
# updates the dynamic phrase tables in the moses server.
|
||||
|
||||
import xmlrpclib,datetime,argparse,sys,os,time
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import xmlrpclib
|
||||
import moses
|
||||
from moses import MosesServer
|
||||
from subprocess import *
|
||||
from subprocess import (
|
||||
PIPE,
|
||||
Popen,
|
||||
)
|
||||
|
||||
|
||||
mserver = moses.MosesServer()
|
||||
|
||||
# We must perform some custom argument processing, as moses parameter
|
||||
# specifications do not comply with the conventions assumed by standard
|
||||
# argument-parsing packages; an isolated double dash separates script
|
||||
# arguments from moses arguments.
|
||||
|
||||
|
||||
def split_args(all_args):
|
||||
"""
|
||||
Split argument list all_args into arguments specific to this script and
|
||||
arguments relating to the moses server. An isolated double dash acts as
|
||||
the separator between the two types of arguments.
|
||||
arguments relating to the moses server. An isolated double dash acts as
|
||||
the separator between the two types of arguments.
|
||||
"""
|
||||
my_args = []
|
||||
mo_args = []
|
||||
arglist = mo_args
|
||||
i = 0
|
||||
# IMPORTANT: the code below must be coordinated with
|
||||
# IMPORTANT: the code below must be coordinated with
|
||||
# - the evolution of moses command line arguments
|
||||
# - mert-moses.pl
|
||||
# - mert-moses.pl
|
||||
while i < len(all_args):
|
||||
# print i,"MY_ARGS", my_args
|
||||
# print i,"MO_ARGS", mo_args
|
||||
@ -36,14 +46,16 @@ def split_args(all_args):
|
||||
elif all_args[i] == "--]":
|
||||
arglist = mo_args
|
||||
elif all_args[i] == "-i" or all_args[i] == "-input-file":
|
||||
my_args.extend(["-i",all_args[i+1]])
|
||||
my_args.extend(["-i", all_args[i + 1]])
|
||||
i += 1
|
||||
elif all_args[i] == "-inputtype":
|
||||
if all_args[i+1] != "0":
|
||||
# not yet supported! Therefore:
|
||||
errmsg = "FATAL ERROR: %s "%sys.argv[0]
|
||||
errmsg += "only supports plain text input at this point."
|
||||
raise Exception(errsmg)
|
||||
if all_args[i + 1] != "0":
|
||||
# Not yet supported! Therefore:
|
||||
errmsg = (
|
||||
"FATAL ERROR: "
|
||||
"%s only supports plain text input at this point."
|
||||
% sys.argv[0])
|
||||
raise Exception(errmsg)
|
||||
# my_args.extend(["--input-type",all_args[i+1]])
|
||||
i += 1
|
||||
elif all_args[i] == "-lattice-samples":
|
||||
@ -52,13 +64,14 @@ def split_args(all_args):
|
||||
# mo_args[i:i+3] = []
|
||||
# i += 2
|
||||
# This is not yet supported! Therefore:
|
||||
errmsg = "FATAL ERROR: %s "%sys.argv[0]
|
||||
errmsg += "does not yet support lattice sampling."
|
||||
raise Exception(errsmg)
|
||||
|
||||
errmsg = (
|
||||
"FATAL ERROR: %s does not yet support lattice sampling."
|
||||
% sys.argv[0])
|
||||
raise Exception(errmsg)
|
||||
|
||||
elif all_args[i] == "-n-best-list":
|
||||
my_args.extend(["--nbest",all_args[i+2]])
|
||||
my_args.extend(["--nbest-file",all_args[i+1]])
|
||||
my_args.extend(["--nbest", all_args[i + 2]])
|
||||
my_args.extend(["--nbest-file", all_args[i + 1]])
|
||||
i += 2
|
||||
|
||||
elif all_args[i] == "-n-best-distinct":
|
||||
@ -70,128 +83,148 @@ def split_args(all_args):
|
||||
|
||||
i += 1
|
||||
pass
|
||||
return my_args,mo_args
|
||||
|
||||
return my_args, mo_args
|
||||
|
||||
|
||||
def interpret_args(my_args):
|
||||
"""
|
||||
Parse script-specific argument list.
|
||||
"""
|
||||
aparser = argparse.ArgumentParser()
|
||||
|
||||
aparser.add_argument("-s","--server-cmd",default="mosesserver",
|
||||
dest="servercmd", help="path to moses server command")
|
||||
aparser.add_argument("--url",help="URL of external moses server.")
|
||||
aparser.add_argument("-p","--port", type=int, default=7447,
|
||||
help="port number to be used for server")
|
||||
|
||||
# input / output
|
||||
aparser.add_argument("-i","--input",help="source file",default="-")
|
||||
aparser.add_argument("-r","--ref",help="reference translation",default=None)
|
||||
aparser.add_argument("-a","--aln",help="alignment",default=None)
|
||||
aparser.add_argument("-o","--output",default="-",help="output file")
|
||||
aparser.add_argument("-d","--debug",action="store_true",help="debug mode")
|
||||
|
||||
# moses reporting options
|
||||
aparser.add_argument("-A","--with-alignment", dest="A",
|
||||
help="include alignment in output", action="store_true")
|
||||
aparser.add_argument("-G","--with-graph",type=bool, default=False, dest="G",
|
||||
help="include search graph info in output")
|
||||
aparser.add_argument("-T","--with-transopt",type=bool, default=False, dest = "T",
|
||||
help="include translation options info in output")
|
||||
aparser.add_argument("-F","--report-all-factors", action="store_true",dest="F",
|
||||
help="report all factors")
|
||||
aparser.add_argument("-n","--nbest",type=int,dest="nbest",default=0,
|
||||
help="size of nbest list")
|
||||
aparser.add_argument("-N","--nbest-file",dest="nbestFile",default=0,
|
||||
help="output file for nbest list")
|
||||
aparser.add_argument("-u","--nbest-distinct",type=bool,dest="U",default=False,
|
||||
help="report all factors")
|
||||
aparser.add_argument(
|
||||
"-s", "--server-cmd", default="mosesserver", dest="servercmd",
|
||||
help="Path to moses server command.")
|
||||
aparser.add_argument(
|
||||
"--url", help="URL of external moses server.")
|
||||
aparser.add_argument(
|
||||
"-p", "--port", type=int, default=7447,
|
||||
help="Port number to be used for server.")
|
||||
|
||||
# Input / output.
|
||||
aparser.add_argument(
|
||||
"-i", "--input", default='-', help="source file")
|
||||
aparser.add_argument(
|
||||
"-r", "--ref", default=None, help="Reference translation.")
|
||||
aparser.add_argument(
|
||||
"-a", "--aln", default=None, help="Alignment.")
|
||||
aparser.add_argument(
|
||||
"-o", "--output", default="-", help="Output file.")
|
||||
aparser.add_argument(
|
||||
"-d", "--debug", action='store_true', help="Debug mode.")
|
||||
|
||||
# Moses reporting options.
|
||||
aparser.add_argument(
|
||||
"-A", "--with-alignment", dest="A", action='store_true',
|
||||
help="Include alignment in output.")
|
||||
aparser.add_argument(
|
||||
"-G", "--with-graph", type=bool, default=False, dest="G",
|
||||
help="Include search graph info in output.")
|
||||
aparser.add_argument(
|
||||
"-T", "--with-transopt", type=bool, default=False, dest="T",
|
||||
help="Include translation options info in output.")
|
||||
aparser.add_argument(
|
||||
"-F", "--report-all-factors", action="store_true", dest="F",
|
||||
help="Report all factors.")
|
||||
aparser.add_argument(
|
||||
"-n", "--nbest", type=int, dest="nbest", default=0,
|
||||
help="Size of nbest list.")
|
||||
aparser.add_argument(
|
||||
"-N", "--nbest-file", dest="nbestFile", default=0,
|
||||
help="Output file for nbest list.")
|
||||
aparser.add_argument(
|
||||
"-u", "--nbest-distinct", type=bool, dest="U", default=False,
|
||||
help="Report all factors.")
|
||||
|
||||
return aparser.parse_args(my_args)
|
||||
|
||||
|
||||
|
||||
def translate(proxy, args, line):
|
||||
if type(line) is unicode:
|
||||
param = { 'text' : line.strip().encode('utf8') }
|
||||
param = {'text': line.strip().encode('utf8')}
|
||||
elif type(line) is str:
|
||||
param = { 'text' : line.strip() }
|
||||
param = {'text': line.strip()}
|
||||
else:
|
||||
raise Exception("Can't handle input")
|
||||
if args.A: param['align'] = True
|
||||
if args.T: param['topt'] = True
|
||||
if args.F: param['report-all-factors'] = True
|
||||
if args.nbest:
|
||||
if args.A:
|
||||
param['align'] = True
|
||||
if args.T:
|
||||
param['topt'] = True
|
||||
if args.F:
|
||||
param['report-all-factors'] = True
|
||||
if args.nbest:
|
||||
param['nbest'] = int(args.nbest)
|
||||
param['add-score-breakdown'] = True
|
||||
pass
|
||||
if args.U:
|
||||
if args.U:
|
||||
param['nbest-distinct'] = True
|
||||
pass
|
||||
attempts = 0
|
||||
while attempts < 20:
|
||||
t1 = time.time()
|
||||
try:
|
||||
return proxy.translate(param)
|
||||
return proxy.translate(param)
|
||||
|
||||
# except xmlrpclib.Fault as e:
|
||||
# except xmlrpclib.ProtocolError as e:
|
||||
# except xmlrpclib.ResponseError as e:
|
||||
except xmlrpclib.Error as e:
|
||||
time.sleep(2) # give all the stderr stuff a chance to be flushed
|
||||
print >>sys.stderr," XMLRPC error:",e
|
||||
sys.stderr.flush()
|
||||
print >>sys.stderr, " XMLRPC error:", e
|
||||
print >>sys.stderr, "Input was"
|
||||
print >>sys.stderr, param
|
||||
sys.exit(1)
|
||||
|
||||
except IOError as e:
|
||||
print >>sys.stderr,"I/O error({0}): {1}".format(e.errno, e.strerror)
|
||||
print >>sys.stderr, (
|
||||
"I/O error({0}): {1}".format(e.errno, e.strerror))
|
||||
time.sleep(5)
|
||||
|
||||
except:
|
||||
serverstatus = mserver.process.poll()
|
||||
if serverstatus == None:
|
||||
print >>sys.stderr, "Connection failed after %f seconds"%(time.time()-t1)
|
||||
if serverstatus is None:
|
||||
print >>sys.stderr, (
|
||||
"Connection failed after %f seconds" % (time.time() - t1))
|
||||
attempts += 1
|
||||
if attempts > 10:
|
||||
time.sleep(10)
|
||||
else:
|
||||
time.sleep(5)
|
||||
pass
|
||||
else:
|
||||
|
||||
print >>sys.stderr, "Oopsidaisy, server exited with code %d (signal %d)"\
|
||||
%(serverstatus/256,serverstatus%256)
|
||||
print >>sys.stderr, (
|
||||
"Oopsidaisy, server exited with code %d (signal %d)"
|
||||
% (serverstatus / 256, serverstatus % 256))
|
||||
pass
|
||||
pass
|
||||
pass
|
||||
raise Exception("Exception: could not reach translation server.")
|
||||
|
||||
|
||||
|
||||
def read_data(fname):
|
||||
"""
|
||||
Read and return data (source, target or alignment) from file fname.
|
||||
"""
|
||||
if fname[-3:] == ".gz":
|
||||
foo = Popen(["zcat",fname],stdout=PIPE)\
|
||||
.communicate()[0]\
|
||||
.strip().split('\n')
|
||||
process = Popen(["zcat", fname], stdout=PIPE)
|
||||
stdout, _ = process.communicate()
|
||||
foo = stdout.strip().split('\n')
|
||||
else:
|
||||
foo = [x.strip() for x in open(fname).readlines()]
|
||||
pass
|
||||
return foo
|
||||
|
||||
def repack_result(idx,result):
|
||||
|
||||
def repack_result(idx, result):
|
||||
global args
|
||||
if args.nbest:
|
||||
for h in result['nbest']:
|
||||
fields = [idx,h['hyp'],h['fvals'],h['totalScore']]
|
||||
fields = [idx, h['hyp'], h['fvals'], h['totalScore']]
|
||||
for i in xrange(len(fields)):
|
||||
if type(fields[i]) is unicode:
|
||||
fields[i] = fields[i].encode('utf-8')
|
||||
pass
|
||||
pass
|
||||
# print fields
|
||||
print >>NBestFile,"%d ||| %s ||| %s ||| %f"%tuple(fields)
|
||||
pass
|
||||
# Print fields.
|
||||
print >>NBestFile, "%d ||| %s ||| %s ||| %f" % tuple(fields)
|
||||
pass
|
||||
if 'align' in result:
|
||||
t = result['text'].split()
|
||||
@ -200,16 +233,14 @@ def repack_result(idx,result):
|
||||
k = 0
|
||||
for a in result['align']:
|
||||
k = a['tgt-start']
|
||||
if k: print " ".join(t[i:k]).encode('utf8'),span,
|
||||
if k:
|
||||
print " ".join(t[i:k]).encode('utf8'), span,
|
||||
i = k
|
||||
span = "|%d %d|"%(a['src-start'],a['src-end'])
|
||||
pass
|
||||
print " ".join(t[k:]).encode('utf8'),span
|
||||
pass
|
||||
span = "|%d %d|" % (a['src-start'], a['src-end'])
|
||||
print " ".join(t[k:]).encode('utf8'), span
|
||||
else:
|
||||
print result['text'].encode('utf8')
|
||||
pass
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
my_args, mo_args = split_args(sys.argv[1:])
|
||||
@ -221,17 +252,17 @@ if __name__ == "__main__":
|
||||
args = interpret_args(my_args)
|
||||
|
||||
if "-show-weights" in mo_args:
|
||||
# this is for use during tuning, where moses is called to get a list of
|
||||
# feature names
|
||||
devnull = open(os.devnull,"w")
|
||||
mo = Popen(mserver.cmd + mo_args,stdout=PIPE,stderr=devnull)
|
||||
# This is for use during tuning, where moses is called to get a list
|
||||
# of feature names.
|
||||
devnull = open(os.devnull, "w")
|
||||
mo = Popen(mserver.cmd + mo_args, stdout=PIPE, stderr=devnull)
|
||||
print mo.communicate()[0].strip()
|
||||
sys.exit(0)
|
||||
pass
|
||||
|
||||
if args.nbest:
|
||||
if args.nbestFile:
|
||||
NBestFile = open(args.nbestFile,"w")
|
||||
NBestFile = open(args.nbestFile, "w")
|
||||
else:
|
||||
NBestFile = sys.stdout
|
||||
pass
|
||||
@ -239,8 +270,10 @@ if __name__ == "__main__":
|
||||
|
||||
ref = None
|
||||
aln = None
|
||||
if args.ref: ref = read_data(args.ref)
|
||||
if args.aln: aln = read_data(args.aln)
|
||||
if args.ref:
|
||||
ref = read_data(args.ref)
|
||||
if args.aln:
|
||||
aln = read_data(args.aln)
|
||||
|
||||
if ref and aln:
|
||||
try:
|
||||
@ -260,25 +293,21 @@ if __name__ == "__main__":
|
||||
line = sys.stdin.readline()
|
||||
idx = 0
|
||||
while line:
|
||||
result = translate(mserver.proxy,args,line)
|
||||
repack_result(idx,result)
|
||||
result = translate(mserver.proxy, args, line)
|
||||
repack_result(idx, result)
|
||||
line = sys.stdin.readline()
|
||||
idx += 1
|
||||
pass
|
||||
pass
|
||||
else:
|
||||
src = read_data(args.input)
|
||||
for i in xrange(len(src)):
|
||||
result = translate(mserver.proxy,args,src[i])
|
||||
repack_result(i,result)
|
||||
result = translate(mserver.proxy, args, src[i])
|
||||
repack_result(i, result)
|
||||
if args.debug:
|
||||
print >>sys.stderr, result['text'].encode('utf-8')
|
||||
pass
|
||||
if ref and aln:
|
||||
result = mserver.proxy.updater({'source' : src[i],
|
||||
'target' : ref[i],
|
||||
'alignment' : aln[i]})
|
||||
pass
|
||||
pass
|
||||
pass
|
||||
pass
|
||||
if ref and aln:
|
||||
result = mserver.proxy.updater({
|
||||
'source': src[i],
|
||||
'target': ref[i],
|
||||
'alignment': aln[i],
|
||||
})
|
||||
|
@ -2,12 +2,12 @@
|
||||
|
||||
"""
|
||||
The Gacha filter cleans out sentence pairs that have global character mean
|
||||
lower than a certain threshold.
|
||||
|
||||
Use this cleaner to produce low quantity of high quality sentence pairs.
|
||||
lower than a certain threshold.
|
||||
|
||||
It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
|
||||
WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
|
||||
Use this cleaner to produce low quantity of high quality sentence pairs.
|
||||
|
||||
It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
|
||||
WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
|
||||
(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)
|
||||
|
||||
This is inspired by the global character mean that is used in the Gale-Church
|
||||
@ -24,17 +24,24 @@ where:
|
||||
(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf)
|
||||
"""
|
||||
|
||||
import io, subprocess
|
||||
import io
|
||||
import subprocess
|
||||
|
||||
|
||||
red = '\033[01;31m'
|
||||
native = '\033[m'
|
||||
|
||||
|
||||
def err_msg(txt):
|
||||
return red+txt+native
|
||||
return red + txt + native
|
||||
|
||||
|
||||
def num_char(filename):
|
||||
return float(subprocess.Popen(["wc", "-m", filename],
|
||||
stdout=subprocess.PIPE).stdout.read().split()[0])
|
||||
process = subprocess.Popen(
|
||||
["wc", "-m", filename], stdout=subprocess.PIPE)
|
||||
# TODO: Was this meant to call communicate()?
|
||||
return float(process.stdout.read().split()[0])
|
||||
|
||||
|
||||
def gacha_mean(sourcefile, targetfile):
|
||||
"""
|
||||
@ -43,36 +50,44 @@ def gacha_mean(sourcefile, targetfile):
|
||||
"""
|
||||
sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
|
||||
c = num_char(sourcefile) / num_char(targetfile)
|
||||
sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n'))
|
||||
sys.stderr.write(err_msg('Gacha mean = ' + str(c) + '\n'))
|
||||
sys.stderr.write(err_msg('Filtering starts ...\n'))
|
||||
return c
|
||||
|
||||
|
||||
def io_open(path):
|
||||
"""Open file `path` for reading, as a UTF-8 text file."""
|
||||
return io.open(path, 'r', encoding='utf8')
|
||||
|
||||
|
||||
def main(sourcefile, targetfile, threshold=0.2):
|
||||
# Calculates Gacha mean.
|
||||
c = gacha_mean(sourcefile, targetfile)
|
||||
# Calculates lower and upperbound for filtering
|
||||
threshold = float(threshold)
|
||||
lowerbound = (1-threshold) * c
|
||||
upperbound = (1+threshold) * c
|
||||
|
||||
lowerbound = (1 - threshold) * c
|
||||
upperbound = (1 + threshold) * c
|
||||
|
||||
# Start filtering sentences.
|
||||
with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \
|
||||
io.open(targetfile, 'r', encoding='utf8') as trgfin:
|
||||
with io_open(sourcefile) as srcfin, io_open(targetfile) as trgfin:
|
||||
for s, t in zip(srcfin, trgfin):
|
||||
if lowerbound < len(s) / float(len(t)) < upperbound:
|
||||
print(u"{}\t{}\n".format(s.strip(),t.strip()))
|
||||
print(u"{}\t{}\n".format(s.strip(), t.strip()))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
if len(sys.argv) not in range(3,5):
|
||||
usage_msg = err_msg('Usage: python %s srcfile trgfile (threshold)\n'
|
||||
% sys.argv[0])
|
||||
|
||||
example_msg = err_msg('Example: gacha_cleaning.py ~/Europarl.de-en.de '
|
||||
'~/Europarl.de-en.en 0.4\n'
|
||||
% sys.argv[0])
|
||||
if len(sys.argv) not in range(3, 5):
|
||||
usage_msg = err_msg(
|
||||
"Usage: python %s srcfile trgfile (threshold)\n"
|
||||
% sys.argv[0])
|
||||
|
||||
example_msg = err_msg(
|
||||
"Example: "
|
||||
"gacha_cleaning.py ~/Europarl.de-en.de ~/Europarl.de-en.en 0.4\n"
|
||||
% sys.argv[0])
|
||||
sys.stderr.write(usage_msg)
|
||||
sys.stderr.write(example_msg)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
main(*sys.argv[1:])
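A small worked example of the bounds computed in main() above (all numbers invented):

# Worked example of the Gacha bounds.
c = 100000.0 / 95000.0            # gacha_mean: source chars / target chars, ~1.05
threshold = 0.2
lowerbound = (1 - threshold) * c  # ~0.84
upperbound = (1 + threshold) * c  # ~1.26
# A pair with 52 source characters and 50 target characters has ratio 1.04,
# which lies inside (lowerbound, upperbound), so the pair is kept.
print lowerbound < 52 / 50.0 < upperbound   # True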
|
||||
|
@ -24,9 +24,11 @@
|
||||
import optparse
|
||||
import sys
|
||||
|
||||
|
||||
class NGram(tuple):
|
||||
pass
|
||||
|
||||
|
||||
class Gap:
|
||||
def __init__(self, minSpan):
|
||||
self.minSpan = minSpan
|
||||
@ -34,8 +36,12 @@ class Gap:
|
||||
def getMinSpan(self):
|
||||
return self.minSpan
|
||||
|
||||
|
||||
def printUsage():
|
||||
sys.stderr.write("Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT")
|
||||
sys.stderr.write(
|
||||
"Usage: "
|
||||
"filter-rule-table.py [--min-non-initial-rule-count=N] INPUT")
|
||||
|
||||
|
||||
def main():
|
||||
parser = optparse.OptionParser()
|
||||
@ -54,14 +60,15 @@ def main():
|
||||
inputSentences.append(line.split())
|
||||
filterRuleTable(sys.stdin, inputSentences, N, options)
|
||||
|
||||
|
||||
def filterRuleTable(ruleTable, inputSentences, N, options):
|
||||
# Map each input n-gram (n = 1..N) to a map from sentence indices to
|
||||
# lists of intra-sentence indices.
|
||||
occurrences = {}
|
||||
for i, sentence in enumerate(inputSentences):
|
||||
for n in range(1, N+1):
|
||||
for j in range(0, len(sentence)-n+1):
|
||||
ngram = NGram(sentence[j:j+n])
|
||||
for n in range(1, N + 1):
|
||||
for j in range(0, len(sentence) - n + 1):
|
||||
ngram = NGram(sentence[j:j + n])
|
||||
innerMap = occurrences.setdefault(ngram, {})
|
||||
indices = innerMap.setdefault(i, [])
|
||||
indices.append(j)
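# To illustrate the index structure built above (invented input): with
# inputSentences = [["a", "b", "a"]] and N = 2, this loop produces
#     occurrences = {
#         ('a',):     {0: [0, 2]},
#         ('b',):     {0: [1]},
#         ('a', 'b'): {0: [0]},
#         ('b', 'a'): {0: [1]},
#     }
# i.e. each n-gram maps the sentence index to the positions where it starts.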
|
||||
@ -70,15 +77,16 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
|
||||
prevRuleIncluded = None
|
||||
for line in ruleTable:
|
||||
rhs, count = parseRule(line)
|
||||
below_threshold = (count is not None and count < options.minCount)
|
||||
# Prune non-initial rule if count is below threshold.
|
||||
if count != None and count < options.minCount and isNonInitialRule(rhs):
|
||||
if below_threshold and isNonInitialRule(rhs):
|
||||
if prevRHS != rhs:
|
||||
prevRuleIncluded = None
|
||||
prevRHS = rhs
|
||||
continue
|
||||
# If source RHS is same as last rule's then we already know whether to
|
||||
# filter or not (unless it was pruned before checking).
|
||||
if rhs == prevRHS and prevRuleIncluded != None:
|
||||
if rhs == prevRHS and prevRuleIncluded is not None:
|
||||
if prevRuleIncluded:
|
||||
print line,
|
||||
continue
|
||||
@ -89,7 +97,10 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
|
||||
prevRuleIncluded = True
|
||||
continue
|
||||
segments = segmentRHS(rhs, N)
|
||||
ngramMaps = [occurrences.get(s, {}) for s in segments if isinstance(s, NGram)]
|
||||
ngramMaps = [
|
||||
occurrences.get(s, {})
|
||||
for s in segments
|
||||
if isinstance(s, NGram)]
|
||||
if len(ngramMaps) == 0:
|
||||
print line,
|
||||
prevRuleIncluded = True
|
||||
@ -111,9 +122,13 @@ def filterRuleTable(ruleTable, inputSentences, N, options):
|
||||
break
|
||||
prevRuleIncluded = match
|
||||
|
||||
# Parse a line of the rule table and return a tuple containing two items,
|
||||
# the list of RHS source symbols and the rule count (if present).
|
||||
|
||||
def parseRule(line):
|
||||
"""Parse a line of the rule table.
|
||||
|
||||
:return: A tuple containing two items: the list of RHS source symbols,
|
||||
and the rule count (if present).
|
||||
"""
|
||||
cols = line.split("|||")
|
||||
rhsSourceSymbols = cols[0].split()[:-1]
|
||||
ruleCount = None
|
||||
@ -123,15 +138,18 @@ def parseRule(line):
|
||||
ruleCount = float(counts[2])
|
||||
return (rhsSourceSymbols, ruleCount)
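To make the column handling concrete, here is a rough illustration. The rule-table line is invented, and the exact "|||" field that holds the counts sits in the elided part of this hunk, so treat the count lookup as an assumption:

# Hypothetical Moses rule-table line (source ||| target ||| scores ||| alignment ||| counts):
line = "ein [X][X] haus [X] ||| a [X][X] house [X] ||| 0.6 0.5 ||| 0-0 2-2 ||| 10 12 8"
# cols[0].split()[:-1] drops the trailing source-side LHS label '[X]':
#   rhsSourceSymbols == ['ein', '[X][X]', 'haus']
# If a counts field is present, its third value would become the rule count:
#   ruleCount == 8.0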
|
||||
|
||||
|
||||
def isNT(symbol):
|
||||
return symbol[0] == '[' and symbol[-1] == ']'
|
||||
|
||||
|
||||
def isNonInitialRule(rhs):
|
||||
for symbol in rhs:
|
||||
if isNT(symbol):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def segmentRHS(rhs, N):
|
||||
segments = []
|
||||
terminals = []
|
||||
@ -159,13 +177,14 @@ def segmentRHS(rhs, N):
|
||||
segments.append(NGram(terminals))
|
||||
return segments
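Most of the segmentation logic sits in the elided lines, so the following only sketches the intended shape; the exact Gap spans are an assumption:

# For N = 2, a source RHS such as ['the', '[X][X]', 'cat', 'sat'] would be
# segmented into something like
#   [NGram(('the',)), Gap(1), NGram(('cat', 'sat'))]
# i.e. runs of terminals become n-grams of length <= N and each non-terminal
# becomes a Gap that must cover at least one input word.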
|
||||
|
||||
|
||||
def matchSegments(segments, indexSeq, sentenceLength):
|
||||
assert len(segments) > 0
|
||||
firstSegment = segments[0]
|
||||
i = 0
|
||||
if isinstance(firstSegment, Gap):
|
||||
minPos = firstSegment.getMinSpan()
|
||||
maxPos = sentenceLength-1
|
||||
maxPos = sentenceLength - 1
|
||||
else:
|
||||
minPos = indexSeq[i] + len(firstSegment)
|
||||
i += 1
|
||||
@ -175,7 +194,7 @@ def matchSegments(segments, indexSeq, sentenceLength):
|
||||
if minPos + segment.getMinSpan() > sentenceLength:
|
||||
return False
|
||||
minPos = minPos + segment.getMinSpan()
|
||||
maxPos = sentenceLength-1
|
||||
maxPos = sentenceLength - 1
|
||||
else:
|
||||
pos = indexSeq[i]
|
||||
i += 1
|
||||
@ -185,6 +204,7 @@ def matchSegments(segments, indexSeq, sentenceLength):
|
||||
maxPos = minPos
|
||||
return True
|
||||
|
||||
|
||||
def enumerateIndexSeqs(ngramMaps, sentenceIndex, minFirstIndex):
|
||||
assert len(ngramMaps) > 0
|
||||
if len(ngramMaps) == 1:
|
||||
@ -195,7 +215,7 @@ def enumerateIndexSeqs(ngramMaps, sentenceIndex, minFirstIndex):
|
||||
for index in ngramMaps[0][sentenceIndex]:
|
||||
if index < minFirstIndex:
|
||||
continue
|
||||
for seq in enumerateIndexSeqs(ngramMaps[1:], sentenceIndex, index+1):
|
||||
for seq in enumerateIndexSeqs(ngramMaps[1:], sentenceIndex, index + 1):
|
||||
assert seq[0] > index
|
||||
yield [index] + seq
|
||||
|
||||
|
@ -2,18 +2,23 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Author: Rico Sennrich
|
||||
|
||||
# average embeddings of special null words for RDLM.
|
||||
# Usage: average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL
|
||||
"""Average embeddings of special null words for RDLM.
|
||||
|
||||
Usage:
|
||||
average_null_embedding.py NPLM_PATH INPUT_MODEL TRAINING_FILE OUTPUT_MODEL
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import numpy
|
||||
|
||||
|
||||
def load_model(model_file):
|
||||
return nplm.NeuralLM.from_file(model_file)
|
||||
|
||||
|
||||
def get_weights(path, vocab, len_context):
|
||||
d = [[0]*vocab for i in range(len_context)]
|
||||
d = [[0] * vocab for i in range(len_context)]
|
||||
for line in open(path):
|
||||
for i, word in enumerate(line.split()[:-1]):
|
||||
d[i][int(word)] += 1
|
||||
@ -26,20 +31,23 @@ if __name__ == "__main__":
|
||||
training_instances = sys.argv[3]
|
||||
model_output = sys.argv[4]
|
||||
|
||||
sys.path.append(os.path.join(nplm_path,'python'))
|
||||
sys.path.append(os.path.join(nplm_path, 'python'))
|
||||
import nplm
|
||||
|
||||
model = load_model(model_input)
|
||||
|
||||
len_context = len(open(training_instances).readline().split())-1
|
||||
len_context = len(open(training_instances).readline().split()) - 1
|
||||
|
||||
sys.stderr.write('reading ngrams...')
|
||||
weights = numpy.array(get_weights(training_instances, len(model.input_embeddings), len_context))
|
||||
weights = numpy.array(
|
||||
get_weights(
|
||||
training_instances, len(model.input_embeddings), len_context))
|
||||
sys.stderr.write('done\n')
|
||||
|
||||
for i in range(len_context):
|
||||
index = model.word_to_index_input['<null_{0}>'.format(i)]
|
||||
model.input_embeddings[index] = numpy.average(numpy.array(model.input_embeddings), weights=weights[i], axis=0)
|
||||
model.input_embeddings[index] = numpy.average(
|
||||
numpy.array(model.input_embeddings), weights=weights[i], axis=0)
|
||||
sys.stderr.write('writing model...')
|
||||
model.to_file(open(model_output,'w'))
|
||||
model.to_file(open(model_output, 'w'))
|
||||
sys.stderr.write('done\n')
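A toy sketch of what the loop above computes; the sizes and counts are made up and only numpy is assumed:

import numpy

vocab_size, dim = 4, 3
embeddings = numpy.arange(vocab_size * dim, dtype=float).reshape(vocab_size, dim)
# weights[w] = how often word id w occurred at one fixed context position
# in the numberized training file (this is what get_weights() tallies).
weights = numpy.array([1.0, 0.0, 2.0, 1.0])
# The embedding assigned to the corresponding <null_i> word is the
# usage-weighted mean of all input embeddings at that position.
null_embedding = numpy.average(embeddings, weights=weights, axis=0)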
|
||||
|
@ -2,17 +2,25 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Author: Rico Sennrich
|
||||
|
||||
# extract syntactic n-grams from dependency treebank in Moses XML format for training RDLM
|
||||
# expected format can be produced with mosesdecoder/scripts/training/wrapper/conll2mosesxml.py
|
||||
# OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped to 0 (<unk>)
|
||||
"""
|
||||
Extract syntactic n-grams from dependency treebank in Moses XML format for
|
||||
training RDLM.
|
||||
|
||||
Expected format can be produced with
|
||||
mosesdecoder/scripts/training/wrapper/conll2mosesxml.py
|
||||
|
||||
OOV terminal symbols are mapped to preterminal; OOV nonterminals are mapped
|
||||
to 0 (<unk>)
|
||||
"""
|
||||
|
||||
from __future__ import print_function, unicode_literals, division
|
||||
import sys
|
||||
import codecs
|
||||
import argparse
|
||||
|
||||
# hack for python2/3 compatibility
|
||||
# Hack for python2/3 compatibility
|
||||
from io import open
|
||||
|
||||
argparse.open = open
|
||||
|
||||
try:
|
||||
@ -20,46 +28,84 @@ try:
|
||||
except ImportError:
|
||||
from xml.etree import cElementTree as ET
|
||||
|
||||
def create_parser():
|
||||
parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM")
|
||||
|
||||
parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
|
||||
help='input file (default: standard input).')
|
||||
parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout, metavar='PATH',
|
||||
help='output file (default: standard output).')
|
||||
parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)',
|
||||
choices=['label', 'head'], required=True)
|
||||
parser.add_argument('--vocab', metavar='PATH', type=str, required=True,
|
||||
help='input layer vocabulary file (one item per line; first line \'<unk>\')')
|
||||
parser.add_argument('--output_vocab', metavar='PATH', type=str,
|
||||
help='output layer vocabulary file (default: use input layer vocabulary)')
|
||||
parser.add_argument('--left_context', metavar='INT', type=int,
|
||||
help='size of context vector for left siblings (default: %(default)s)', default=3)
|
||||
parser.add_argument('--right_context', metavar='INT', type=int,
|
||||
help='size of context vector for right siblings (default: %(default)s)', default=0)
|
||||
parser.add_argument('--up_context', metavar='INT', type=int,
|
||||
help='size of context vector for ancestors (default: %(default)s)', default=2)
|
||||
parser.add_argument('--glue_symbol', metavar='STR', type=str, default='Q',
|
||||
help='glue symbol. Will be skipped during extraction (default: %(default)s)')
|
||||
parser.add_argument('--start_symbol', metavar='STR', type=str, default='SSTART',
|
||||
help='sentence start symbol. Will be skipped during extraction (default: %(default)s)')
|
||||
parser.add_argument('--end_symbol', metavar='STR', type=str, default='SEND',
|
||||
help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
|
||||
parser.add_argument('--ptkvz', action='store_true',
|
||||
help='special rule for German dependency trees: concatenate separable verb prefix and verb')
|
||||
def create_parser():
|
||||
parser = argparse.ArgumentParser(
|
||||
description=(
|
||||
"Extract syntactic n-grams from parsed corpus in "
|
||||
"Moses XML format for training RDLM"))
|
||||
|
||||
parser.add_argument(
|
||||
'--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
|
||||
metavar='PATH',
|
||||
help='Input file (default: standard input).')
|
||||
parser.add_argument(
|
||||
'--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
|
||||
metavar='PATH',
|
||||
help='Output file (default: standard output).')
|
||||
parser.add_argument(
|
||||
'--mode', type=str, choices=['label', 'head'], required=True,
|
||||
help='Predict terminals (head) or dependency labels (label).')
|
||||
parser.add_argument(
|
||||
'--vocab', metavar='PATH', type=str, required=True,
|
||||
help=(
|
||||
"Input layer vocabulary file (one item per line; "
|
||||
"first line '<unk>')"))
|
||||
parser.add_argument(
|
||||
'--output_vocab', metavar='PATH', type=str,
|
||||
help=(
|
||||
"Output layer vocabulary file "
|
||||
"(default: use input layer vocabulary)"))
|
||||
parser.add_argument(
|
||||
'--left_context', metavar='INT', type=int, default=3,
|
||||
help=(
|
||||
"Size of context vector for left siblings "
|
||||
"(default: %(default)s)"))
|
||||
parser.add_argument(
|
||||
'--right_context', metavar='INT', type=int, default=0,
|
||||
help=(
|
||||
"Size of context vector for right siblings "
|
||||
"(default: %(default)s)"))
|
||||
parser.add_argument(
|
||||
'--up_context', metavar='INT', type=int, default=2,
|
||||
help=(
|
||||
"Size of context vector for ancestors "
|
||||
"(default: %(default)s)"))
|
||||
parser.add_argument(
|
||||
'--glue_symbol', metavar='STR', type=str, default='Q',
|
||||
help=(
|
||||
"Glue symbol. Will be skipped during extraction "
|
||||
"(default: %(default)s)"))
|
||||
parser.add_argument(
|
||||
'--start_symbol', metavar='STR', type=str, default='SSTART',
|
||||
help=(
|
||||
"Sentence start symbol. Will be skipped during extraction "
|
||||
"(default: %(default)s)"))
|
||||
parser.add_argument(
|
||||
'--end_symbol', metavar='STR', type=str, default='SEND',
|
||||
help=(
|
||||
"Sentence end symbol. Will be skipped during extraction "
|
||||
"(default: %(default)s)"))
|
||||
parser.add_argument(
|
||||
'--ptkvz', action='store_true',
|
||||
help=(
|
||||
"Special rule for German dependency trees: "
|
||||
"concatenate separable verb prefix and verb."))
|
||||
return parser
|
||||
|
||||
|
||||
def escape_text(s):
|
||||
|
||||
s = s.replace('|','&#124;') # factor separator
s = s.replace('[','&#91;') # syntax non-terminal
s = s.replace(']','&#93;') # syntax non-terminal
s = s.replace('\'','&apos;') # xml special character
s = s.replace('"','&quot;') # xml special character
s = s.replace('|', '&#124;') # factor separator
s = s.replace('[', '&#91;') # syntax non-terminal
s = s.replace(']', '&#93;') # syntax non-terminal
s = s.replace('\'', '&apos;') # xml special character
s = s.replace('"', '&quot;') # xml special character
|
||||
return s
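Assuming the replacement targets are the usual Moses escape entities (as restored above), the effect is, for example:

# escape_text("a|b [NP] 's") == "a&#124;b &#91;NP&#93; &apos;s"

i.e. factor separators, syntax brackets, and XML-sensitive quotes are turned into entities before tokens are written out.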
|
||||
|
||||
# deterministic heuristic to get head of subtree
|
||||
|
||||
def get_head(xml, add_ptkvz):
|
||||
"""Deterministic heuristic to get head of subtree."""
|
||||
head = None
|
||||
preterminal = None
|
||||
for child in xml:
|
||||
@ -77,23 +123,38 @@ def get_head(xml, add_ptkvz):
|
||||
|
||||
return head, preterminal
|
||||
|
||||
def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, parent_labels=None):
|
||||
|
||||
def get_syntactic_ngrams(xml, options, vocab, output_vocab,
|
||||
parent_heads=None, parent_labels=None):
|
||||
|
||||
if len(xml):
|
||||
|
||||
# skip glue rules
|
||||
if xml.get('label') == options.glue_symbol or xml.get('label') == options.start_symbol or xml.get('label') == options.end_symbol:
|
||||
for child in xml:
|
||||
get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
|
||||
return
|
||||
# Skip glue rules.
|
||||
skip_glue_labels = [
|
||||
options.glue_symbol,
|
||||
options.start_symbol,
|
||||
options.end_symbol,
|
||||
]
|
||||
if xml.get('label') in skip_glue_labels:
|
||||
for child in xml:
|
||||
get_syntactic_ngrams(
|
||||
child, options, vocab, output_vocab, parent_heads,
|
||||
parent_labels)
|
||||
return
|
||||
|
||||
# skip virtual nodes
|
||||
if xml.get('label') == '<stop_label>' or xml.get('label') == '<start_label>':
|
||||
return
|
||||
# Skip virtual nodes.
|
||||
skip_virtual_labels = [
|
||||
'<stop_label>',
|
||||
'<start_label>',
|
||||
]
|
||||
if xml.get('label') in skip_virtual_labels:
|
||||
return
|
||||
|
||||
if not parent_heads:
|
||||
parent_heads = [vocab.get('<root_head>', 0)] * options.up_context
|
||||
parent_labels = [vocab.get('<root_label>', 0)] * options.up_context
|
||||
parent_heads = (
|
||||
[vocab.get('<root_head>', 0)] * options.up_context)
|
||||
parent_labels = (
|
||||
[vocab.get('<root_label>', 0)] * options.up_context)
|
||||
|
||||
head, preterminal = get_head(xml, options.ptkvz)
|
||||
if not head:
|
||||
@ -119,7 +180,8 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
|
||||
options.output.write(' '.join(map(str, int_list)) + '\n')
|
||||
elif options.mode == 'head' and not head == '<dummy_head>':
|
||||
int_list.append(vocab.get(label, 0))
|
||||
int_list.append(output_vocab.get(head, output_vocab.get(preterminal, 0)))
|
||||
int_list.append(
|
||||
output_vocab.get(head, output_vocab.get(preterminal, 0)))
|
||||
options.output.write(' '.join(map(str, int_list)) + '\n')
|
||||
|
||||
parent_heads.append(vocab.get(head, 0))
|
||||
@ -130,28 +192,29 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
|
||||
if options.right_context:
|
||||
start = ET.Element('tree')
|
||||
start2 = ET.Element('tree')
|
||||
start.set('label','<start_label>')
|
||||
start2.set('label','XY')
|
||||
start.set('label', '<start_label>')
|
||||
start2.set('label', 'XY')
|
||||
start2.text = '<start_head>'
|
||||
start.append(start2)
|
||||
xml.insert(0,start)
|
||||
xml.insert(0, start)
|
||||
if options.left_context:
|
||||
end = ET.Element('tree')
|
||||
end2 = ET.Element('tree')
|
||||
end.set('label','<stop_label>')
|
||||
end2.set('label','XY')
|
||||
end.set('label', '<stop_label>')
|
||||
end2.set('label', 'XY')
|
||||
end2.text = '<stop_head>'
|
||||
end.append(end2)
|
||||
xml.append(end)
|
||||
|
||||
|
||||
heads = []
|
||||
preterminals = []
|
||||
labels = []
|
||||
|
||||
for child in xml:
|
||||
if not len(child):
|
||||
# mark that the previous sibling is the head of the structure (the head/label are not repeated because they're also head/label of the parent)
|
||||
# Mark that the previous sibling is the head of the
|
||||
# structure (the head/label are not repeated because they're
|
||||
# also head/label of the parent).
|
||||
head_child = '<head_head>'
|
||||
preterminal_child = head_child
|
||||
child_label = '<head_label>'
|
||||
@ -166,37 +229,60 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
|
||||
preterminals.append(preterminal_child)
|
||||
labels.append(child_label)
|
||||
|
||||
heads_idx = [vocab.get(heads[i], vocab.get(preterminals[i], 0)) for i in range(len(heads))]
|
||||
labels_idx = [vocab.get(labels[i], 0) for i in range(len(labels))]
|
||||
heads_idx = [
|
||||
vocab.get(heads[i], vocab.get(preterminals[i], 0))
|
||||
for i in range(len(heads))]
|
||||
labels_idx = [
|
||||
vocab.get(labels[i], 0)
|
||||
for i in range(len(labels))]
|
||||
|
||||
#ancestor context is same for all children
|
||||
# Ancestor context is the same for all children.
|
||||
up_heads = parent_heads[-options.up_context:]
|
||||
up_labels = parent_labels[-options.up_context:]
|
||||
|
||||
for i,child in enumerate(xml):
|
||||
skip_special_heads = [
|
||||
'<dummy_head>',
|
||||
'<head_head>',
|
||||
'<stop_head>',
|
||||
'<start_head>',
|
||||
]
|
||||
for i, child in enumerate(xml):
|
||||
|
||||
# skip some special symbols, but recursively extract n-grams for its children
|
||||
if options.mode == 'head' and (heads[i] == '<dummy_head>' or heads[i] == '<head_head>' or heads[i] == '<stop_head>' or heads[i] == '<start_head>'):
|
||||
# Skip some special symbols, but recursively extract n-grams
|
||||
# for its children.
|
||||
if options.mode == 'head' and heads[i] in skip_special_heads:
|
||||
parent_heads.append(vocab.get(heads[i], 0))
|
||||
parent_labels.append(vocab.get(labels[i], 0))
|
||||
get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
|
||||
get_syntactic_ngrams(
|
||||
child, options, vocab, output_vocab, parent_heads,
|
||||
parent_labels)
|
||||
parent_heads.pop()
|
||||
parent_labels.pop()
|
||||
continue
|
||||
|
||||
previous_heads = heads_idx[max(0,i-options.left_context):i]
|
||||
previous_labels = labels_idx[max(0,i-options.left_context):i]
|
||||
previous_heads = heads_idx[max(0, i - options.left_context):i]
|
||||
previous_labels = labels_idx[max(0, i - options.left_context):i]
|
||||
|
||||
subsequent_heads = heads_idx[i+1:i+options.right_context+1]
|
||||
subsequent_labels = labels_idx[i+1:i+options.right_context+1]
|
||||
subsequent_heads = heads_idx[i + 1:i + options.right_context + 1]
|
||||
subsequent_labels = labels_idx[i + 1:i + options.right_context + 1]
|
||||
|
||||
if len(previous_heads) < options.left_context:
|
||||
previous_heads = [start_head_idx] * (options.left_context-len(previous_heads)) + previous_heads
|
||||
previous_labels = [start_label_idx] * (options.left_context-len(previous_labels)) + previous_labels
|
||||
previous_heads = (
|
||||
[start_head_idx] *
|
||||
(options.left_context - len(previous_heads)) +
|
||||
previous_heads)
|
||||
previous_labels = (
|
||||
[start_label_idx] *
|
||||
(options.left_context - len(previous_labels)) +
|
||||
previous_labels)
|
||||
|
||||
if len(subsequent_heads) < options.right_context:
|
||||
subsequent_heads = subsequent_heads + [stop_head_idx] * (options.right_context-len(subsequent_heads))
|
||||
subsequent_labels = subsequent_labels + [stop_label_idx] * (options.right_context-len(subsequent_labels))
|
||||
subsequent_heads += (
|
||||
[stop_head_idx] *
|
||||
(options.right_context - len(subsequent_heads)))
|
||||
subsequent_labels += (
|
||||
[stop_label_idx] *
|
||||
(options.right_context - len(subsequent_labels)))
|
||||
|
||||
int_list = []
|
||||
int_list.extend(previous_heads)
|
||||
@ -209,14 +295,19 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
|
||||
int_list.append(output_vocab.get(labels[i], 0))
|
||||
elif options.mode == 'head':
|
||||
int_list.append(vocab.get(labels[i], 0))
|
||||
int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0)))
|
||||
int_list.append(
|
||||
output_vocab.get(
|
||||
heads[i], output_vocab.get(preterminals[i], 0)))
|
||||
|
||||
options.output.write(' '.join(map(str, int_list)) + '\n')
|
||||
|
||||
parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0)))
|
||||
parent_heads.append(
|
||||
vocab.get(heads[i], vocab.get(preterminals[i], 0)))
|
||||
parent_labels.append(vocab.get(labels[i], 0))
|
||||
|
||||
get_syntactic_ngrams(child, options, vocab, output_vocab, parent_heads, parent_labels)
|
||||
get_syntactic_ngrams(
|
||||
child, options, vocab, output_vocab, parent_heads,
|
||||
parent_labels)
|
||||
|
||||
parent_heads.pop()
|
||||
parent_labels.pop()
|
||||
@ -224,15 +315,17 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
|
||||
|
||||
def load_vocab(path):
|
||||
v = {}
|
||||
for i,line in enumerate(open(path, encoding="UTF-8")):
|
||||
for i, line in enumerate(open(path, encoding="UTF-8")):
|
||||
v[line.strip()] = i
|
||||
return v
|
||||
|
||||
|
||||
def main(options):
|
||||
vocab = load_vocab(options.vocab)
|
||||
|
||||
if options.output_vocab is None:
|
||||
sys.stderr.write('no output vocabulary specified; using input vocabulary\n')
|
||||
sys.stderr.write(
|
||||
"No output vocabulary specified; using input vocabulary.\n")
|
||||
output_vocab = vocab
|
||||
else:
|
||||
output_vocab = load_vocab(options.output_vocab)
|
||||
@ -275,4 +368,4 @@ if __name__ == '__main__':
|
||||
parser = create_parser()
|
||||
options = parser.parse_args()
|
||||
|
||||
main(options)
|
||||
main(options)
|
||||
|
@ -9,6 +9,7 @@ import sys
|
||||
import codecs
|
||||
import argparse
|
||||
from collections import Counter
|
||||
from textwrap import dedent
|
||||
|
||||
# hack for python2/3 compatibility
|
||||
from io import open
|
||||
@ -19,37 +20,49 @@ try:
|
||||
except ImportError:
|
||||
from xml.etree import cElementTree as ET
|
||||
|
||||
|
||||
HELP_TEXT = dedent("""\
|
||||
generate 5 vocabulary files from parsed corpus in moses XML format
|
||||
[PREFIX].special: around 40 symbols reserved for RDLM
|
||||
[PREFIX].preterminals: preterminal symbols
|
||||
[PREFIX].nonterminals: nonterminal symbols (which are not preterminal)
|
||||
[PREFIX].terminals: terminal symbols
|
||||
[PREFIX].all: all of the above
|
||||
""")
|
||||
|
||||
|
||||
def create_parser():
|
||||
parser = argparse.ArgumentParser(
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description=HELP_TEXT)
|
||||
|
||||
help_text = "generate 5 vocabulary files from parsed corpus in moses XML format\n"
|
||||
help_text += " [PREFIX].special: around 40 symbols reserved for RDLM\n";
|
||||
help_text += " [PREFIX].preterminals: preterminal symbols\n";
|
||||
help_text += " [PREFIX].nonterminals: nonterminal symbols (which are not preterminal)\n";
|
||||
help_text += " [PREFIX].terminals: terminal symbols\n";
|
||||
help_text += " [PREFIX].all: all of the above\n"
|
||||
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text)
|
||||
|
||||
parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
|
||||
help='input text (default: standard input).')
|
||||
parser.add_argument('--output', '-o', type=str, default='vocab', metavar='PREFIX',
|
||||
help='output prefix (default: "vocab")')
|
||||
parser.add_argument('--ptkvz', action="store_true",
|
||||
help='special rule for German dependency trees: attach separable verb prefixes to verb')
|
||||
parser.add_argument(
|
||||
'--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
|
||||
metavar='PATH',
|
||||
help="Input text (default: standard input).")
|
||||
parser.add_argument(
|
||||
'--output', '-o', type=str, default='vocab', metavar='PREFIX',
|
||||
help="Output prefix (default: 'vocab')")
|
||||
parser.add_argument(
|
||||
'--ptkvz', action="store_true",
|
||||
help=(
|
||||
"Special rule for German dependency trees: attach separable "
|
||||
"verb prefixes to verb."))
|
||||
|
||||
return parser
|
||||
|
||||
def escape_text(s):
|
||||
|
||||
s = s.replace('|','&#124;') # factor separator
s = s.replace('[','&#91;') # syntax non-terminal
s = s.replace(']','&#93;') # syntax non-terminal
s = s.replace('\'','&apos;') # xml special character
s = s.replace('"','&quot;') # xml special character
def escape_text(s):
s = s.replace('|', '&#124;') # factor separator
s = s.replace('[', '&#91;') # syntax non-terminal
s = s.replace(']', '&#93;') # syntax non-terminal
s = s.replace('\'', '&apos;') # xml special character
s = s.replace('"', '&quot;') # xml special character
|
||||
return s
|
||||
|
||||
# deterministic heuristic to get head of subtree
|
||||
|
||||
def get_head(xml, args):
|
||||
"""Deterministic heuristic to get head of subtree."""
|
||||
head = None
|
||||
preterminal = None
|
||||
for child in xml:
|
||||
@ -67,6 +80,7 @@ def get_head(xml, args):
|
||||
|
||||
return head, preterminal
|
||||
|
||||
|
||||
def get_vocab(xml, args):
|
||||
|
||||
if len(xml):
|
||||
@ -88,6 +102,7 @@ def get_vocab(xml, args):
|
||||
continue
|
||||
get_vocab(child, args)
|
||||
|
||||
|
||||
def main(args):
|
||||
|
||||
global heads
|
||||
@ -111,10 +126,24 @@ def main(args):
|
||||
get_vocab(xml, args)
|
||||
i += 1
|
||||
|
||||
special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']
|
||||
special_tokens = [
|
||||
'<unk>',
|
||||
'<null>',
|
||||
'<null_label>',
|
||||
'<null_head>',
|
||||
'<head_label>',
|
||||
'<root_label>',
|
||||
'<start_label>',
|
||||
'<stop_label>',
|
||||
'<head_head>',
|
||||
'<root_head>',
|
||||
'<start_head>',
|
||||
'<dummy_head>',
|
||||
'<stop_head>',
|
||||
]
|
||||
|
||||
for i in range(30):
|
||||
special_tokens.append('<null_{0}>'.format(i))
|
||||
special_tokens.append('<null_{0}>'.format(i))
|
||||
|
||||
f = open(args.output + '.special', 'w', encoding='UTF-8')
|
||||
for item in special_tokens:
|
||||
@ -158,7 +187,6 @@ def main(args):
|
||||
f.close()
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if sys.version_info < (3, 0):
|
||||
|
@ -9,7 +9,6 @@ import subprocess
|
||||
import sys
|
||||
import os
|
||||
import codecs
|
||||
import copy
|
||||
|
||||
# ../bilingual-lm
|
||||
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), 'bilingual-lm'))
|
||||
@ -17,143 +16,224 @@ import train_nplm
|
||||
import extract_vocab
|
||||
import extract_syntactic_ngrams
|
||||
|
||||
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
|
||||
logging.basicConfig(
|
||||
format='%(asctime)s %(levelname)s: %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--working-dir", dest="working_dir", metavar="PATH")
|
||||
parser.add_argument("--corpus", dest="corpus_stem", metavar="PATH", help="input file")
|
||||
parser.add_argument("--nplm-home", dest="nplm_home", metavar="PATH", help="location of NPLM", required=True)
|
||||
parser.add_argument("--epochs", dest="epochs", type=int, metavar="INT", help="number of training epochs (default: %(default)s)")
|
||||
parser.add_argument("--up-context-size", dest="up_context_size", type=int, metavar="INT", help="size of ancestor context (default: %(default)s)")
|
||||
parser.add_argument("--left-context-size", dest="left_context_size", type=int, metavar="INT", help="size of sibling context (left) (default: %(default)s)")
|
||||
parser.add_argument("--right-context-size", dest="right_context_size", type=int, metavar="INT", help="size of sibling context (right) (default: %(default)s)")
|
||||
parser.add_argument("--mode", dest="mode", choices=['head', 'label'], help="type of RDLM to train (both are required for decoding)", required=True)
|
||||
parser.add_argument("--minibatch-size", dest="minibatch_size", type=int, metavar="INT", help="minibatch size (default: %(default)s)")
|
||||
parser.add_argument("--noise", dest="noise", type=int, metavar="INT", help="number of noise samples for NCE (default: %(default)s)")
|
||||
parser.add_argument("--hidden", dest="hidden", type=int, metavar="INT", help="size of hidden layer (0 for single hidden layer) (default: %(default)s)")
|
||||
parser.add_argument("--input-embedding", dest="input_embedding", type=int, metavar="INT", help="size of input embedding layer (default: %(default)s)")
|
||||
parser.add_argument("--output-embedding", dest="output_embedding", type=int, metavar="INT", help="size of output embedding layer (default: %(default)s)")
|
||||
parser.add_argument("--threads", "-t", dest="threads", type=int, metavar="INT", help="number of threads (default: %(default)s)")
|
||||
parser.add_argument("--output-model", dest="output_model", metavar="PATH", help="name of output model (default: %(default)s)")
|
||||
parser.add_argument("--output-dir", dest="output_dir", metavar="PATH", help="output directory (default: same as working-dir)")
|
||||
parser.add_argument("--config-options-file", dest="config_options_file", metavar="PATH")
|
||||
parser.add_argument("--log-file", dest="log_file", metavar="PATH", help="log file to write to (default: %(default)s)")
|
||||
parser.add_argument("--validation-corpus", dest="validation_corpus", metavar="PATH", help="validation file (default: %(default)s)")
|
||||
parser.add_argument("--activation-function", dest="activation_fn", choices=['identity', 'rectifier', 'tanh', 'hardtanh'], help="activation function (default: %(default)s)")
|
||||
parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar="FLOAT", help="learning rate (default: %(default)s)")
|
||||
parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)")
|
||||
parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)")
|
||||
parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)")
|
||||
parser.add_argument("--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)")
|
||||
parser.add_argument(
|
||||
"--working-dir", dest="working_dir", metavar="PATH")
|
||||
parser.add_argument(
|
||||
"--corpus", dest="corpus_stem", metavar="PATH", help="Input file.")
|
||||
parser.add_argument(
|
||||
"--nplm-home", dest="nplm_home", metavar="PATH", required=True,
|
||||
help="Location of NPLM.")
|
||||
parser.add_argument(
|
||||
"--epochs", dest="epochs", type=int, metavar="INT",
|
||||
help="Number of training epochs (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--up-context-size", dest="up_context_size", type=int, metavar="INT",
|
||||
help="Size of ancestor context (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--left-context-size", dest="left_context_size", type=int, metavar="INT",
|
||||
help="Size of sibling context (left) (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--right-context-size", dest="right_context_size", type=int,
|
||||
metavar="INT",
|
||||
help="Size of sibling context (right) (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--mode", dest="mode", choices=['head', 'label'], required=True,
|
||||
help="Type of RDLM to train (both are required for decoding).")
|
||||
parser.add_argument(
|
||||
"--minibatch-size", dest="minibatch_size", type=int, metavar="INT",
|
||||
help="Minibatch size (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--noise", dest="noise", type=int, metavar="INT",
|
||||
help="Number of noise samples for NCE (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--hidden", dest="hidden", type=int, metavar="INT",
|
||||
help=(
|
||||
"Size of hidden layer (0 for single hidden layer) "
|
||||
"(default: %(default)s)"))
|
||||
parser.add_argument(
|
||||
"--input-embedding", dest="input_embedding", type=int, metavar="INT",
|
||||
help="Size of input embedding layer (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--output-embedding", dest="output_embedding", type=int, metavar="INT",
|
||||
help="Size of output embedding layer (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--threads", "-t", dest="threads", type=int, metavar="INT",
|
||||
help="Number of threads (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--output-model", dest="output_model", metavar="PATH",
|
||||
help="Name of output model (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--output-dir", dest="output_dir", metavar="PATH",
|
||||
help="Output directory (default: same as working-dir).")
|
||||
parser.add_argument(
|
||||
"--config-options-file", dest="config_options_file", metavar="PATH")
|
||||
parser.add_argument(
|
||||
"--log-file", dest="log_file", metavar="PATH",
|
||||
help="Log file to write to (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--validation-corpus", dest="validation_corpus", metavar="PATH",
|
||||
help="Validation file (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--activation-function", dest="activation_fn",
|
||||
choices=['identity', 'rectifier', 'tanh', 'hardtanh'],
|
||||
help="Activation function (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--learning-rate", dest="learning_rate", type=float, metavar="FLOAT",
|
||||
help="Learning rate (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--input-words-file", dest="input_words_file", metavar="PATH",
|
||||
help="Input vocabulary (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--output-words-file", dest="output_words_file", metavar="PATH",
|
||||
help="Output vocabulary (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT",
|
||||
help="Input vocabulary size (default: %(default)s).")
|
||||
parser.add_argument(
|
||||
"--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT",
|
||||
help="Output vocabulary size (default: %(default)s).")
|
||||
|
||||
|
||||
parser.set_defaults(
|
||||
working_dir = "working"
|
||||
,corpus_stem = "train"
|
||||
,nplm_home = "/home/bhaddow/tools/nplm"
|
||||
,epochs = 2
|
||||
,up_context_size = 2
|
||||
,left_context_size = 3
|
||||
,right_context_size = 0
|
||||
,minibatch_size=1000
|
||||
,noise=100
|
||||
,hidden=0
|
||||
,mode='head'
|
||||
,input_embedding=150
|
||||
,output_embedding=750
|
||||
,threads=4
|
||||
,output_model = "train"
|
||||
,output_dir = None
|
||||
,config_options_file = "config"
|
||||
,log_file = "log"
|
||||
,validation_corpus = None
|
||||
,activation_fn = "rectifier"
|
||||
,learning_rate = 1
|
||||
,input_words_file = None
|
||||
,output_words_file = None
|
||||
,input_vocab_size = 500000
|
||||
,output_vocab_size = 500000
|
||||
)
|
||||
working_dir="working",
|
||||
corpus_stem="train",
|
||||
nplm_home="/home/bhaddow/tools/nplm",
|
||||
epochs=2,
|
||||
up_context_size=2,
|
||||
left_context_size=3,
|
||||
right_context_size=0,
|
||||
minibatch_size=1000,
|
||||
noise=100,
|
||||
hidden=0,
|
||||
mode='head',
|
||||
input_embedding=150,
|
||||
output_embedding=750,
|
||||
threads=4,
|
||||
output_model="train",
|
||||
output_dir=None,
|
||||
config_options_file="config",
|
||||
log_file="log",
|
||||
validation_corpus=None,
|
||||
activation_fn="rectifier",
|
||||
learning_rate=1,
|
||||
input_words_file=None,
|
||||
output_words_file=None,
|
||||
input_vocab_size=500000,
|
||||
output_vocab_size=500000)
|
||||
|
||||
|
||||
def prepare_vocabulary(options):
|
||||
vocab_prefix = os.path.join(options.working_dir, 'vocab')
|
||||
extract_vocab_options = extract_vocab.create_parser().parse_args(['--input', options.corpus_stem, '--output', vocab_prefix])
|
||||
extract_vocab.main(extract_vocab_options)
|
||||
vocab_prefix = os.path.join(options.working_dir, 'vocab')
|
||||
extract_vocab_options = extract_vocab.create_parser().parse_args(
|
||||
['--input', options.corpus_stem, '--output', vocab_prefix])
|
||||
extract_vocab.main(extract_vocab_options)
|
||||
|
||||
if options.input_words_file is None:
|
||||
options.input_words_file = vocab_prefix + '.input'
|
||||
orig = vocab_prefix + '.all'
|
||||
filtered_vocab = open(orig).readlines()
|
||||
if options.input_vocab_size:
|
||||
filtered_vocab = filtered_vocab[:options.input_vocab_size]
|
||||
open(options.input_words_file,'w').writelines(filtered_vocab)
|
||||
if options.input_words_file is None:
|
||||
options.input_words_file = vocab_prefix + '.input'
|
||||
orig = vocab_prefix + '.all'
|
||||
filtered_vocab = open(orig).readlines()
|
||||
if options.input_vocab_size:
|
||||
filtered_vocab = filtered_vocab[:options.input_vocab_size]
|
||||
open(options.input_words_file, 'w').writelines(filtered_vocab)
|
||||
|
||||
if options.output_words_file is None:
|
||||
options.output_words_file = vocab_prefix + '.output'
|
||||
if options.mode == 'label':
|
||||
blacklist = [
|
||||
'<null',
|
||||
'<root',
|
||||
'<start_head',
|
||||
'<dummy',
|
||||
'<head_head',
|
||||
'<stop_head',
|
||||
]
|
||||
orig = vocab_prefix + '.special'
|
||||
filtered_vocab = open(orig).readlines()
|
||||
orig = vocab_prefix + '.nonterminals'
|
||||
filtered_vocab += open(orig).readlines()
|
||||
filtered_vocab = [
|
||||
word
|
||||
for word in filtered_vocab
|
||||
if not any(word.startswith(prefix) for prefix in blacklist)]
|
||||
if options.output_vocab_size:
|
||||
filtered_vocab = filtered_vocab[:options.output_vocab_size]
|
||||
else:
|
||||
orig = vocab_prefix + '.all'
|
||||
filtered_vocab = open(orig).readlines()[:options.output_vocab_size]
|
||||
open(options.output_words_file, 'w').writelines(filtered_vocab)
|
||||
|
||||
if options.output_words_file is None:
|
||||
options.output_words_file = vocab_prefix + '.output'
|
||||
if options.mode == 'label':
|
||||
blacklist = ['<null', '<root', '<start_head', '<dummy', '<head_head', '<stop_head']
|
||||
orig = vocab_prefix + '.special'
|
||||
filtered_vocab = open(orig).readlines()
|
||||
orig = vocab_prefix + '.nonterminals'
|
||||
filtered_vocab += open(orig).readlines()
|
||||
filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)]
|
||||
if options.output_vocab_size:
|
||||
filtered_vocab = filtered_vocab[:options.output_vocab_size]
|
||||
else:
|
||||
orig = vocab_prefix + '.all'
|
||||
filtered_vocab = open(orig).readlines()[:options.output_vocab_size]
|
||||
open(options.output_words_file,'w').writelines(filtered_vocab)
|
||||
|
||||
def main(options):
|
||||
|
||||
options.ngram_size = 2*options.up_context_size + 2*options.left_context_size + 2*options.right_context_size
|
||||
if options.mode == 'head':
|
||||
options.ngram_size += 2
|
||||
elif options.mode == 'label':
|
||||
options.ngram_size += 1
|
||||
options.ngram_size = (
|
||||
2 * options.up_context_size +
|
||||
2 * options.left_context_size +
|
||||
2 * options.right_context_size
|
||||
)
|
||||
if options.mode == 'head':
|
||||
options.ngram_size += 2
|
||||
elif options.mode == 'label':
|
||||
options.ngram_size += 1
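With the defaults set further down in this file (up_context_size=2, left_context_size=3, right_context_size=0), the arithmetic works out as:

# ngram_size = 2*2 + 2*3 + 2*0 = 10  (one head and one label per context position)
# head mode:  10 + 2 = 12
# label mode: 10 + 1 = 11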
|
||||
|
||||
if options.input_words_file is None or options.output_words_file is None:
|
||||
sys.stderr.write('either input vocabulary or output vocabulary not specified: extracting vocabulary from training text\n')
|
||||
prepare_vocabulary(options)
|
||||
if options.input_words_file is None or options.output_words_file is None:
|
||||
sys.stderr.write(
|
||||
"Either input vocabulary or output vocabulary not specified: "
|
||||
"extracting vocabulary from training text.\n")
|
||||
prepare_vocabulary(options)
|
||||
|
||||
extract_options = extract_syntactic_ngrams.create_parser().parse_args(['--input', options.corpus_stem,
|
||||
'--output', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
|
||||
'--vocab', options.input_words_file,
|
||||
'--output_vocab', options.output_words_file,
|
||||
'--right_context', str(options.right_context_size),
|
||||
'--left_context', str(options.left_context_size),
|
||||
'--up_context', str(options.up_context_size),
|
||||
'--mode', options.mode
|
||||
])
|
||||
sys.stderr.write('extracting syntactic n-grams\n')
|
||||
extract_syntactic_ngrams.main(extract_options)
|
||||
|
||||
if options.validation_corpus:
|
||||
extract_options.input = open(options.validation_corpus)
|
||||
options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus))
|
||||
extract_options.output = open(options.validation_file + '.numberized', 'w')
|
||||
sys.stderr.write('extracting syntactic n-grams (validation file)\n')
|
||||
extract_options = extract_syntactic_ngrams.create_parser().parse_args([
|
||||
'--input', options.corpus_stem,
|
||||
'--output', os.path.join(
|
||||
options.working_dir,
|
||||
os.path.basename(options.corpus_stem) + '.numberized'),
|
||||
'--vocab', options.input_words_file,
|
||||
'--output_vocab', options.output_words_file,
|
||||
'--right_context', str(options.right_context_size),
|
||||
'--left_context', str(options.left_context_size),
|
||||
'--up_context', str(options.up_context_size),
|
||||
'--mode', options.mode
|
||||
])
|
||||
sys.stderr.write('extracting syntactic n-grams\n')
|
||||
extract_syntactic_ngrams.main(extract_options)
|
||||
extract_options.output.close()
|
||||
|
||||
sys.stderr.write('training neural network\n')
|
||||
train_nplm.main(options)
|
||||
if options.validation_corpus:
|
||||
extract_options.input = open(options.validation_corpus)
|
||||
options.validation_file = os.path.join(
|
||||
options.working_dir, os.path.basename(options.validation_corpus))
|
||||
extract_options.output = open(
|
||||
options.validation_file + '.numberized', 'w')
|
||||
sys.stderr.write('extracting syntactic n-grams (validation file)\n')
|
||||
extract_syntactic_ngrams.main(extract_options)
|
||||
extract_options.output.close()
|
||||
|
||||
sys.stderr.write('training neural network\n')
|
||||
train_nplm.main(options)
|
||||
|
||||
sys.stderr.write('averaging null words\n')
|
||||
ret = subprocess.call([
|
||||
os.path.join(sys.path[0], 'average_null_embedding.py'),
|
||||
options.nplm_home,
|
||||
os.path.join(
|
||||
options.output_dir,
|
||||
options.output_model + '.model.nplm.' + str(options.epochs)),
|
||||
os.path.join(
|
||||
options.working_dir,
|
||||
os.path.basename(options.corpus_stem) + '.numberized'),
|
||||
os.path.join(options.output_dir, options.output_model + '.model.nplm')
|
||||
])
|
||||
if ret:
|
||||
raise Exception("averaging null words failed")
|
||||
|
||||
sys.stderr.write('averaging null words\n')
|
||||
ret = subprocess.call([os.path.join(sys.path[0], 'average_null_embedding.py'),
|
||||
options.nplm_home,
|
||||
os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
|
||||
os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
|
||||
os.path.join(options.output_dir, options.output_model + '.model.nplm')
|
||||
])
|
||||
if ret:
|
||||
raise Exception("averaging null words failed")
|
||||
|
||||
if __name__ == "__main__":
|
||||
if sys.version_info < (3, 0):
|
||||
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
|
||||
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
|
||||
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
|
||||
|
||||
options = parser.parse_args()
|
||||
main(options)
|
||||
if sys.version_info < (3, 0):
|
||||
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
|
||||
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
|
||||
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
|
||||
|
||||
options = parser.parse_args()
|
||||
main(options)
|
||||
|
@ -2,42 +2,76 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Author: Rico Sennrich
|
||||
|
||||
# takes a file in the CoNLL dependency format (from the CoNLL-X shared task on dependency parsing; http://ilk.uvt.nl/conll/#dataformat )
|
||||
# and produces Moses XML format. Note that the structure is built based on fields 9 and 10 (projective HEAD and RELATION),
|
||||
# which not all parsers produce.
|
||||
"""
|
||||
Takes a file in the CoNLL dependency format (from the CoNLL-X shared task on
|
||||
dependency parsing; http://ilk.uvt.nl/conll/#dataformat ) and produces
|
||||
Moses XML format.
|
||||
|
||||
# usage: conll2mosesxml.py [--brackets] < input_file > output_file
|
||||
Note that the structure is built based on fields 9 and 10 (projective HEAD
|
||||
and RELATION), which not all parsers produce.
|
||||
|
||||
Usage: conll2mosesxml.py [--brackets] < input_file > output_file
|
||||
"""
|
||||
|
||||
from __future__ import print_function, unicode_literals
|
||||
import sys
|
||||
import re
|
||||
import codecs
|
||||
from collections import namedtuple,defaultdict
|
||||
from collections import (
|
||||
namedtuple,
|
||||
defaultdict,
|
||||
)
|
||||
from lxml import etree as ET
|
||||
|
||||
|
||||
Word = namedtuple('Word', ['pos','word','lemma','tag','head','func', 'proj_head', 'proj_func'])
|
||||
Word = namedtuple(
|
||||
'Word',
|
||||
['pos', 'word', 'lemma', 'tag', 'head', 'func', 'proj_head', 'proj_func'])
|
||||
|
||||
|
||||
def main(output_format='xml'):
|
||||
sentence = []
|
||||
|
||||
for line in sys.stdin:
|
||||
|
||||
# process sentence
|
||||
# Process sentence.
|
||||
if line == "\n":
|
||||
sentence.insert(0,[])
|
||||
sentence.insert(0, [])
|
||||
if is_projective(sentence):
|
||||
write(sentence,output_format)
|
||||
write(sentence, output_format)
|
||||
else:
|
||||
sys.stderr.write(' '.join(w.word for w in sentence[1:]) + '\n')
|
||||
sys.stderr.write(
|
||||
' '.join(w.word for w in sentence[1:]) + '\n')
|
||||
sys.stdout.write('\n')
|
||||
sentence = []
|
||||
continue
|
||||
|
||||
try:
|
||||
pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = line.split()
|
||||
except ValueError: # word may be unicode whitespace
|
||||
pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = re.split(' *\t*',line.strip())
|
||||
(
|
||||
pos,
|
||||
word,
|
||||
lemma,
|
||||
tag,
|
||||
tag2,
|
||||
morph,
|
||||
head,
|
||||
func,
|
||||
proj_head,
|
||||
proj_func,
|
||||
) = line.split()
|
||||
except ValueError: # Word may be unicode whitespace.
|
||||
(
|
||||
pos,
|
||||
word,
|
||||
lemma,
|
||||
tag,
|
||||
tag2,
|
||||
morph,
|
||||
head,
|
||||
func,
|
||||
proj_head,
|
||||
proj_func,
|
||||
) = re.split(' *\t*', line.strip())
|
||||
|
||||
word = escape_special_chars(word)
|
||||
lemma = escape_special_chars(lemma)
|
||||
@ -46,17 +80,20 @@ def main(output_format='xml'):
|
||||
proj_head = head
|
||||
proj_func = func
|
||||
|
||||
sentence.append(Word(int(pos), word, lemma, tag2,int(head), func, int(proj_head), proj_func))
|
||||
sentence.append(
|
||||
Word(
|
||||
int(pos), word, lemma, tag2, int(head), func, int(proj_head),
|
||||
proj_func))
|
||||
|
||||
|
||||
# this script performs the same escaping as escape-special-chars.perl in Moses.
|
||||
# most of it is done in function write(), but quotation marks need to be processed first
|
||||
# This script performs the same escaping as escape-special-chars.perl in
|
||||
# Moses. Most of it is done in function write(), but quotation marks need
|
||||
# to be processed first.
|
||||
def escape_special_chars(line):
|
||||
|
||||
line = line.replace('\'','&apos;') # xml
line = line.replace('"','&quot;') # xml
line = line.replace('[','&#91;') # syntax non-terminal
line = line.replace(']','&#93;') # syntax non-terminal
line = line.replace('\'', '&apos;') # xml
line = line.replace('"', '&quot;') # xml
line = line.replace('[', '&#91;') # syntax non-terminal
line = line.replace(']', '&#93;') # syntax non-terminal
|
||||
|
||||
return line
|
||||
|
||||
@ -64,7 +101,7 @@ def escape_special_chars(line):
|
||||
# make a check if structure is projective
|
||||
def is_projective(sentence):
|
||||
dominates = defaultdict(set)
|
||||
for i,w in enumerate(sentence):
|
||||
for i, w in enumerate(sentence):
|
||||
dominates[i].add(i)
|
||||
if not i:
|
||||
continue
|
||||
@ -77,7 +114,7 @@ def is_projective(sentence):
|
||||
|
||||
for i in dominates:
|
||||
dependents = dominates[i]
|
||||
if max(dependents) - min(dependents) != len(dependents)-1:
|
||||
if max(dependents) - min(dependents) != len(dependents) - 1:
|
||||
sys.stderr.write("error: non-projective structure.\n")
|
||||
return False
|
||||
return True
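Put differently, the check requires every head's transitive set of dependents to occupy a contiguous block of positions; a minimal made-up counterexample:

# 1-based heads: word 1 -> 3, word 2 -> root, word 3 -> 2
# dominates[3] == {1, 3}: max - min == 2 but the set has only 2 elements,
# so position 2 is skipped and the sentence is reported as non-projective.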
|
||||
@ -86,24 +123,28 @@ def is_projective(sentence):
|
||||
def write(sentence, output_format='xml'):
|
||||
|
||||
if output_format == 'xml':
|
||||
tree = create_subtree(0,sentence)
|
||||
out = ET.tostring(tree, encoding = 'UTF-8').decode('UTF-8')
|
||||
tree = create_subtree(0, sentence)
|
||||
out = ET.tostring(tree, encoding='UTF-8').decode('UTF-8')
|
||||
|
||||
if output_format == 'brackets':
|
||||
out = create_brackets(0,sentence)
|
||||
out = create_brackets(0, sentence)
|
||||
|
||||
out = out.replace('|','&#124;') # factor separator
out = out.replace('|', '&#124;') # factor separator

out = out.replace('&amp;apos;','&apos;') # lxml is buggy if input is escaped
out = out.replace('&amp;quot;','&quot;') # lxml is buggy if input is escaped
out = out.replace('&amp;#91;','&#91;') # lxml is buggy if input is escaped
out = out.replace('&amp;#93;','&#93;') # lxml is buggy if input is escaped
# lxml is buggy if input is escaped:
out = out.replace('&amp;apos;', '&apos;')
# lxml is buggy if input is escaped:
out = out.replace('&amp;quot;', '&quot;')
# lxml is buggy if input is escaped:
out = out.replace('&amp;#91;', '&#91;')
# lxml is buggy if input is escaped:
out = out.replace('&amp;#93;', '&#93;')
|
||||
|
||||
print(out)
|
||||
|
||||
# write node in Moses XML format
|
||||
def create_subtree(position, sentence):
|
||||
|
||||
def create_subtree(position, sentence):
|
||||
""""Write node in Moses XML format."""
|
||||
element = ET.Element('tree')
|
||||
|
||||
if position:
|
||||
@ -111,7 +152,7 @@ def create_subtree(position, sentence):
|
||||
else:
|
||||
element.set('label', 'sent')
|
||||
|
||||
for i in range(1,position):
|
||||
for i in range(1, position):
|
||||
if sentence[i].proj_head == position:
|
||||
element.append(create_subtree(i, sentence))
|
||||
|
||||
@ -144,7 +185,7 @@ def create_brackets(position, sentence):
|
||||
else:
|
||||
element = "[ sent "
|
||||
|
||||
for i in range(1,position):
|
||||
for i in range(1, position):
|
||||
if sentence[i].proj_head == position:
|
||||
element += create_brackets(i, sentence)
|
||||
|
||||
@ -167,7 +208,7 @@ def create_brackets(position, sentence):
|
||||
return element
|
||||
|
||||
if __name__ == '__main__':
|
||||
if sys.version_info < (3,0,0):
|
||||
if sys.version_info < (3, 0, 0):
|
||||
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
|
||||
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
|
||||
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
|
||||
|
@ -10,17 +10,21 @@ import codecs
|
||||
|
||||
from lxml import etree as ET
|
||||
|
||||
|
||||
def escape(word):
|
||||
word = word.replace('|','&#124;') # factor separator
word = word.replace('[','&#91;') # syntax non-terminal
word = word.replace(']','&#93;') # syntax non-terminal
word = word.replace('\'','&apos;')
word = word.replace('\"','&quot;')
# Factor separator:
word = word.replace('|', '&#124;')
# Syntax non-terminal:
word = word.replace('[', '&#91;')
# Syntax non-terminal:
word = word.replace(']', '&#93;')
word = word.replace('\'', '&apos;')
word = word.replace('\"', '&quot;')
|
||||
|
||||
return word
|
||||
|
||||
def make_brackets(xml):
|
||||
|
||||
def make_brackets(xml):
|
||||
out = ' [' + xml.get('label')
|
||||
|
||||
if xml.text and xml.text.strip():
|
||||
|