Fix some python lint.

I mainly used pocketlint, a very good Python linter, but also Syntastic,
a Vim plugin.  I didn't get anywhere near fixing all of Syntastic's
complaints, though.

Once I've cleaned up all (or at least most) of the Python lint, we can
start doing regular automated lint checks and keep the code clean.
This commit is contained in:
Jeroen Vermeulen 2015-05-16 14:58:03 +07:00
parent f1ed14eb33
commit 0ffe79579e
10 changed files with 629 additions and 481 deletions
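
One possible shape for the "regular automated lint checks" mentioned above is a small gate script that lints every tracked Python file and fails on any complaint. This is only a sketch, assuming pyflakes (or any linter that exits non-zero on problems) is installed; the script name and setup are hypothetical and not part of this commit.

# check_lint.py - hypothetical lint gate, not part of this commit.
import subprocess
import sys

def main():
    # Python files tracked by git.
    files = subprocess.check_output(
        ['git', 'ls-files', '*.py']).decode('utf8').split()
    if not files:
        return 0
    # pyflakes exits non-zero if it reports any problem.
    return subprocess.call(['pyflakes'] + files)

if __name__ == '__main__':
    sys.exit(main())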

View File

@ -2,12 +2,12 @@
"""
The Gacha filter cleans out sentence pairs that have global character mean
lower than a certain threshold.
Use this cleaner to produce low quantity of high quality sentence pairs.
It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)
This is inspired by the global character mean that is used in the Gale-Church
@ -27,8 +27,8 @@ USAGE:
$ python3 gacha_filter.py train.en train.de
Outputs to STDOUT tab-separated lines of the source and target sentence pairs.
You can simply cut the file after that.
$ python3 gacha_filter.py train.en train.de > train.en-de
$ cut -f1 train.en-de > train.clean.en
@ -37,21 +37,27 @@ You can simply cut the file after that.
You can also allow lower threshold to yield more lines:
$ python3 gacha_filter.py train.en train.de 0.05
Default threshold is set to 0.2.
"""
import io, subprocess
import io
import subprocess
red = '\033[01;31m'
native = '\033[m'
def err_msg(txt):
return red+txt+native
return red + txt + native
def num_char(filename):
return float(subprocess.Popen(["wc", "-m", filename],
stdout=subprocess.PIPE).stdout.read().split()[0])
return float(
subprocess.Popen(
["wc", "-m", filename],
stdout=subprocess.PIPE).stdout.read().split()[0])
def gacha_mean(sourcefile, targetfile):
"""
@ -60,35 +66,40 @@ def gacha_mean(sourcefile, targetfile):
"""
sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
c = num_char(sourcefile) / num_char(targetfile)
sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n'))
sys.stderr.write(err_msg('Gacha mean = ' + str(c) + '\n'))
sys.stderr.write(err_msg('Filtering starts ...\n'))
return c
def io_open(path):
"""Open text file at `path` as a read-only, with UTF-8 encoding."""
return io.open(path, 'r', encoding='utf8')
def main(sourcefile, targetfile, threshold=0.2):
# Calculates Gacha mean.
c = gacha_mean(sourcefile, targetfile)
# Calculates lower and upperbound for filtering
threshold = float(threshold)
lowerbound = (1-threshold) * c
upperbound = (1+threshold) * c
lowerbound = (1 - threshold) * c
upperbound = (1 + threshold) * c
# Start filtering sentences.
with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \
io.open(targetfile, 'r', encoding='utf8') as trgfin:
with io_open(sourcefile) as srcfin, io_open(targetfile) as trgfin:
for s, t in zip(srcfin, trgfin):
if lowerbound < len(s) / float(len(t)) < upperbound:
print(u"{}\t{}".format(s.strip(),t.strip()))
print(u"{}\t{}".format(s.strip(), t.strip()))
if __name__ == '__main__':
import sys
if len(sys.argv) not in range(3,5):
if len(sys.argv) not in range(3, 5):
usage_msg = err_msg('Usage: python3 %s srcfile trgfile (threshold)\n'
% sys.argv[0])
example_msg = err_msg('Example: python3 %s ~/Europarl.de-en.de '
'~/Europarl.de-en.en 0.4\n' % sys.argv[0])
sys.stderr.write(usage_msg)
sys.stderr.write(example_msg)
sys.exit(1)
main(*sys.argv[1:])
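
To make the filter above concrete: a sentence pair survives only if its own character-length ratio len(source) / len(target) lies strictly between (1 - threshold) * c and (1 + threshold) * c, where c is the corpus-wide character ratio computed by gacha_mean(). A toy, self-contained illustration with made-up numbers (not part of the commit):

# Hypothetical corpus-wide character ratio and the default threshold.
c = 1.1
threshold = 0.2
lowerbound = (1 - threshold) * c   # 0.88
upperbound = (1 + threshold) * c   # 1.32

pairs = [
    ('ein kurzer Satz .', 'a short sentence .'),   # ratio ~0.94 -> kept
    ('ja', 'that is a very long translation .'),   # ratio ~0.06 -> dropped
]
for s, t in pairs:
    if lowerbound < len(s) / float(len(t)) < upperbound:
        print('{}\t{}'.format(s.strip(), t.strip()))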

View File

@ -3,36 +3,50 @@ import sys
import numpy
import argparse
parser = argparse.ArgumentParser(description='Set input embedding of <null> token to weighted average of all input embeddings')
parser.add_argument("-p", "--nplm-python-path", type=str, dest="nplm_python_path", default='/mnt/gna0/rsennrich/tools/nplm/python')
parser.add_argument("-i", "--input-model", type=str, dest="input_model", required=True)
parser.add_argument("-o", "--output-model", type=str, dest="output_model", required=True)
parser.add_argument("-n", "--null-token-index", type=int, dest="null_idx", default=-1)
parser.add_argument("-t", "--training-ngrams", type=str, dest="training_ngrams", required=True)
parser = argparse.ArgumentParser(
description=(
"Set input embedding of <null> token to weighted average "
"of all input embeddings"))
parser.add_argument(
"-p", "--nplm-python-path", type=str, dest="nplm_python_path",
default='/mnt/gna0/rsennrich/tools/nplm/python')
parser.add_argument(
"-i", "--input-model", type=str, dest="input_model", required=True)
parser.add_argument(
"-o", "--output-model", type=str, dest="output_model", required=True)
parser.add_argument(
"-n", "--null-token-index", type=int, dest="null_idx", default=-1)
parser.add_argument(
"-t", "--training-ngrams", type=str, dest="training_ngrams",
required=True)
options = parser.parse_args()
sys.path.append(options.nplm_python_path)
import nplm
from collections import defaultdict
def load_model(model_file):
return nplm.NeuralLM.from_file(model_file)
def get_weights(path, length):
counter = [0]*length
counter = [0] * length
for line in open(path):
last_context = int(line.split()[-2])
counter[last_context] += 1
return counter
if __name__ == "__main__":
model = load_model(options.input_model)
if options.null_idx == -1:
options.null_idx = model.word_to_index_input['<null>']
sys.stderr.write('index of <null>: {0}\n'.format(options.null_idx))
weights = numpy.array(get_weights(options.training_ngrams, len(model.input_embeddings)))
model.input_embeddings[options.null_idx] = numpy.average(numpy.array(model.input_embeddings), weights=weights, axis=0)
model.to_file(open(options.output_model,'w'))
weights = numpy.array(
get_weights(options.training_ngrams, len(model.input_embeddings)))
model.input_embeddings[options.null_idx] = numpy.average(
numpy.array(model.input_embeddings), weights=weights, axis=0)
model.to_file(open(options.output_model, 'w'))
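
What the script computes for the <null> row is simply a frequency-weighted average over all input embedding rows, where the weights are how often each word occurred as the last context token in the training ngrams. A numpy-only sketch with toy numbers (independent of nplm, not part of the commit):

import numpy

# Toy stand-in for model.input_embeddings: 4 words, embedding size 3.
embeddings = numpy.array([
    [1.0, 0.0, 0.0],
    [0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0],
    [0.0, 0.0, 0.0],   # row for <null>, to be overwritten
])
# Occurrences of each word as the last context token.
counts = numpy.array([2, 1, 1, 0])

null_idx = 3
embeddings[null_idx] = numpy.average(embeddings, weights=counts, axis=0)
print(embeddings[null_idx])   # weighted mean: [0.5, 0.25, 0.25]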

View File

@ -1,9 +1,7 @@
#!/usr/bin/env python
from collections import Counter
import heapq
import logging
import optparse
import sys
LOG = logging.getLogger(__name__)
@ -12,26 +10,28 @@ BOS = "<s>"
EOS = "</s>"
UNK = "<unk>"
def replace_tags(tokens,tags,vocab):
for i,t in enumerate(tokens):
if not t in vocab:
if i < len(tags):
tokens[i] = tags[i]
else:
print "Error: missing tags for index i:", i
print ' '.join(tokens)
print ' '.join(tags)
tokens[i] = UNK
def replace_unks(tokens,vocab):
for i,t in enumerate(tokens):
if not t in vocab:
tokens[i] = UNK
def replace_tags(tokens, tags, vocab):
for i, t in enumerate(tokens):
if t not in vocab:
if i < len(tags):
tokens[i] = tags[i]
else:
print "Error: missing tags for index i:", i
print ' '.join(tokens)
print ' '.join(tags)
tokens[i] = UNK
def replace_unks(tokens, vocab):
for i, t in enumerate(tokens):
if t not in vocab:
tokens[i] = UNK
def numberize(line, m, n, svocab, tvocab):
line = line.split()
source_words = line[:2*m + 1]
source_words = line[:2 * m + 1]
target_words = line[-n:]
line = ' '.join([str(svocab[item]) for item in source_words]) + ' '
@ -40,7 +40,8 @@ def numberize(line, m, n, svocab, tvocab):
return line
def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,tlang, m, n, ofh):
def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,
tlang, m, n, ofh):
"""
m - source context
n - target context
@ -51,83 +52,87 @@ def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,tlang
sfh = open(corpus_stem + "." + slang)
tfh = open(corpus_stem + "." + tlang)
afh = open(align_file)
fhs = [sfh,tfh,afh]
fhs = [sfh, tfh, afh]
if tagged_stem:
fhs.append(open(tagged_stem + "." + slang))
fhs.append(open(tagged_stem + "." + tlang))
count = 0
ngrams = 0
LOG.info("Extracting ngrams")
for lines in zip(*fhs):
stokens = lines[0][:-1].split()
ttokens = lines[1][:-1].split()
stokens.append(EOS)
ttokens.append(EOS)
if tagged_stem:
stags = lines[3][:-1].split()
ttags = lines[4][:-1].split()
stags.append(EOS)
ttags.append(EOS)
tags.update(stags)
tags.update(ttags)
replace_tags(stokens,stags,svocab)
replace_tags(ttokens,ttags,tvocab)
else:
replace_unks(stokens,svocab)
replace_unks(ttokens,tvocab)
# list aligns for each target
# Note: align specifies source -> target
target_aligns = [[] for t in range(len(ttokens))]
for atoken in lines[2][:-1].split():
spos,tpos = atoken.split("-")
spos,tpos = int(spos), int(tpos)
target_aligns[tpos].append(spos)
#EOS alignment
target_aligns[-1] = [len(stokens)-1]
for lines in zip(*fhs):
stokens = lines[0][:-1].split()
ttokens = lines[1][:-1].split()
stokens.append(EOS)
ttokens.append(EOS)
if tagged_stem:
stags = lines[3][:-1].split()
ttags = lines[4][:-1].split()
stags.append(EOS)
ttags.append(EOS)
tags.update(stags)
tags.update(ttags)
replace_tags(stokens, stags, svocab)
replace_tags(ttokens, ttags, tvocab)
else:
replace_unks(stokens, svocab)
replace_unks(ttokens, tvocab)
# List aligns for each target.
# Note: align specifies source -> target
target_aligns = [[] for t in range(len(ttokens))]
for atoken in lines[2][:-1].split():
spos, tpos = atoken.split("-")
spos, tpos = int(spos), int(tpos)
target_aligns[tpos].append(spos)
for tpos,spos_list in enumerate(target_aligns):
# Affiliation heuristics - see Devlin et al. p1371
if not spos_list:
#tpos has no alignment, look right, then left, then right-right, then left-left etc
rpos = tpos+1
lpos = tpos-1
while rpos < len(ttokens) or lpos >= 0:
if rpos < len(ttokens) and target_aligns[rpos]:
spos_list = target_aligns[rpos]
break
if lpos >= 0 and target_aligns[lpos]:
spos_list = target_aligns[lpos]
break
rpos += 1
lpos -= 1
# EOS alignment.
target_aligns[-1] = [len(stokens) - 1]
if not spos_list:
raise Exception("No alignments in sentence \nSRC: " + lines[0][:-1] + "\nTGT: " + lines[1][:-1])
midpos = (len(spos_list)-1) / 2
spos = sorted(spos_list)[midpos]
for tpos, spos_list in enumerate(target_aligns):
# Affiliation heuristics - see Devlin et al. p1371
if not spos_list:
# tpos has no alignment, look right, then left, then
# right-right, then left-left etc.
rpos = tpos + 1
lpos = tpos - 1
while rpos < len(ttokens) or lpos >= 0:
if rpos < len(ttokens) and target_aligns[rpos]:
spos_list = target_aligns[rpos]
break
if lpos >= 0 and target_aligns[lpos]:
spos_list = target_aligns[lpos]
break
rpos += 1
lpos -= 1
if not spos_list:
raise Exception(
"No alignments in sentence \nSRC: " +
lines[0][:-1] + "\nTGT: " + lines[1][:-1])
midpos = (len(spos_list) - 1) / 2
spos = sorted(spos_list)[midpos]
# source-context, target-context, predicted word
for i in range(max(0,m-spos)):
print>>ofh, BOS,
#print [spos-m/2,spos+m/2+1], stokens[spos-m/2:spos+m/2+1]
print>>ofh, " ".join([s for s in stokens[max(0,spos-m):spos+m+1]]),
for i in range(max(0,spos+m+1-len(stokens))):
print>>ofh, EOS,
for i in range(max(0,n-(tpos+1))):
print>>ofh, BOS,
print>>ofh, " ".join([t for t in ttokens[max(0,tpos+1-n):tpos+1]]),
print>>ofh
ngrams += 1
# source-context, target-context, predicted word
for i in range(max(0, m - spos)):
print>>ofh, BOS,
# print [spos-m/2,spos+m/2+1], stokens[spos-m/2:spos+m/2+1]
print>>ofh, " ".join(
[s for s in stokens[max(0, spos - m):spos + m + 1]]),
for i in range(max(0, spos + m + 1 - len(stokens))):
print>>ofh, EOS,
for i in range(max(0, n - (tpos + 1))):
print>>ofh, BOS,
print>>ofh, " ".join(
[t for t in ttokens[max(0, tpos + 1 - n):tpos + 1]]),
print>>ofh
ngrams += 1
count += 1
if count % 1000 == 0: sys.stderr.write(".")
if count % 50000 == 0: sys.stderr.write(" [%d]\n" % count)
count += 1
if count % 1000 == 0:
sys.stderr.write(".")
if count % 50000 == 0:
sys.stderr.write(" [%d]\n" % count)
ofh.close()
sys.stderr.write("\n")
LOG.info("Extracted %d ngrams" % ngrams)
return tags
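
The affiliation heuristic in the loop above is the one subtle step: a target position with no alignment borrows the alignment of its nearest aligned neighbour, trying the right neighbour before the left one at each distance. A self-contained sketch of just that search (the helper name is hypothetical; the logic mirrors the inline loop):

def borrow_alignment(target_aligns, tpos):
    """Borrow an alignment for tpos from the nearest aligned neighbour,
    looking right first, then left, then further out."""
    spos_list = target_aligns[tpos]
    rpos, lpos = tpos + 1, tpos - 1
    while not spos_list and (rpos < len(target_aligns) or lpos >= 0):
        if rpos < len(target_aligns) and target_aligns[rpos]:
            spos_list = target_aligns[rpos]
        elif lpos >= 0 and target_aligns[lpos]:
            spos_list = target_aligns[lpos]
        rpos += 1
        lpos -= 1
    return spos_list

# Position 2 is unaligned; its right neighbour (3) is aligned, so it borrows [2].
print(borrow_alignment([[0], [], [], [2]], 2))
# Position 1 is unaligned and so is its right neighbour, so it falls back
# to the left neighbour (0) and borrows [0].
print(borrow_alignment([[0], [], [], [2]], 1))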

View File

@ -1,8 +1,7 @@
#!/usr/bin/env python
#
# Create a test corpus, using a previously pruned vocabulary.
#
"""Create a test corpus, using a previously pruned vocabulary."""
import logging
import optparse
@ -12,72 +11,84 @@ import sys
import extract
def read_vocab(filename, offset=0):
vocab = {}
for i, line in enumerate(open(filename)):
vocab[line.strip()] = i+offset
return vocab, i+offset
vocab = {}
for i, line in enumerate(open(filename)):
vocab[line.strip()] = i + offset
return vocab, i + offset
def main():
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = optparse.OptionParser("%prog [options]")
parser.add_option("-e", "--target-language", type="string", dest="target_language")
parser.add_option("-f", "--source-language", type="string", dest="source_language")
parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
parser.add_option("-a", "--align", type="string", dest="align_file")
parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
logging.basicConfig(
format='%(asctime)s %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = optparse.OptionParser("%prog [options]")
parser.add_option(
"-e", "--target-language", type="string", dest="target_language")
parser.add_option(
"-f", "--source-language", type="string", dest="source_language")
parser.add_option(
"-c", "--corpus", type="string", dest="corpus_stem")
parser.add_option(
"-t", "--tagged-corpus", type="string", dest="tagged_stem")
parser.add_option(
"-a", "--align", type="string", dest="align_file")
parser.add_option(
"-w", "--working-dir", type="string", dest="working_dir")
parser.set_defaults(
target_language="en",
source_language="de",
corpus_stem="test",
align_file="test.align",
working_dir="working")
options, args = parser.parse_args(sys.argv)
if not os.path.exists(options.working_dir):
raise Exception(
"Working directory '%s' not found" % options.working_dir)
parser.set_defaults(
target_language = "en",
source_language = "de",
corpus_stem = "test",
align_file = "test.align",
working_dir = "working",
)
options,args = parser.parse_args(sys.argv)
if not os.path.exists(options.working_dir):
LOG.error("Working directory '%s' not found" % working_dir)
sys.exit(1)
m, n = None, None
for line in open(options.working_dir + "/info"):
name, value = line[:-1].split()
if name == "m":
m = int(value)
if name == "n":
n = int(value)
if m is None or n is None:
raise Exception("Info file is incomplete.")
m,n = None,None
for line in open(options.working_dir + "/info"):
name,value = line[:-1].split()
if name == "m": m = int(value)
if name == "n": n = int(value)
if m == None or n == None:
LOG.error("info file is incomplete")
sys.exit(1)
tvocab, offset = read_vocab(options.working_dir + "/vocab.target")
svocab, offset = read_vocab(
options.working_dir + "/vocab.source", offset + 1)
tvocab, offset = read_vocab(options.working_dir + "/vocab.target")
svocab, offset = read_vocab(options.working_dir + "/vocab.source", offset+1)
file_stem = os.path.basename(options.corpus_stem)
ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
extract.get_ngrams(
options.corpus_stem,
options.align_file,
options.tagged_stem,
svocab,
tvocab,
options.source_language,
options.target_language,
m,
n,
ofh)
file_stem = os.path.basename(options.corpus_stem)
ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
extract.get_ngrams(options.corpus_stem,
options.align_file,
options.tagged_stem,
svocab,
tvocab,
options.source_language,
options.target_language,
m,
n,
ofh)
numberized_file = options.working_dir + "/" + file_stem + ".numberized"
ngrams_file_handle = open(
os.path.join(options.working_dir, file_stem + ".ngrams"), 'r')
numberized_file_handle = open(numberized_file, 'w')
numberized_file = options.working_dir + "/" + file_stem + ".numberized"
ngrams_file_handle = open(options.working_dir + "/" + file_stem + ".ngrams", 'r')
numberized_file_handle = open(numberized_file, 'w')
#Numberize the file
for line in ngrams_file_handle:
numberized_file_handle.write(extract.numberize(line, m, n, svocab, tvocab))
numberized_file_handle.close()
ngrams_file_handle.close()
# Numberize the file.
for line in ngrams_file_handle:
numberized_file_handle.write(extract.numberize(
line, m, n, svocab, tvocab))
numberized_file_handle.close()
ngrams_file_handle.close()
if __name__ == "__main__":
main()
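
One detail worth spelling out from the two read_vocab calls: target-vocabulary ids start at 0 and source-vocabulary ids start right after the last target id, so the numberized ngrams can mix both vocabularies without collisions. A toy illustration of that convention (in-memory word lists instead of vocab files; the helper is hypothetical but mirrors read_vocab):

def number_words(words, offset=0):
    # Same bookkeeping as read_vocab, but over a list instead of a file.
    vocab = {}
    for i, word in enumerate(words):
        vocab[word] = i + offset
    return vocab, i + offset

tvocab, offset = number_words(['<unk>', '<null>', 'house'])    # ids 0, 1, 2
svocab, offset = number_words(['<unk>', 'haus'], offset + 1)   # ids 3, 4
print(tvocab['house'], svocab['haus'])   # 2 4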

View File

@ -11,145 +11,160 @@ import extract
LOG = logging.getLogger(__name__)
def get_pruned_vocab(corpus,prune):
counts = Counter()
LOG.info("Reading vocabulary from %s" % corpus)
lines = 0
for line in open(corpus):
for token in line[:-1].split():
counts[token] += 1
lines += 1
if lines % 1000 == 0: sys.stderr.write(".")
if lines % 50000 == 0: sys.stderr.write(" [%d]\n" % lines)
sys.stderr.write("\n")
counts[extract.BOS] += lines
counts[extract.EOS] += lines
LOG.info("Vocabulary size: %d" % len(counts))
if prune:
return Counter(dict(counts.most_common(prune)))
else:
return counts
def get_pruned_vocab(corpus, prune):
counts = Counter()
LOG.info("Reading vocabulary from %s" % corpus)
lines = 0
for line in open(corpus):
for token in line[:-1].split():
counts[token] += 1
lines += 1
if lines % 1000 == 0:
sys.stderr.write(".")
if lines % 50000 == 0:
sys.stderr.write(" [%d]\n" % lines)
sys.stderr.write("\n")
counts[extract.BOS] += lines
counts[extract.EOS] += lines
LOG.info("Vocabulary size: %d" % len(counts))
if prune:
return Counter(dict(counts.most_common(prune)))
else:
return counts
def save_vocab(directory, filename, vocab):
fh = open(directory + "/" + filename, "w")
for word in vocab:
print>>fh, word
def main():
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = optparse.OptionParser("%prog [options]")
parser.add_option("-e", "--target-language", type="string", dest="target_language")
parser.add_option("-f", "--source-language", type="string", dest="source_language")
parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
parser.add_option("-a", "--align", type="string", dest="align_file")
parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
parser.add_option("-n", "--target-context", type="int", dest="n")
parser.add_option("-m", "--source-context", type="int", dest="m")
parser.add_option("-s", "--prune-source-vocab", type="int", dest="sprune")
parser.add_option("-p", "--prune-target-vocab", type="int", dest="tprune")
logging.basicConfig(
format='%(asctime)s %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = optparse.OptionParser("%prog [options]")
parser.add_option(
"-e", "--target-language", type="string", dest="target_language")
parser.add_option(
"-f", "--source-language", type="string", dest="source_language")
parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
parser.add_option(
"-t", "--tagged-corpus", type="string", dest="tagged_stem")
parser.add_option("-a", "--align", type="string", dest="align_file")
parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
parser.add_option("-n", "--target-context", type="int", dest="n")
parser.add_option("-m", "--source-context", type="int", dest="m")
parser.add_option("-s", "--prune-source-vocab", type="int", dest="sprune")
parser.add_option("-p", "--prune-target-vocab", type="int", dest="tprune")
parser.set_defaults(
target_language="en",
source_language="de",
corpus_stem="train.10k",
align_file="train.10k.align",
n=5,
m=4,
working_dir="working",
sprune=16000,
tprune=16000
)
options, args = parser.parse_args(sys.argv)
parser.set_defaults(
target_language = "en",
source_language = "de",
corpus_stem = "train.10k",
align_file = "train.10k.align",
n = 5,
m = 4,
working_dir = "working",
sprune=16000,
tprune=16000
)
options,args = parser.parse_args(sys.argv)
if not os.path.exists(options.working_dir):
os.makedirs(options.working_dir)
else:
LOG.warn("Directory %s already exists, re-using" % options.working_dir)
info_file = options.working_dir + "/info"
if os.path.exists(info_file):
for line in open(info_file):
name, value = line[:-1].split()
n_mismatch = (name == 'n' and int(value) != options.n)
m_mismatch = (name == 'm' and int(value) != options.m)
if n_mismatch or m_mismatch:
LOG.error(
"info file exists, but parameters do not match. "
"Delete working directory and rerun.")
sys.exit(1)
else:
ifh = open(info_file, "w")
print>>ifh, "m", options.m
print>>ifh, "n", options.n
ifh.close()
info_file = options.working_dir + "/info"
if os.path.exists(info_file):
for line in open(info_file):
name,value = line[:-1].split()
if name == "n" and int(value) != options.n or \
name == "m" and int(value) != options.m:
LOG.error("info file exists, but parameters do not match. Delete working directory and rerun")
sys.exit(1)
else:
ifh = open(info_file,"w")
print>>ifh,"m",options.m
print>>ifh,"n",options.n
ifh.close()
scorpus = options.corpus_stem + "." + options.source_language
tcorpus = options.corpus_stem + "." + options.target_language
tvocab, svocab = None, None
# Extract vocabulary, and prune, if required.
svocab = get_pruned_vocab(scorpus, options.sprune)
tvocab = get_pruned_vocab(tcorpus, options.tprune)
tvocab,svocab = None,None
# Extract vocabulary, and prune, if required
svocab = get_pruned_vocab(scorpus,options.sprune)
tvocab = get_pruned_vocab(tcorpus,options.tprune)
file_stem = os.path.basename(options.corpus_stem)
ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
ofh = open(ngram_file, "w")
tags = extract.get_ngrams(
options.corpus_stem,
options.align_file,
options.tagged_stem,
svocab,
tvocab,
options.source_language,
options.target_language,
options.m,
options.n,
ofh)
file_stem = os.path.basename(options.corpus_stem)
ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
ofh = open(ngram_file, "w")
tags = extract.get_ngrams(options.corpus_stem,
options.align_file,
options.tagged_stem,
svocab,
tvocab,
options.source_language,
options.target_language,
options.m,
options.n,
ofh)
# Save vocabularies.
del svocab["<null>"]
del tvocab["<null>"]
del svocab["<unk>"]
del tvocab["<unk>"]
svocab_list = [item[0] for item in svocab.most_common()]
tvocab_list = [item[0] for item in tvocab.most_common()]
# UNK is always the first vocabulary element. Make sure
# it appears in position 0
# We need to use <null> token in the chart decoder in order
# to correctly estimate the probabilities of incomplete subphrases
# that are not sentence initial.
tvocab_list.insert(0, "<null>")
tvocab_list.insert(0, "<unk>")
svocab_list.insert(0, "<unk>")
# Get tags:
tag_list = [item[0] for item in tags.most_common()]
svocab_list = svocab_list + tag_list
tvocab_list = tvocab_list + tag_list
save_vocab(options.working_dir, "vocab.source", svocab_list)
save_vocab(options.working_dir, "vocab.target", tvocab_list)
# Create vocab dictionaries that map word to ID.
tvocab_idmap = {}
for i in range(len(tvocab_list)):
tvocab_idmap[tvocab_list[i]] = i
svocab_idmap = {}
for i in range(len(svocab_list)):
svocab_idmap[svocab_list[i]] = i + len(tvocab_idmap)
numberized_file = options.working_dir + "/" + file_stem + ".numberized"
ngrams_file_handle = open(ngram_file, 'r')
numberized_file_handle = open(numberized_file, 'w')
# Numberize the file.
for line in ngrams_file_handle:
numberized_file_handle.write(
extract.numberize(
line, options.m, options.n, svocab_idmap, tvocab_idmap))
numberized_file_handle.close()
ngrams_file_handle.close()
#Numberize the file
for line in ngrams_file_handle:
numberized_file_handle.write(extract.numberize(line, options.m, options.n, svocab_idmap, tvocab_idmap))
numberized_file_handle.close()
ngrams_file_handle.close()
if __name__ == "__main__":
main()
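
get_pruned_vocab above shows the whole pruning recipe: count every token with collections.Counter, credit <s> and </s> once per line, and keep only the `prune` most frequent entries. A toy run of the same steps on an in-memory corpus (made-up sentences, not part of the commit):

from collections import Counter

corpus = [
    "the cat sat on the mat",
    "the cat ran",
]
counts = Counter()
for line in corpus:
    for token in line.split():
        counts[token] += 1
counts["<s>"] += len(corpus)
counts["</s>"] += len(corpus)

# Keep only the 4 most frequent entries, as a Counter again.
pruned = Counter(dict(counts.most_common(4)))
print(sorted(pruned.items()))
# [('</s>', 2), ('<s>', 2), ('cat', 2), ('the', 3)]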

View File

@ -1,6 +1,10 @@
#!/usr/bin/env python3
"""Reduces an ngrams file for training nplm to a smaller version of it with less ngrams"""
"""Reduces an ngrams file for training nplm to a smaller version of it.
The smaller version will have fewer ngrams.
"""
from sys import argv
if len(argv) != 5:
@ -15,11 +19,11 @@ NGRAMS = int(argv[4])
for line in INFILE:
line = line.split()
line = line[START_IDX:START_IDX+NGRAMS]
line = line[START_IDX:START_IDX + NGRAMS]
linetowrite = ""
for token in line:
linetowrite = linetowrite + token + " "
#Strip final empty space and add newline
# Strip final empty space and add newline.
linetowrite = linetowrite[:-1]
linetowrite = linetowrite + '\n'
OUTFILE.write(linetowrite)
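
The reduction itself is just a column slice: each output line keeps NGRAMS tokens of the input line, starting at START_IDX. For example:

line = "w1 w2 w3 w4 w5 w6".split()
start_idx, ngrams = 1, 3
print(" ".join(line[start_idx:start_idx + ngrams]))   # w2 w3 w4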

View File

@ -7,51 +7,71 @@ import sys
def main():
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = optparse.OptionParser("%prog [options]")
parser.add_option("-w", "--working-dir", dest="working_dir")
parser.add_option("-c", "--corpus", dest="corpus_stem")
parser.add_option("-r", "--train-corpus", dest="train_stem")
parser.add_option("-l", "--nplm-home", dest="nplm_home")
parser.add_option("-e", "--epoch", dest="epoch", type="int")
parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
parser.add_option("-t", "--threads", dest="threads", type="int")
logging.basicConfig(
format='%(asctime)s %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = optparse.OptionParser("%prog [options]")
parser.add_option("-w", "--working-dir", dest="working_dir")
parser.add_option("-c", "--corpus", dest="corpus_stem")
parser.add_option("-r", "--train-corpus", dest="train_stem")
parser.add_option("-l", "--nplm-home", dest="nplm_home")
parser.add_option("-e", "--epoch", dest="epoch", type="int")
parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
parser.add_option(
"-b", "--minibatch-size", dest="minibatch_size", type="int")
parser.add_option("-t", "--threads", dest="threads", type="int")
parser.set_defaults(
working_dir = "working"
,corpus_stem = "test"
,train_stem = "train.10k"
,nplm_home = "/home/bhaddow/tools/nplm"
,epoch=10
,ngram_size = 14
,minibatch_size=1000
,threads=8
)
parser.set_defaults(
working_dir="working",
corpus_stem="test",
train_stem="train.10k",
nplm_home="/home/bhaddow/tools/nplm",
epoch=10,
ngram_size=14,
minibatch_size=1000,
threads=8)
options,args = parser.parse_args(sys.argv)
options, _ = parser.parse_args(sys.argv)
model_prefix = options.working_dir + "/" + options.train_stem + ".model.nplm"
model_file = model_prefix + "." + str(options.epoch)
test_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
prep_file = options.working_dir + "/" + options.corpus_stem + ".prepared"
vocab_file = options.working_dir + "/vocab"
model_prefix = (
options.working_dir + "/" + options.train_stem + ".model.nplm")
model_file = model_prefix + "." + str(options.epoch)
test_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
prep_file = options.working_dir + "/" + options.corpus_stem + ".prepared"
vocab_file = options.working_dir + "/vocab"
#TODO: Get ngram size from info file.
prep_args = [options.nplm_home + "/src/prepareNeuralLM", "--train_text", test_file, "--ngram_size",
str(options.ngram_size), "--ngramize", "0", "--words_file", vocab_file, "--train_file", prep_file]
ret = subprocess.call(prep_args)
if ret: raise Exception("Preparation failed")
# TODO: Get ngram size from info file.
prep_args = [
options.nplm_home + "/src/prepareNeuralLM",
"--train_text", test_file,
"--ngram_size", str(options.ngram_size),
"--ngramize", "0",
"--words_file", vocab_file,
"--train_file", prep_file,
]
ret = subprocess.call(prep_args)
if ret:
raise Exception("Preparation failed")
test_args = [options.nplm_home + "/src/testNeuralNetwork", "--test_file", prep_file, "--model_file",
model_file , "--minibatch_size", str(options.minibatch_size), "--num_threads", str(options.threads)]
ret = subprocess.call(test_args)
if ret: raise Exception("Testing failed")
test_args = [
options.nplm_home + "/src/testNeuralNetwork",
"--test_file", prep_file,
"--model_file", model_file,
"--minibatch_size", str(options.minibatch_size),
"--num_threads", str(options.threads),
]
ret = subprocess.call(test_args)
if ret:
raise Exception("Testing failed")
#$ROOT/src/prepareNeuralLM --train_text $TESTFILE1 --ngram_size $NGRAM_SIZE --ngramize 1 --vocab_size $INPUT_VOCAB_SIZE --words_file $WORKDIR/words --train_file $WORKDIR/ref.ngrams || exit 1
# $ROOT/src/prepareNeuralLM --train_text $TESTFILE1 \
# --ngram_size $NGRAM_SIZE --ngramize 1 --vocab_size $INPUT_VOCAB_SIZE \
# --words_file $WORKDIR/words --train_file $WORKDIR/ref.ngrams || exit 1
# $ROOT/src/testNeuralNetwork --test_file $WORKDIR/ref.ngrams \
# --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE \
# --num_threads $THREADS || exit 1
#$ROOT/src/testNeuralNetwork --test_file $WORKDIR/ref.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
if __name__ == "__main__":
main()

View File

@ -8,7 +8,9 @@ import subprocess
import sys
import os
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
logging.basicConfig(
format='%(asctime)s %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = argparse.ArgumentParser()
parser.add_argument("-w", "--working-dir", dest="working_dir")
parser.add_argument("-c", "--corpus", dest="corpus_stem")
@ -18,8 +20,10 @@ parser.add_argument("-n", "--ngram-size", dest="ngram_size", type=int)
parser.add_argument("-b", "--minibatch-size", dest="minibatch_size", type=int)
parser.add_argument("-s", "--noise", dest="noise", type=int)
parser.add_argument("-d", "--hidden", dest="hidden", type=int)
parser.add_argument("-i", "--input-embedding", dest="input_embedding", type=int)
parser.add_argument("-o", "--output-embedding", dest="output_embedding", type=int)
parser.add_argument(
"-i", "--input-embedding", dest="input_embedding", type=int)
parser.add_argument(
"-o", "--output-embedding", dest="output_embedding", type=int)
parser.add_argument("-t", "--threads", dest="threads", type=int)
parser.add_argument("-m", "--output-model", dest="output_model")
parser.add_argument("-r", "--output-dir", dest="output_dir")
@ -35,94 +39,109 @@ parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int)
parser.set_defaults(
working_dir = "working"
,corpus_stem = "train.10k"
,nplm_home = "/home/bhaddow/tools/nplm"
,epochs = 10
,ngram_size = 14
,minibatch_size=1000
,noise=100
,hidden=750
,input_embedding=150
,output_embedding=150
,threads=1
,output_model = "train.10k"
,output_dir = None
,config_options_file = "config"
,log_file = "log"
,validation_file = None
,activation_fn = "rectifier"
,learning_rate = 1
,input_words_file = None
,output_words_file = None
,input_vocab_size = 0
,output_vocab_size = 0
working_dir="working",
corpus_stem="train.10k",
nplm_home="/home/bhaddow/tools/nplm",
epochs=10,
ngram_size=14,
minibatch_size=1000,
noise=100,
hidden=750,
input_embedding=150,
output_embedding=150,
threads=1,
output_model="train.10k",
output_dir=None,
config_options_file="config",
log_file="log",
validation_file=None,
activation_fn="rectifier",
learning_rate=1,
input_words_file=None,
output_words_file=None,
input_vocab_size=0,
output_vocab_size=0
)
def main(options):
vocab_command = []
if options.input_words_file is not None:
vocab_command += ['--input_words_file', options.input_words_file]
if options.output_words_file is not None:
vocab_command += ['--output_words_file', options.output_words_file]
if options.input_vocab_size:
vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
if options.output_vocab_size:
vocab_command += ['--output_vocab_size', str(options.output_vocab_size)]
vocab_command = []
if options.input_words_file is not None:
vocab_command += ['--input_words_file', options.input_words_file]
if options.output_words_file is not None:
vocab_command += ['--output_words_file', options.output_words_file]
if options.input_vocab_size:
vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
if options.output_vocab_size:
vocab_command += [
'--output_vocab_size', str(options.output_vocab_size)]
# Set up validation command variable to use with validation set.
validations_command = []
if options.validation_file is not None:
validations_command =["--validation_file", (options.validation_file + ".numberized")]
# Set up validation command variable to use with validation set.
validations_command = []
if options.validation_file is not None:
validations_command = [
"--validation_file", (options.validation_file + ".numberized")]
# In order to allow for different models to be trained after the same
# preparation step, we should provide an option for multiple output directories
# If we have not set output_dir, set it to the same thing as the working dir
# In order to allow for different models to be trained after the same
# preparation step, we should provide an option for multiple output
# directories.
# If we have not set output_dir, set it to the same thing as the working
# dir.
if options.output_dir is None:
options.output_dir = options.working_dir
else:
# Create output dir if necessary
if not os.path.exists(options.output_dir):
os.makedirs(options.output_dir)
if options.output_dir is None:
options.output_dir = options.working_dir
else:
# Create output dir if necessary
if not os.path.exists(options.output_dir):
os.makedirs(options.output_dir)
config_file = os.path.join(options.output_dir, options.config_options_file + '-' + options.output_model)
log_file = os.path.join(options.output_dir, options.log_file + '-' + options.output_model)
log_file_write = open(log_file, 'w')
config_file_write = open(config_file, 'w')
config_file = os.path.join(
options.output_dir,
options.config_options_file + '-' + options.output_model)
log_file = os.path.join(
options.output_dir, options.log_file + '-' + options.output_model)
log_file_write = open(log_file, 'w')
config_file_write = open(config_file, 'w')
config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
in_file = os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + ".numberized")
in_file = os.path.join(
options.working_dir,
os.path.basename(options.corpus_stem) + ".numberized")
model_prefix = os.path.join(options.output_dir, options.output_model + ".model.nplm")
train_args = [options.nplm_home + "/src/trainNeuralNetwork",
"--train_file", in_file,
"--num_epochs", str(options.epochs),
"--model_prefix", model_prefix,
"--learning_rate", str(options.learning_rate),
"--minibatch_size", str(options.minibatch_size),
"--num_noise_samples", str(options.noise),
"--num_hidden", str(options.hidden),
"--input_embedding_dimension", str(options.input_embedding),
"--output_embedding_dimension", str(options.output_embedding),
"--num_threads", str(options.threads),
"--activation_function", options.activation_fn] + validations_command + vocab_command
print("Train model command: ")
print(', '.join(train_args))
model_prefix = os.path.join(
options.output_dir, options.output_model + ".model.nplm")
train_args = [
options.nplm_home + "/src/trainNeuralNetwork",
"--train_file", in_file,
"--num_epochs", str(options.epochs),
"--model_prefix", model_prefix,
"--learning_rate", str(options.learning_rate),
"--minibatch_size", str(options.minibatch_size),
"--num_noise_samples", str(options.noise),
"--num_hidden", str(options.hidden),
"--input_embedding_dimension", str(options.input_embedding),
"--output_embedding_dimension", str(options.output_embedding),
"--num_threads", str(options.threads),
"--activation_function",
options.activation_fn,
] + validations_command + vocab_command
print("Train model command: ")
print(', '.join(train_args))
config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
config_file_write.close()
log_file_write.write("Training output:\n")
ret = subprocess.call(train_args, stdout=log_file_write, stderr=log_file_write)
if ret:
raise Exception("Training failed")
log_file_write.write("Training output:\n")
ret = subprocess.call(
train_args, stdout=log_file_write, stderr=log_file_write)
if ret:
raise Exception("Training failed")
log_file_write.close()
if __name__ == "__main__":
options = parser.parse_args()
main(options)

View File

@ -2,15 +2,27 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich <sennrich [AT] cl.uzh.ch>
# This script creates tables that store phrase pair frequencies rather than probabilities.
# These count tables can be used for a delayed, online computation of the original phrase translation features
# The benefit is that models can be combined quickly, with the same results as if we trained a model on the concatenation of all data (excepting differences in word alignment).
# Also, each model can be given a weight, which is applied to all frequencies of the model for the combination.
# This script creates tables that store phrase pair frequencies rather than
# probabilities.
#
# These count tables can be used for a delayed, online computation of the
# original phrase translation features.
#
# The benefit is that models can be combined quickly, with the same results
# as if we trained a model on the concatenation of all data (excepting
# differences in word alignment).
#
# Also, each model can be given a weight, which is applied to all frequencies
# of the model for the combination.
# Note: the input phrase table must have alignment information;
# it must be unsmoothed;
# additionally, the phrase table type PhraseDictionaryMultiModelCounts requires the lexical counts files lex.counts.e2f and lex.counts.f2e (obtained by using the option --write-lexical-counts in train-model.perl)
# The results may differ from training on the concatenation of all data due to differences in word alignment, and rounding errors.
# additionally, the phrase table type PhraseDictionaryMultiModelCounts
# requires the lexical counts files lex.counts.e2f and lex.counts.f2e
# (obtained by using the option --write-lexical-counts in
# train-model.perl)
# The results may differ from training on the concatenation of all data due
# to differences in word alignment, and rounding errors.
from __future__ import unicode_literals
@ -21,11 +33,15 @@ from tempfile import NamedTemporaryFile
from subprocess import Popen, PIPE
if len(sys.argv) < 3 or len(sys.argv) > 4:
sys.stderr.write('Usage: ' + sys.argv[0] + ' in_file out_path [prune_count]\nThis script will create the files out_path/count-table.gz and out_path/count-table-target.gz\n')
sys.stderr.write(
'Usage: ' +
sys.argv[0] + " in_file out_path [prune_count]\n"
"This script will create the files out_path/count-table.gz and "
"out_path/count-table-target.gz\n")
exit()
def handle_file(filename,action,fileobj=None,mode='r'):
def handle_file(filename, action, fileobj=None, mode='r'):
"""support reading either from stdin, plain file or gzipped file"""
if action == 'open':
@ -33,21 +49,23 @@ def handle_file(filename,action,fileobj=None,mode='r'):
if mode == 'r':
mode = 'rb'
if mode == 'rb' and not filename == '-' and not os.path.exists(filename):
if os.path.exists(filename+'.gz'):
filename = filename+'.gz'
if mode == 'rb' and filename != '-' and not os.path.exists(filename):
if os.path.exists(filename + '.gz'):
filename = filename + '.gz'
else:
sys.stderr.write('Error: unable to open file. ' + filename + ' - aborting.\n')
sys.stderr.write(
"Error: unable to open file. " +
filename + " - aborting.\n")
exit()
if filename.endswith('.gz'):
fileobj = gzip.open(filename,mode)
fileobj = gzip.open(filename, mode)
elif filename == '-':
fileobj = sys.stdin
else:
fileobj = open(filename,mode)
fileobj = open(filename, mode)
return fileobj
@ -59,10 +77,13 @@ def sort_and_uniq(infile, outfile):
cmd = ['sort', infile]
fobj = handle_file(outfile, 'open', mode='w')
sys.stderr.write('Executing: LC_ALL=C ' + ' '.join(cmd) + ' | uniq | gzip -c > ' + outfile + '\n')
p_sort = Popen(cmd, env={'LC_ALL':'C'}, stdout=PIPE)
p_uniq = Popen(['uniq'], stdin = p_sort.stdout, stdout=PIPE)
p_compress = Popen(['gzip', '-c'], stdin = p_uniq.stdout, stdout=fobj)
sys.stderr.write(
"Executing: LC_ALL=C " +
' '.join(cmd) +
' | uniq | gzip -c > ' + outfile + '\n')
p_sort = Popen(cmd, env={'LC_ALL': 'C'}, stdout=PIPE)
p_uniq = Popen(['uniq'], stdin=p_sort.stdout, stdout=PIPE)
p_compress = Popen(['gzip', '-c'], stdin=p_uniq.stdout, stdout=fobj)
p_compress.wait()
fobj.close()
@ -89,9 +110,9 @@ def create_count_lines(fobj, countobj, countobj_target, prune=0):
try:
fst = comments[2]
except IndexError:
fst = str(int(round(float(scores[0])*float(ft)))).encode()
fst = str(int(round(float(scores[0]) * float(ft)))).encode()
line[2] = b' '.join([fst,ft,fs])
line[2] = b' '.join([fst, ft, fs])
if prune:
if current_source != source:
@ -106,8 +127,10 @@ def create_count_lines(fobj, countobj, countobj_target, prune=0):
else:
countobj.write(b' ||| '.join(line))
# target count file
tline = b' ||| '.join([line[1], b'X', ft]) + b' ||| |||\n' # if you use string formatting to make this look nicer, you may break Python 3 compatibility.
# Target count file.
# If you use string formatting to make this look nicer, you may break
# Python 3 compatibility.
tline = b' ||| '.join([line[1], b'X', ft]) + b' ||| |||\n'
countobj_target.write(tline)
if prune:
@ -119,7 +142,8 @@ def create_count_lines(fobj, countobj, countobj_target, prune=0):
def write_batch(store_lines, outfile, prune):
top20 = sorted(store_lines, reverse=True)[:prune]
for score, original_pos, store_line in sorted(top20, key = lambda x: x[1]): #write in original_order
# Write in original_order.
for score, original_pos, store_line in sorted(top20, key=lambda x: x[1]):
outfile.write(store_line)
@ -130,21 +154,28 @@ if __name__ == '__main__':
else:
prune = 0
fileobj = handle_file(sys.argv[1],'open')
fileobj = handle_file(sys.argv[1], 'open')
out_path = sys.argv[2]
count_table_file = gzip.open(os.path.join(out_path,'count-table.gz'), 'w')
count_table_target_file = os.path.join(out_path,'count-table-target.gz')
count_table_file = gzip.open(
os.path.join(out_path, 'count-table.gz'), 'w')
count_table_target_file = os.path.join(out_path, 'count-table-target.gz')
count_table_target_file_temp = NamedTemporaryFile(delete=False)
try:
sys.stderr.write('Creating temporary file for unsorted target counts file: ' + count_table_target_file_temp.name + '\n')
sys.stderr.write(
"Creating temporary file for unsorted target counts file: " +
count_table_target_file_temp.name + '\n')
create_count_lines(fileobj, count_table_file, count_table_target_file_temp, prune)
create_count_lines(
fileobj, count_table_file, count_table_target_file_temp, prune)
count_table_target_file_temp.close()
sys.stderr.write('Finished writing, now re-sorting and compressing target count file\n')
sys.stderr.write(
"Finished writing, "
"now re-sorting and compressing target count file.\n")
sort_and_uniq(count_table_target_file_temp.name, count_table_target_file)
sort_and_uniq(
count_table_target_file_temp.name, count_table_target_file)
os.remove(count_table_target_file_temp.name)
sys.stderr.write('Done\n')
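
The core trick of the count tables is visible in the fst line above: given the unsmoothed first score of a phrase pair and the target-side count ft, the pair count is recovered as round(score * ft), and the output line then stores counts (fst ft fs) instead of probabilities. A toy calculation with made-up numbers:

# An unsmoothed score of 0.25 and a target phrase seen 8 times imply the
# pair itself was extracted round(0.25 * 8) = 2 times.
score, ft, fs = 0.25, 8, 5
fst = int(round(score * ft))
print("{} {} {}".format(fst, ft, fs))   # 2 8 5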

View File

@ -1,10 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# add flexibility scores to a phrase table half
# you usually don't have to call this script directly; to add flexibility scores to your model, run train-model.perl with the option "--flexibility-score" (will only affect steps 5 and 6)
# usage: python flexibility_score.py extract.context(.inv).sorted [--Inverse] [--Hierarchical] < phrasetable > output_file
# author: Rico Sennrich
"""Add flexibility scores to a phrase table half.
You usually don't have to call this script directly; to add flexibility
scores to your model, run train-model.perl with the option
"--flexibility-score" (will only affect steps 5 and 6).
Usage:
python flexibility_score.py extract.context(.inv).sorted \
[--Inverse] [--Hierarchical] < phrasetable > output_file
"""
from __future__ import division
from __future__ import unicode_literals
@ -12,26 +21,28 @@ import sys
import gzip
from collections import defaultdict
class FlexScore:
def __init__(self, inverted, hierarchical):
self.inverted = inverted
self.hierarchical = hierarchical
def store_pt(self, obj):
"""Store line in dictionary.
def store_pt(self,obj):
"""store line in dictionary; if we work with inverted phrase table, swap the two phrases"""
src,target = obj[0],obj[1]
If we work with inverted phrase table, swap the two phrases.
"""
src, target = obj[0], obj[1]
if self.inverted:
src, target = target, src
self.phrase_pairs[src][target] = obj
def update_contextcounts(self, obj):
"""count the number of contexts a phrase pair occurs in"""
src,target = obj[0],obj[1]
src, target = obj[0], obj[1]
self.context_counts[src][target] += 1
if obj[-1].startswith(b'<'):
self.context_counts_l[src][target] += 1
@ -40,18 +51,21 @@ class FlexScore:
elif obj[-1].startswith(b'v'):
self.context_counts_d[src][target] += 1
else:
sys.stderr.write(b'\nERROR in line: {0}\n'.format(b' ||| '.join(obj)))
sys.stderr.write(b'ERROR: expecting one of \'<, >, v\' as context marker in context extract file\n')
sys.stderr.write(
b"\nERROR in line: {0}\n".format(b' ||| '.join(obj)))
sys.stderr.write(
b"ERROR: expecting one of '<, >, v' as context marker "
"in context extract file.\n")
raise ValueError
def traverse_incrementally(self,phrasetable,flexfile):
"""traverse phrase table and phrase extract file (with context information) incrementally
without storing all in memory."""
def traverse_incrementally(self, phrasetable, flexfile):
"""Traverse phrase table and phrase extract file (with context
information) incrementally without storing all in memory.
"""
increment = b''
old_increment = 1
stack = ['']*2
stack = [''] * 2
# which phrase to use for sorting
sort_pt = 0
@ -63,10 +77,10 @@ class FlexScore:
old_increment = increment
self.phrase_pairs = defaultdict(dict)
self.context_counts = defaultdict(lambda:defaultdict(int))
self.context_counts_l = defaultdict(lambda:defaultdict(int))
self.context_counts_r = defaultdict(lambda:defaultdict(int))
self.context_counts_d = defaultdict(lambda:defaultdict(int))
self.context_counts = defaultdict(lambda: defaultdict(int))
self.context_counts_l = defaultdict(lambda: defaultdict(int))
self.context_counts_r = defaultdict(lambda: defaultdict(int))
self.context_counts_d = defaultdict(lambda: defaultdict(int))
if stack[0]:
self.store_pt(stack[0])
@ -96,30 +110,32 @@ class FlexScore:
yield 1
def main(self,phrasetable,flexfile,output_object):
def main(self, phrasetable, flexfile, output_object):
i = 0
sys.stderr.write('Incrementally loading phrase table and adding flexibility score...')
for block in self.traverse_incrementally(phrasetable,flexfile):
sys.stderr.write(
"Incrementally loading phrase table "
"and adding flexibility score...")
for block in self.traverse_incrementally(phrasetable, flexfile):
self.flexprob_l = normalize(self.context_counts_l)
self.flexprob_r = normalize(self.context_counts_r)
self.flexprob_d = normalize(self.context_counts_d)
for src in sorted(self.phrase_pairs, key = lambda x: x + b' |'):
for target in sorted(self.phrase_pairs[src], key = lambda x: x + b' |'):
# TODO: Why this lambda? It doesn't affect sorting, does it?
sortkey = lambda x: x + b' |'
for src in sorted(self.phrase_pairs, key=sortkey):
for target in sorted(self.phrase_pairs[src], key=sortkey):
if not i % 1000000:
if i % 1000000 == 0:
sys.stderr.write('.')
i += 1
outline = self.write_phrase_table(src,target)
outline = self.write_phrase_table(src, target)
output_object.write(outline)
sys.stderr.write('done\n')
def write_phrase_table(self,src,target):
def write_phrase_table(self, src, target):
line = self.phrase_pairs[src][target]
flexscore_l = b"{0:.6g}".format(self.flexprob_l[src][target])
@ -136,7 +152,6 @@ class FlexScore:
return b' ||| '.join(line) + b'\n'
def normalize(d):
out_dict = defaultdict(dict)
@ -145,7 +160,7 @@ def normalize(d):
total = sum(d[src].values())
for target in d[src]:
out_dict[src][target] = d[src][target]/total
out_dict[src][target] = d[src][target] / total
return out_dict
@ -153,7 +168,10 @@ def normalize(d):
if __name__ == '__main__':
if len(sys.argv) < 1:
sys.stderr.write('Usage: python flexibility_score.py extract.context(.inv).sorted [--Inverse] [--Hierarchical] < phrasetable > output_file\n')
sys.stderr.write(
"Usage: "
"python flexibility_score.py extract.context(.inv).sorted "
"[--Inverse] [--Hierarchical] < phrasetable > output_file\n")
exit()
flexfile = sys.argv[1]
@ -168,4 +186,4 @@ if __name__ == '__main__':
hierarchical = False
FS = FlexScore(inverted, hierarchical)
FS.main(sys.stdin,gzip.open(flexfile,'r'),sys.stdout)
FS.main(sys.stdin, gzip.open(flexfile, 'r'), sys.stdout)
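
The flexibility scores themselves come out of normalize(), which turns the nested context counts into per-source distributions. A self-contained toy run (the function is copied from above; plain dicts with made-up counts stand in for the defaultdicts):

from __future__ import division
from collections import defaultdict

def normalize(d):
    out_dict = defaultdict(dict)
    for src in d:
        total = sum(d[src].values())
        for target in d[src]:
            out_dict[src][target] = d[src][target] / total
    return out_dict

# 'haus' was seen in four contexts: three with 'house', one with 'home'.
counts = {b'haus': {b'house': 3, b'home': 1}}
probs = normalize(counts)
print(probs[b'haus'][b'house'], probs[b'haus'][b'home'])   # 0.75 0.25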