Mirror of https://github.com/moses-smt/mosesdecoder.git, synced 2024-12-26 13:23:25 +03:00
Fix some python lint.
I used mainly pocketlint, a very good Python linter, but also Syntastic, a vim plugin. Didn't get anywhere near fixing all of Syntastic's complaints though. Once I've cleaned up all (or at least most) of the Python lint, we can start doing regular automated lint checks and keep the code clean.
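The "regular automated lint checks" mentioned above are not part of this commit. As a rough illustration only (a minimal sketch, not the author's setup: it assumes flake8 is installed, whereas the author used pocketlint and Syntastic, whose invocations differ; the script name check_lint.py and the scripts/ default path are hypothetical):

    #!/usr/bin/env python
    # check_lint.py -- hypothetical helper, not part of the repository.
    # Runs flake8 over the given paths and exits non-zero if problems are
    # found, so it could gate a CI job or a pre-commit hook.
    import subprocess
    import sys

    def run_lint(paths):
        # flake8 returns a non-zero exit code when it reports any violation.
        return subprocess.call(["flake8"] + list(paths))

    if __name__ == "__main__":
        sys.exit(run_lint(sys.argv[1:] or ["scripts/"]))

Usage would be along the lines of: python check_lint.py scripts/training/ (once the remaining lint is cleaned up, the check can be made blocking).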
This commit is contained in:
parent f1ed14eb33
commit 0ffe79579e
@@ -2,12 +2,12 @@

"""
The Gacha filter cleans out sentence pairs that have global character mean
lower than a certain threshold.

Use this cleaner to produce low quantity of high quality sentence pairs.
lower than a certain threshold.

It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
Use this cleaner to produce low quantity of high quality sentence pairs.

It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during
WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER.
(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf)

This is inspired by the global character mean that is used in the Gale-Church
@@ -27,8 +27,8 @@ USAGE:

    $ python3 gacha_filter.py train.en train.de

Outputs to STDOUT a separated lines of the source and target sentence pairs.
You can simply cut the file after that.
Outputs to STDOUT a separated lines of the source and target sentence pairs.
You can simply cut the file after that.

    $ python3 gacha_filter.py train.en train.de > train.en-de
    $ cut -f1 train.en-de > train.clean.en
@@ -37,21 +37,27 @@ You can simply cut the file after that.
You can also allow lower threshold to yield more lines:

    $ python3 gacha_filter.py train.en train.de 0.05

Default threshold is set to 0.2.
"""

import io, subprocess
import io
import subprocess

red = '\033[01;31m'
native = '\033[m'


def err_msg(txt):
    return red+txt+native
    return red + txt + native


def num_char(filename):
    return float(subprocess.Popen(["wc", "-m", filename],
        stdout=subprocess.PIPE).stdout.read().split()[0])
    return float(
        subprocess.Popen(
            ["wc", "-m", filename],
            stdout=subprocess.PIPE).stdout.read().split()[0])


def gacha_mean(sourcefile, targetfile):
    """
@@ -60,35 +66,40 @@ def gacha_mean(sourcefile, targetfile):
    """
    sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
    c = num_char(sourcefile) / num_char(targetfile)
    sys.stderr.write(err_msg('Gacha mean = '+str(c)+'\n'))
    sys.stderr.write(err_msg('Gacha mean = ' + str(c) + '\n'))
    sys.stderr.write(err_msg('Filtering starts ...\n'))
    return c


def io_open(path):
    """Open text file at `path` as a read-only, with UTF-8 encoding."""
    return io.open(path, 'r', encoding='utf8')


def main(sourcefile, targetfile, threshold=0.2):
    # Calculates Gacha mean.
    c = gacha_mean(sourcefile, targetfile)
    # Calculates lower and upperbound for filtering
    threshold = float(threshold)
    lowerbound = (1-threshold) * c
    upperbound = (1+threshold) * c

    lowerbound = (1 - threshold) * c
    upperbound = (1 + threshold) * c

    # Start filtering sentences.
    with io.open(sourcefile, 'r', encoding='utf8') as srcfin, \
        io.open(targetfile, 'r', encoding='utf8') as trgfin:
    with io_open(sourcefile) as srcfin, io_open(targetfile) as trgfin:
        for s, t in zip(srcfin, trgfin):
            if lowerbound < len(s) / float(len(t)) < upperbound:
                print(u"{}\t{}".format(s.strip(),t.strip()))
                print(u"{}\t{}".format(s.strip(), t.strip()))

if __name__ == '__main__':
    import sys
    if len(sys.argv) not in range(3,5):
    if len(sys.argv) not in range(3, 5):
        usage_msg = err_msg('Usage: python3 %s srcfile trgfile (threshold)\n'
                            % sys.argv[0])

        example_msg = err_msg('Example: python3 %s ~/Europarl.de-en.de '
                              '~/Europarl.de-en.en 0.4\n' % sys.argv[0])
        sys.stderr.write(usage_msg)
        sys.stderr.write(example_msg)
        sys.exit(1)

    main(*sys.argv[1:])
@@ -3,36 +3,50 @@ import sys
import numpy
import argparse

parser = argparse.ArgumentParser(description='Set input embedding of <null> token to weighted average of all input embeddings')
parser.add_argument("-p", "--nplm-python-path", type=str, dest="nplm_python_path", default='/mnt/gna0/rsennrich/tools/nplm/python')
parser.add_argument("-i", "--input-model", type=str, dest="input_model", required=True)
parser.add_argument("-o", "--output-model", type=str, dest="output_model", required=True)
parser.add_argument("-n", "--null-token-index", type=int, dest="null_idx", default=-1)
parser.add_argument("-t", "--training-ngrams", type=str, dest="training_ngrams", required=True)
parser = argparse.ArgumentParser(
    description=(
        "Set input embedding of <null> token to weighted average "
        "of all input embeddings"))
parser.add_argument(
    "-p", "--nplm-python-path", type=str, dest="nplm_python_path",
    default='/mnt/gna0/rsennrich/tools/nplm/python')
parser.add_argument(
    "-i", "--input-model", type=str, dest="input_model", required=True)
parser.add_argument(
    "-o", "--output-model", type=str, dest="output_model", required=True)
parser.add_argument(
    "-n", "--null-token-index", type=int, dest="null_idx", default=-1)
parser.add_argument(
    "-t", "--training-ngrams", type=str, dest="training_ngrams",
    required=True)


options = parser.parse_args()

sys.path.append(options.nplm_python_path)
import nplm
from collections import defaultdict


def load_model(model_file):
    return nplm.NeuralLM.from_file(model_file)


def get_weights(path, length):
    counter = [0]*length
    counter = [0] * length
    for line in open(path):
        last_context = int(line.split()[-2])
        counter[last_context] += 1
    return counter


if __name__ == "__main__":

    model = load_model(options.input_model)
    if options.null_idx == -1:
        options.null_idx = model.word_to_index_input['<null>']
        options.null_idx = model.word_to_index_input['<null>']
    sys.stderr.write('index of <null>: {0}\n'.format(options.null_idx))
    weights = numpy.array(get_weights(options.training_ngrams, len(model.input_embeddings)))
    model.input_embeddings[options.null_idx] = numpy.average(numpy.array(model.input_embeddings), weights=weights, axis=0)
    model.to_file(open(options.output_model,'w'))
    weights = numpy.array(
        get_weights(options.training_ngrams, len(model.input_embeddings)))
    model.input_embeddings[options.null_idx] = numpy.average(
        numpy.array(model.input_embeddings), weights=weights, axis=0)
    model.to_file(open(options.output_model, 'w'))
@@ -1,9 +1,7 @@
#!/usr/bin/env python

from collections import Counter
import heapq
import logging
import optparse
import sys

LOG = logging.getLogger(__name__)
@@ -12,26 +10,28 @@ BOS = "<s>"
EOS = "</s>"
UNK = "<unk>"

def replace_tags(tokens,tags,vocab):
    for i,t in enumerate(tokens):
        if not t in vocab:
            if i < len(tags):
                tokens[i] = tags[i]
            else:
                print "Error: missing tags for index i:", i
                print ' '.join(tokens)
                print ' '.join(tags)
                tokens[i] = UNK

def replace_unks(tokens,vocab):
    for i,t in enumerate(tokens):
        if not t in vocab:
            tokens[i] = UNK
def replace_tags(tokens, tags, vocab):
    for i, t in enumerate(tokens):
        if t not in vocab:
            if i < len(tags):
                tokens[i] = tags[i]
            else:
                print "Error: missing tags for index i:", i
                print ' '.join(tokens)
                print ' '.join(tags)
                tokens[i] = UNK


def replace_unks(tokens, vocab):
    for i, t in enumerate(tokens):
        if t not in vocab:
            tokens[i] = UNK


def numberize(line, m, n, svocab, tvocab):
    line = line.split()
    source_words = line[:2*m + 1]
    source_words = line[:2 * m + 1]
    target_words = line[-n:]

    line = ' '.join([str(svocab[item]) for item in source_words]) + ' '
@@ -40,7 +40,8 @@ def numberize(line, m, n, svocab, tvocab):
    return line


def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,tlang, m, n, ofh):
def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,
               tlang, m, n, ofh):
    """
    m - source context
    n - target context
@ -51,83 +52,87 @@ def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,tlang
|
||||
sfh = open(corpus_stem + "." + slang)
|
||||
tfh = open(corpus_stem + "." + tlang)
|
||||
afh = open(align_file)
|
||||
fhs = [sfh,tfh,afh]
|
||||
fhs = [sfh, tfh, afh]
|
||||
if tagged_stem:
|
||||
fhs.append(open(tagged_stem + "." + slang))
|
||||
fhs.append(open(tagged_stem + "." + tlang))
|
||||
fhs.append(open(tagged_stem + "." + slang))
|
||||
fhs.append(open(tagged_stem + "." + tlang))
|
||||
|
||||
count = 0
|
||||
count = 0
|
||||
ngrams = 0
|
||||
LOG.info("Extracting ngrams")
|
||||
for lines in zip(*fhs):
|
||||
stokens = lines[0][:-1].split()
|
||||
ttokens = lines[1][:-1].split()
|
||||
stokens.append(EOS)
|
||||
ttokens.append(EOS)
|
||||
if tagged_stem:
|
||||
stags = lines[3][:-1].split()
|
||||
ttags = lines[4][:-1].split()
|
||||
stags.append(EOS)
|
||||
ttags.append(EOS)
|
||||
tags.update(stags)
|
||||
tags.update(ttags)
|
||||
replace_tags(stokens,stags,svocab)
|
||||
replace_tags(ttokens,ttags,tvocab)
|
||||
else:
|
||||
replace_unks(stokens,svocab)
|
||||
replace_unks(ttokens,tvocab)
|
||||
# list aligns for each target
|
||||
# Note: align specifies source -> target
|
||||
target_aligns = [[] for t in range(len(ttokens))]
|
||||
for atoken in lines[2][:-1].split():
|
||||
spos,tpos = atoken.split("-")
|
||||
spos,tpos = int(spos), int(tpos)
|
||||
target_aligns[tpos].append(spos)
|
||||
#EOS alignment
|
||||
target_aligns[-1] = [len(stokens)-1]
|
||||
for lines in zip(*fhs):
|
||||
stokens = lines[0][:-1].split()
|
||||
ttokens = lines[1][:-1].split()
|
||||
stokens.append(EOS)
|
||||
ttokens.append(EOS)
|
||||
if tagged_stem:
|
||||
stags = lines[3][:-1].split()
|
||||
ttags = lines[4][:-1].split()
|
||||
stags.append(EOS)
|
||||
ttags.append(EOS)
|
||||
tags.update(stags)
|
||||
tags.update(ttags)
|
||||
replace_tags(stokens, stags, svocab)
|
||||
replace_tags(ttokens, ttags, tvocab)
|
||||
else:
|
||||
replace_unks(stokens, svocab)
|
||||
replace_unks(ttokens, tvocab)
|
||||
# List aligns for each target.
|
||||
# Note: align specifies source -> target
|
||||
target_aligns = [[] for t in range(len(ttokens))]
|
||||
for atoken in lines[2][:-1].split():
|
||||
spos, tpos = atoken.split("-")
|
||||
spos, tpos = int(spos), int(tpos)
|
||||
target_aligns[tpos].append(spos)
|
||||
|
||||
for tpos,spos_list in enumerate(target_aligns):
|
||||
# Affiliation heuristics - see Devlin t al. p1371
|
||||
if not spos_list:
|
||||
#tpos has no alignment, look right, then left, then right-right, then left-left etc
|
||||
rpos = tpos+1
|
||||
lpos = tpos-1
|
||||
while rpos < len(ttokens) or lpos >= 0:
|
||||
if rpos < len(ttokens) and target_aligns[rpos]:
|
||||
spos_list = target_aligns[rpos]
|
||||
break
|
||||
if lpos >= 0 and target_aligns[lpos]:
|
||||
spos_list = target_aligns[lpos]
|
||||
break
|
||||
rpos += 1
|
||||
lpos -= 1
|
||||
# EOS alignment.
|
||||
target_aligns[-1] = [len(stokens) - 1]
|
||||
|
||||
if not spos_list:
|
||||
raise Exception("No alignments in sentence \nSRC: " + lines[0][:-1] + "\nTGT: " + lines[1][:-1])
|
||||
midpos = (len(spos_list)-1) / 2
|
||||
spos = sorted(spos_list)[midpos]
|
||||
for tpos, spos_list in enumerate(target_aligns):
|
||||
# Affiliation heuristics - see Devlin t al. p1371
|
||||
if not spos_list:
|
||||
# tpos has no alignment, look right, then left, then
|
||||
# right-right, then left-left etc.
|
||||
rpos = tpos + 1
|
||||
lpos = tpos - 1
|
||||
while rpos < len(ttokens) or lpos >= 0:
|
||||
if rpos < len(ttokens) and target_aligns[rpos]:
|
||||
spos_list = target_aligns[rpos]
|
||||
break
|
||||
if lpos >= 0 and target_aligns[lpos]:
|
||||
spos_list = target_aligns[lpos]
|
||||
break
|
||||
rpos += 1
|
||||
lpos -= 1
|
||||
|
||||
if not spos_list:
|
||||
raise Exception(
|
||||
"No alignments in sentence \nSRC: " +
|
||||
lines[0][:-1] + "\nTGT: " + lines[1][:-1])
|
||||
midpos = (len(spos_list) - 1) / 2
|
||||
spos = sorted(spos_list)[midpos]
|
||||
|
||||
# source-context, target-context, predicted word
|
||||
for i in range(max(0,m-spos)):
|
||||
print>>ofh, BOS,
|
||||
#print [spos-m/2,spos+m/2+1], stokens[spos-m/2:spos+m/2+1]
|
||||
print>>ofh, " ".join([s for s in stokens[max(0,spos-m):spos+m+1]]),
|
||||
for i in range(max(0,spos+m+1-len(stokens))):
|
||||
print>>ofh, EOS,
|
||||
for i in range(max(0,n-(tpos+1))):
|
||||
print>>ofh, BOS,
|
||||
print>>ofh, " ".join([t for t in ttokens[max(0,tpos+1-n):tpos+1]]),
|
||||
print>>ofh
|
||||
ngrams += 1
|
||||
# source-context, target-context, predicted word
|
||||
for i in range(max(0, m - spos)):
|
||||
print>>ofh, BOS,
|
||||
# print [spos-m/2,spos+m/2+1], stokens[spos-m/2:spos+m/2+1]
|
||||
print>>ofh, " ".join(
|
||||
[s for s in stokens[max(0, spos - m):spos + m + 1]]),
|
||||
for i in range(max(0, spos + m + 1 - len(stokens))):
|
||||
print>>ofh, EOS,
|
||||
for i in range(max(0, n - (tpos + 1))):
|
||||
print>>ofh, BOS,
|
||||
print>>ofh, " ".join(
|
||||
[t for t in ttokens[max(0, tpos + 1 - n):tpos + 1]]),
|
||||
print>>ofh
|
||||
ngrams += 1
|
||||
|
||||
|
||||
count += 1
|
||||
if count % 1000 == 0: sys.stderr.write(".")
|
||||
if count % 50000 == 0: sys.stderr.write(" [%d]\n" % count)
|
||||
count += 1
|
||||
if count % 1000 == 0:
|
||||
sys.stderr.write(".")
|
||||
if count % 50000 == 0:
|
||||
sys.stderr.write(" [%d]\n" % count)
|
||||
ofh.close()
|
||||
sys.stderr.write("\n")
|
||||
LOG.info("Extracted %d ngrams" % ngrams)
|
||||
return tags
|
||||
|
||||
|
||||
|
@@ -1,8 +1,7 @@
#!/usr/bin/env python

#
# Create a test corpus, using a previously pruned vocabulary.
#
"""Create a test corpus, using a previously pruned vocabulary."""


import logging
import optparse
@ -12,72 +11,84 @@ import sys
|
||||
|
||||
import extract
|
||||
|
||||
|
||||
def read_vocab(filename, offset=0):
|
||||
vocab = {}
|
||||
for i, line in enumerate(open(filename)):
|
||||
vocab[line.strip()] = i+offset
|
||||
return vocab, i+offset
|
||||
vocab = {}
|
||||
for i, line in enumerate(open(filename)):
|
||||
vocab[line.strip()] = i + offset
|
||||
return vocab, i + offset
|
||||
|
||||
|
||||
def main():
|
||||
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
|
||||
parser = optparse.OptionParser("%prog [options]")
|
||||
parser.add_option("-e", "--target-language", type="string", dest="target_language")
|
||||
parser.add_option("-f", "--source-language", type="string", dest="source_language")
|
||||
parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
|
||||
parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
|
||||
parser.add_option("-a", "--align", type="string", dest="align_file")
|
||||
parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
|
||||
logging.basicConfig(
|
||||
format='%(asctime)s %(levelname)s: %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
|
||||
parser = optparse.OptionParser("%prog [options]")
|
||||
parser.add_option(
|
||||
"-e", "--target-language", type="string", dest="target_language")
|
||||
parser.add_option(
|
||||
"-f", "--source-language", type="string", dest="source_language")
|
||||
parser.add_option(
|
||||
"-c", "--corpus", type="string", dest="corpus_stem")
|
||||
parser.add_option(
|
||||
"-t", "--tagged-corpus", type="string", dest="tagged_stem")
|
||||
parser.add_option(
|
||||
"-a", "--align", type="string", dest="align_file")
|
||||
parser.add_option(
|
||||
"-w", "--working-dir", type="string", dest="working_dir")
|
||||
|
||||
parser.set_defaults(
|
||||
target_language="en",
|
||||
source_language="de",
|
||||
corpus_stem="test",
|
||||
align_file="test.align",
|
||||
working_dir="working")
|
||||
options, args = parser.parse_args(sys.argv)
|
||||
if not os.path.exists(options.working_dir):
|
||||
raise Exception(
|
||||
"Working directory '%s' not found" % options.working_dir)
|
||||
|
||||
parser.set_defaults(
|
||||
target_language = "en",
|
||||
source_language = "de",
|
||||
corpus_stem = "test",
|
||||
align_file = "test.align",
|
||||
working_dir = "working",
|
||||
)
|
||||
options,args = parser.parse_args(sys.argv)
|
||||
if not os.path.exists(options.working_dir):
|
||||
LOG.error("Working directory '%s' not found" % working_dir)
|
||||
sys.exit(1)
|
||||
m, n = None, None
|
||||
for line in open(options.working_dir + "/info"):
|
||||
name, value = line[:-1].split()
|
||||
if name == "m":
|
||||
m = int(value)
|
||||
if name == "n":
|
||||
n = int(value)
|
||||
if m is None or n is None:
|
||||
raise Exception("Info file is incomplete.")
|
||||
|
||||
m,n = None,None
|
||||
for line in open(options.working_dir + "/info"):
|
||||
name,value = line[:-1].split()
|
||||
if name == "m": m = int(value)
|
||||
if name == "n": n = int(value)
|
||||
if m == None or n == None:
|
||||
LOG.error("info file is incomplete")
|
||||
sys.exit(1)
|
||||
tvocab, offset = read_vocab(options.working_dir + "/vocab.target")
|
||||
svocab, offset = read_vocab(
|
||||
options.working_dir + "/vocab.source", offset + 1)
|
||||
|
||||
tvocab, offset = read_vocab(options.working_dir + "/vocab.target")
|
||||
svocab, offset = read_vocab(options.working_dir + "/vocab.source", offset+1)
|
||||
file_stem = os.path.basename(options.corpus_stem)
|
||||
ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
|
||||
extract.get_ngrams(
|
||||
options.corpus_stem,
|
||||
options.align_file,
|
||||
options.tagged_stem,
|
||||
svocab,
|
||||
tvocab,
|
||||
options.source_language,
|
||||
options.target_language,
|
||||
m,
|
||||
n,
|
||||
ofh)
|
||||
|
||||
file_stem = os.path.basename(options.corpus_stem)
|
||||
ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
|
||||
extract.get_ngrams(options.corpus_stem,
|
||||
options.align_file,
|
||||
options.tagged_stem,
|
||||
svocab,
|
||||
tvocab,
|
||||
options.source_language,
|
||||
options.target_language,
|
||||
m,
|
||||
n,
|
||||
ofh)
|
||||
numberized_file = options.working_dir + "/" + file_stem + ".numberized"
|
||||
ngrams_file_handle = open(
|
||||
os.path.join(options.working_dir, file_stem + ".ngrams"), 'r')
|
||||
numberized_file_handle = open(numberized_file, 'w')
|
||||
|
||||
numberized_file = options.working_dir + "/" + file_stem + ".numberized"
|
||||
ngrams_file_handle = open(options.working_dir + "/" + file_stem + ".ngrams", 'r')
|
||||
numberized_file_handle = open(numberized_file, 'w')
|
||||
|
||||
#Numberize the file
|
||||
for line in ngrams_file_handle:
|
||||
numberized_file_handle.write(extract.numberize(line, m, n, svocab, tvocab))
|
||||
|
||||
numberized_file_handle.close()
|
||||
ngrams_file_handle.close()
|
||||
# Numberize the file.
|
||||
for line in ngrams_file_handle:
|
||||
numberized_file_handle.write(extract.numberize(
|
||||
line, m, n, svocab, tvocab))
|
||||
|
||||
numberized_file_handle.close()
|
||||
ngrams_file_handle.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
@ -11,145 +11,160 @@ import extract
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
def get_pruned_vocab(corpus,prune):
|
||||
counts = Counter()
|
||||
LOG.info("Reading vocabulary from %s" % corpus)
|
||||
lines = 0
|
||||
for line in open(corpus):
|
||||
for token in line[:-1].split():
|
||||
counts[token] += 1
|
||||
lines += 1
|
||||
if lines % 1000 == 0: sys.stderr.write(".")
|
||||
if lines % 50000 == 0: sys.stderr.write(" [%d]\n" % lines)
|
||||
sys.stderr.write("\n")
|
||||
counts[extract.BOS] += lines
|
||||
counts[extract.EOS] += lines
|
||||
LOG.info("Vocabulary size: %d" % len(counts))
|
||||
if prune:
|
||||
return Counter(dict(counts.most_common(prune)))
|
||||
else:
|
||||
return counts
|
||||
|
||||
def get_pruned_vocab(corpus, prune):
|
||||
counts = Counter()
|
||||
LOG.info("Reading vocabulary from %s" % corpus)
|
||||
lines = 0
|
||||
for line in open(corpus):
|
||||
for token in line[:-1].split():
|
||||
counts[token] += 1
|
||||
lines += 1
|
||||
if lines % 1000 == 0:
|
||||
sys.stderr.write(".")
|
||||
if lines % 50000 == 0:
|
||||
sys.stderr.write(" [%d]\n" % lines)
|
||||
sys.stderr.write("\n")
|
||||
counts[extract.BOS] += lines
|
||||
counts[extract.EOS] += lines
|
||||
LOG.info("Vocabulary size: %d" % len(counts))
|
||||
if prune:
|
||||
return Counter(dict(counts.most_common(prune)))
|
||||
else:
|
||||
return counts
|
||||
|
||||
|
||||
def save_vocab(directory, filename, vocab):
|
||||
fh = open(directory + "/" + filename, "w")
|
||||
for word in vocab:
|
||||
print>>fh, word
|
||||
|
||||
fh = open(directory + "/" + filename, "w")
|
||||
for word in vocab:
|
||||
print>>fh, word
|
||||
|
||||
|
||||
def main():
|
||||
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
|
||||
parser = optparse.OptionParser("%prog [options]")
|
||||
parser.add_option("-e", "--target-language", type="string", dest="target_language")
|
||||
parser.add_option("-f", "--source-language", type="string", dest="source_language")
|
||||
parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
|
||||
parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
|
||||
parser.add_option("-a", "--align", type="string", dest="align_file")
|
||||
parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
|
||||
parser.add_option("-n", "--target-context", type="int", dest="n")
|
||||
parser.add_option("-m", "--source-context", type="int", dest="m")
|
||||
parser.add_option("-s", "--prune-source-vocab", type="int", dest="sprune")
|
||||
parser.add_option("-p", "--prune-target-vocab", type="int", dest="tprune")
|
||||
logging.basicConfig(
|
||||
format='%(asctime)s %(levelname)s: %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
|
||||
parser = optparse.OptionParser("%prog [options]")
|
||||
parser.add_option(
|
||||
"-e", "--target-language", type="string", dest="target_language")
|
||||
parser.add_option(
|
||||
"-f", "--source-language", type="string", dest="source_language")
|
||||
parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
|
||||
parser.add_option(
|
||||
"-t", "--tagged-corpus", type="string", dest="tagged_stem")
|
||||
parser.add_option("-a", "--align", type="string", dest="align_file")
|
||||
parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
|
||||
parser.add_option("-n", "--target-context", type="int", dest="n")
|
||||
parser.add_option("-m", "--source-context", type="int", dest="m")
|
||||
parser.add_option("-s", "--prune-source-vocab", type="int", dest="sprune")
|
||||
parser.add_option("-p", "--prune-target-vocab", type="int", dest="tprune")
|
||||
|
||||
parser.set_defaults(
|
||||
target_language="en",
|
||||
source_language="de",
|
||||
corpus_stem="train.10k",
|
||||
align_file="train.10k.align",
|
||||
n=5,
|
||||
m=4,
|
||||
working_dir="working",
|
||||
sprune=16000,
|
||||
tprune=16000
|
||||
)
|
||||
options, args = parser.parse_args(sys.argv)
|
||||
|
||||
parser.set_defaults(
|
||||
target_language = "en",
|
||||
source_language = "de",
|
||||
corpus_stem = "train.10k",
|
||||
align_file = "train.10k.align",
|
||||
n = 5,
|
||||
m = 4,
|
||||
working_dir = "working",
|
||||
sprune=16000,
|
||||
tprune=16000
|
||||
)
|
||||
options,args = parser.parse_args(sys.argv)
|
||||
if not os.path.exists(options.working_dir):
|
||||
os.makedirs(options.working_dir)
|
||||
else:
|
||||
LOG.warn("Directory %s already exists, re-using" % options.working_dir)
|
||||
|
||||
if not os.path.exists(options.working_dir):
|
||||
os.makedirs(options.working_dir)
|
||||
else:
|
||||
LOG.warn("Directory %s already exists, re-using" % options.working_dir)
|
||||
info_file = options.working_dir + "/info"
|
||||
if os.path.exists(info_file):
|
||||
for line in open(info_file):
|
||||
name, value = line[:-1].split()
|
||||
n_mismatch = (name == 'n' and int(value) != options.n)
|
||||
m_mismatch = (name == 'm' and int(value) != options.m)
|
||||
if n_mismatch or m_mismatch:
|
||||
LOG.error(
|
||||
"info file exists, but parameters do not match. "
|
||||
"Delete working directory and rerun.")
|
||||
sys.exit(1)
|
||||
else:
|
||||
ifh = open(info_file, "w")
|
||||
print>>ifh, "m", options.m
|
||||
print>>ifh, "n", options.n
|
||||
ifh.close()
|
||||
|
||||
info_file = options.working_dir + "/info"
|
||||
if os.path.exists(info_file):
|
||||
for line in open(info_file):
|
||||
name,value = line[:-1].split()
|
||||
if name == "n" and int(value) != options.n or \
|
||||
name == "m" and int(value) != options.m:
|
||||
LOG.error("info file exists, but parameters do not match. Delete working directory and rerun")
|
||||
sys.exit(1)
|
||||
else:
|
||||
ifh = open(info_file,"w")
|
||||
print>>ifh,"m",options.m
|
||||
print>>ifh,"n",options.n
|
||||
ifh.close()
|
||||
scorpus = options.corpus_stem + "." + options.source_language
|
||||
tcorpus = options.corpus_stem + "." + options.target_language
|
||||
|
||||
scorpus = options.corpus_stem + "." + options.source_language
|
||||
tcorpus = options.corpus_stem + "." + options.target_language
|
||||
tvocab, svocab = None, None
|
||||
# Extract vocabulary, and prune, if required.
|
||||
svocab = get_pruned_vocab(scorpus, options.sprune)
|
||||
tvocab = get_pruned_vocab(tcorpus, options.tprune)
|
||||
|
||||
tvocab,svocab = None,None
|
||||
# Extract vocabulary, and prune, if required
|
||||
svocab = get_pruned_vocab(scorpus,options.sprune)
|
||||
tvocab = get_pruned_vocab(tcorpus,options.tprune)
|
||||
file_stem = os.path.basename(options.corpus_stem)
|
||||
ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
|
||||
ofh = open(ngram_file, "w")
|
||||
|
||||
tags = extract.get_ngrams(
|
||||
options.corpus_stem,
|
||||
options.align_file,
|
||||
options.tagged_stem,
|
||||
svocab,
|
||||
tvocab,
|
||||
options.source_language,
|
||||
options.target_language,
|
||||
options.m,
|
||||
options.n,
|
||||
ofh)
|
||||
|
||||
file_stem = os.path.basename(options.corpus_stem)
|
||||
ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
|
||||
ofh = open(ngram_file, "w")
|
||||
|
||||
tags = extract.get_ngrams(options.corpus_stem,
|
||||
options.align_file,
|
||||
options.tagged_stem,
|
||||
svocab,
|
||||
tvocab,
|
||||
options.source_language,
|
||||
options.target_language,
|
||||
options.m,
|
||||
options.n,
|
||||
ofh)
|
||||
# Save vocabularies.
|
||||
del svocab["<null>"]
|
||||
del tvocab["<null>"]
|
||||
del svocab["<unk>"]
|
||||
del tvocab["<unk>"]
|
||||
svocab_list = [item[0] for item in svocab.most_common()]
|
||||
tvocab_list = [item[0] for item in tvocab.most_common()]
|
||||
|
||||
# Save vocabularies
|
||||
del svocab["<null>"]
|
||||
del tvocab["<null>"]
|
||||
del svocab["<unk>"]
|
||||
del tvocab["<unk>"]
|
||||
svocab_list = [item[0] for item in svocab.most_common()]
|
||||
tvocab_list = [item[0] for item in tvocab.most_common()]
|
||||
# UNK is always the first vocabulary element. Make sure
|
||||
# it appears in position 0
|
||||
# We need to use <null> token in the chart decoder in order
|
||||
# to correctly estimate the probabilities of incomplete subphrases
|
||||
# that are not sentence initial.
|
||||
|
||||
# UNK is always the first vocabulary element. Make sure
|
||||
# it appears in position 0
|
||||
# We need to use <null> token in the chart decoder in order
|
||||
# to correctly estimate the probabilities of incomplete subphrases
|
||||
# that are not sentence initial.
|
||||
tvocab_list.insert(0, "<null>")
|
||||
tvocab_list.insert(0, "<unk>")
|
||||
svocab_list.insert(0, "<unk>")
|
||||
|
||||
tvocab_list.insert(0, "<null>")
|
||||
tvocab_list.insert(0, "<unk>")
|
||||
svocab_list.insert(0, "<unk>")
|
||||
# Get tags:
|
||||
tag_list = [item[0] for item in tags.most_common()]
|
||||
svocab_list = svocab_list + tag_list
|
||||
tvocab_list = tvocab_list + tag_list
|
||||
|
||||
#Get tags:
|
||||
tag_list = [item[0] for item in tags.most_common()]
|
||||
svocab_list = svocab_list + tag_list
|
||||
tvocab_list = tvocab_list + tag_list
|
||||
save_vocab(options.working_dir, "vocab.source", svocab_list)
|
||||
save_vocab(options.working_dir, "vocab.target", tvocab_list)
|
||||
|
||||
save_vocab(options.working_dir, "vocab.source", svocab_list)
|
||||
save_vocab(options.working_dir, "vocab.target", tvocab_list)
|
||||
# Create vocab dictionaries that map word to ID.
|
||||
tvocab_idmap = {}
|
||||
for i in range(len(tvocab_list)):
|
||||
tvocab_idmap[tvocab_list[i]] = i
|
||||
|
||||
#Create vocab dictionaries that map word to ID
|
||||
tvocab_idmap = {}
|
||||
for i in range(len(tvocab_list)):
|
||||
tvocab_idmap[tvocab_list[i]] = i
|
||||
svocab_idmap = {}
|
||||
for i in range(len(svocab_list)):
|
||||
svocab_idmap[svocab_list[i]] = i + len(tvocab_idmap)
|
||||
|
||||
svocab_idmap = {}
|
||||
for i in range(len(svocab_list)):
|
||||
svocab_idmap[svocab_list[i]] = i + len(tvocab_idmap)
|
||||
numberized_file = options.working_dir + "/" + file_stem + ".numberized"
|
||||
ngrams_file_handle = open(ngram_file, 'r')
|
||||
numberized_file_handle = open(numberized_file, 'w')
|
||||
|
||||
numberized_file = options.working_dir + "/" + file_stem + ".numberized"
|
||||
ngrams_file_handle = open(ngram_file, 'r')
|
||||
numberized_file_handle = open(numberized_file, 'w')
|
||||
# Numberize the file.
|
||||
for line in ngrams_file_handle:
|
||||
numberized_file_handle.write(
|
||||
extract.numberize(
|
||||
line, options.m, options.n, svocab_idmap, tvocab_idmap))
|
||||
numberized_file_handle.close()
|
||||
ngrams_file_handle.close()
|
||||
|
||||
#Numberize the file
|
||||
for line in ngrams_file_handle:
|
||||
numberized_file_handle.write(extract.numberize(line, options.m, options.n, svocab_idmap, tvocab_idmap))
|
||||
numberized_file_handle.close()
|
||||
ngrams_file_handle.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
@@ -1,6 +1,10 @@
#!/usr/bin/env python3

"""Reduces an ngrams file for training nplm to a smaller version of it with less ngrams"""
"""Reduces an ngrams file for training nplm to a smaller version of it.

The smaller version will have fewer ngrams.
"""

from sys import argv

if len(argv) != 5:
@@ -15,11 +19,11 @@ NGRAMS = int(argv[4])

for line in INFILE:
    line = line.split()
    line = line[START_IDX:START_IDX+NGRAMS]
    line = line[START_IDX:START_IDX + NGRAMS]
    linetowrite = ""
    for token in line:
        linetowrite = linetowrite + token + " "
    #Strip final empty space and add newline
    # Strip final empty space and add newline.
    linetowrite = linetowrite[:-1]
    linetowrite = linetowrite + '\n'
    OUTFILE.write(linetowrite)
@ -7,51 +7,71 @@ import sys
|
||||
|
||||
|
||||
def main():
|
||||
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
|
||||
parser = optparse.OptionParser("%prog [options]")
|
||||
parser.add_option("-w", "--working-dir", dest="working_dir")
|
||||
parser.add_option("-c", "--corpus", dest="corpus_stem")
|
||||
parser.add_option("-r", "--train-corpus", dest="train_stem")
|
||||
parser.add_option("-l", "--nplm-home", dest="nplm_home")
|
||||
parser.add_option("-e", "--epoch", dest="epoch", type="int")
|
||||
parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
|
||||
parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
|
||||
parser.add_option("-t", "--threads", dest="threads", type="int")
|
||||
logging.basicConfig(
|
||||
format='%(asctime)s %(levelname)s: %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
|
||||
parser = optparse.OptionParser("%prog [options]")
|
||||
parser.add_option("-w", "--working-dir", dest="working_dir")
|
||||
parser.add_option("-c", "--corpus", dest="corpus_stem")
|
||||
parser.add_option("-r", "--train-corpus", dest="train_stem")
|
||||
parser.add_option("-l", "--nplm-home", dest="nplm_home")
|
||||
parser.add_option("-e", "--epoch", dest="epoch", type="int")
|
||||
parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
|
||||
parser.add_option(
|
||||
"-b", "--minibatch-size", dest="minibatch_size", type="int")
|
||||
parser.add_option("-t", "--threads", dest="threads", type="int")
|
||||
|
||||
parser.set_defaults(
|
||||
working_dir = "working"
|
||||
,corpus_stem = "test"
|
||||
,train_stem = "train.10k"
|
||||
,nplm_home = "/home/bhaddow/tools/nplm"
|
||||
,epoch=10
|
||||
,ngram_size = 14
|
||||
,minibatch_size=1000
|
||||
,threads=8
|
||||
)
|
||||
parser.set_defaults(
|
||||
working_dir="working",
|
||||
corpus_stem="test",
|
||||
train_stem="train.10k",
|
||||
nplm_home="/home/bhaddow/tools/nplm",
|
||||
epoch=10,
|
||||
ngram_size=14,
|
||||
minibatch_size=1000,
|
||||
threads=8)
|
||||
|
||||
options,args = parser.parse_args(sys.argv)
|
||||
options, _ = parser.parse_args(sys.argv)
|
||||
|
||||
model_prefix = options.working_dir + "/" + options.train_stem + ".model.nplm"
|
||||
model_file = model_prefix + "." + str(options.epoch)
|
||||
test_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
|
||||
prep_file = options.working_dir + "/" + options.corpus_stem + ".prepared"
|
||||
vocab_file = options.working_dir + "/vocab"
|
||||
model_prefix = (
|
||||
options.working_dir + "/" + options.train_stem + ".model.nplm")
|
||||
model_file = model_prefix + "." + str(options.epoch)
|
||||
test_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
|
||||
prep_file = options.working_dir + "/" + options.corpus_stem + ".prepared"
|
||||
vocab_file = options.working_dir + "/vocab"
|
||||
|
||||
#TODO: Get ngram size from info file.
|
||||
prep_args = [options.nplm_home + "/src/prepareNeuralLM", "--train_text", test_file, "--ngram_size",
|
||||
str(options.ngram_size), "--ngramize", "0", "--words_file", vocab_file, "--train_file", prep_file]
|
||||
ret = subprocess.call(prep_args)
|
||||
if ret: raise Exception("Preparation failed")
|
||||
# TODO: Get ngram size from info file.
|
||||
prep_args = [
|
||||
options.nplm_home + "/src/prepareNeuralLM",
|
||||
"--train_text", test_file,
|
||||
"--ngram_size", str(options.ngram_size),
|
||||
"--ngramize", "0",
|
||||
"--words_file", vocab_file,
|
||||
"--train_file", prep_file,
|
||||
]
|
||||
ret = subprocess.call(prep_args)
|
||||
if ret:
|
||||
raise Exception("Preparation failed")
|
||||
|
||||
test_args = [options.nplm_home + "/src/testNeuralNetwork", "--test_file", prep_file, "--model_file",
|
||||
model_file , "--minibatch_size", str(options.minibatch_size), "--num_threads", str(options.threads)]
|
||||
ret = subprocess.call(test_args)
|
||||
if ret: raise Exception("Testing failed")
|
||||
test_args = [
|
||||
options.nplm_home + "/src/testNeuralNetwork",
|
||||
"--test_file", prep_file,
|
||||
"--model_file", model_file,
|
||||
"--minibatch_size", str(options.minibatch_size),
|
||||
"--num_threads", str(options.threads),
|
||||
]
|
||||
ret = subprocess.call(test_args)
|
||||
if ret:
|
||||
raise Exception("Testing failed")
|
||||
|
||||
#$ROOT/src/prepareNeuralLM --train_text $TESTFILE1 --ngram_size $NGRAM_SIZE --ngramize 1 --vocab_size $INPUT_VOCAB_SIZE --words_file $WORKDIR/words --train_file $WORKDIR/ref.ngrams || exit 1
|
||||
# $ROOT/src/prepareNeuralLM --train_text $TESTFILE1 \
|
||||
# --ngram_size $NGRAM_SIZE --ngramize 1 --vocab_size $INPUT_VOCAB_SIZE \
|
||||
# --words_file $WORKDIR/words --train_file $WORKDIR/ref.ngrams || exit 1
|
||||
|
||||
# $ROOT/src/testNeuralNetwork --test_file $WORKDIR/ref.ngrams \
|
||||
# --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE \
|
||||
# --num_threads $THREADS || exit 1
|
||||
|
||||
#$ROOT/src/testNeuralNetwork --test_file $WORKDIR/ref.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
main()
|
||||
|
@@ -8,7 +8,9 @@ import subprocess
import sys
import os

logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = argparse.ArgumentParser()
parser.add_argument("-w", "--working-dir", dest="working_dir")
parser.add_argument("-c", "--corpus", dest="corpus_stem")
@@ -18,8 +20,10 @@ parser.add_argument("-n", "--ngram-size", dest="ngram_size", type=int)
parser.add_argument("-b", "--minibatch-size", dest="minibatch_size", type=int)
parser.add_argument("-s", "--noise", dest="noise", type=int)
parser.add_argument("-d", "--hidden", dest="hidden", type=int)
parser.add_argument("-i", "--input-embedding", dest="input_embedding", type=int)
parser.add_argument("-o", "--output-embedding", dest="output_embedding", type=int)
parser.add_argument(
    "-i", "--input-embedding", dest="input_embedding", type=int)
parser.add_argument(
    "-o", "--output-embedding", dest="output_embedding", type=int)
parser.add_argument("-t", "--threads", dest="threads", type=int)
parser.add_argument("-m", "--output-model", dest="output_model")
parser.add_argument("-r", "--output-dir", dest="output_dir")
@ -35,94 +39,109 @@ parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int)
|
||||
|
||||
|
||||
parser.set_defaults(
|
||||
working_dir = "working"
|
||||
,corpus_stem = "train.10k"
|
||||
,nplm_home = "/home/bhaddow/tools/nplm"
|
||||
,epochs = 10
|
||||
,ngram_size = 14
|
||||
,minibatch_size=1000
|
||||
,noise=100
|
||||
,hidden=750
|
||||
,input_embedding=150
|
||||
,output_embedding=150
|
||||
,threads=1
|
||||
,output_model = "train.10k"
|
||||
,output_dir = None
|
||||
,config_options_file = "config"
|
||||
,log_file = "log"
|
||||
,validation_file = None
|
||||
,activation_fn = "rectifier"
|
||||
,learning_rate = 1
|
||||
,input_words_file = None
|
||||
,output_words_file = None
|
||||
,input_vocab_size = 0
|
||||
,output_vocab_size = 0
|
||||
working_dir="working",
|
||||
corpus_stem="train.10k",
|
||||
nplm_home="/home/bhaddow/tools/nplm",
|
||||
epochs=10,
|
||||
ngram_size=14,
|
||||
minibatch_size=1000,
|
||||
noise=100,
|
||||
hidden=750,
|
||||
input_embedding=150,
|
||||
output_embedding=150,
|
||||
threads=1,
|
||||
output_model="train.10k",
|
||||
output_dir=None,
|
||||
config_options_file="config",
|
||||
log_file="log",
|
||||
validation_file=None,
|
||||
activation_fn="rectifier",
|
||||
learning_rate=1,
|
||||
input_words_file=None,
|
||||
output_words_file=None,
|
||||
input_vocab_size=0,
|
||||
output_vocab_size=0
|
||||
)
|
||||
|
||||
|
||||
def main(options):
|
||||
|
||||
vocab_command = []
|
||||
if options.input_words_file is not None:
|
||||
vocab_command += ['--input_words_file', options.input_words_file]
|
||||
if options.output_words_file is not None:
|
||||
vocab_command += ['--output_words_file', options.output_words_file]
|
||||
if options.input_vocab_size:
|
||||
vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
|
||||
if options.output_vocab_size:
|
||||
vocab_command += ['--output_vocab_size', str(options.output_vocab_size)]
|
||||
vocab_command = []
|
||||
if options.input_words_file is not None:
|
||||
vocab_command += ['--input_words_file', options.input_words_file]
|
||||
if options.output_words_file is not None:
|
||||
vocab_command += ['--output_words_file', options.output_words_file]
|
||||
if options.input_vocab_size:
|
||||
vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
|
||||
if options.output_vocab_size:
|
||||
vocab_command += [
|
||||
'--output_vocab_size', str(options.output_vocab_size)]
|
||||
|
||||
# Set up validation command variable to use with validation set.
|
||||
validations_command = []
|
||||
if options.validation_file is not None:
|
||||
validations_command =["--validation_file", (options.validation_file + ".numberized")]
|
||||
# Set up validation command variable to use with validation set.
|
||||
validations_command = []
|
||||
if options.validation_file is not None:
|
||||
validations_command = [
|
||||
"--validation_file", (options.validation_file + ".numberized")]
|
||||
|
||||
# In order to allow for different models to be trained after the same
|
||||
# preparation step, we should provide an option for multiple output directories
|
||||
# If we have not set output_dir, set it to the same thing as the working dir
|
||||
# In order to allow for different models to be trained after the same
|
||||
# preparation step, we should provide an option for multiple output
|
||||
# directories.
|
||||
# If we have not set output_dir, set it to the same thing as the working
|
||||
# dir.
|
||||
|
||||
if options.output_dir is None:
|
||||
options.output_dir = options.working_dir
|
||||
else:
|
||||
# Create output dir if necessary
|
||||
if not os.path.exists(options.output_dir):
|
||||
os.makedirs(options.output_dir)
|
||||
if options.output_dir is None:
|
||||
options.output_dir = options.working_dir
|
||||
else:
|
||||
# Create output dir if necessary
|
||||
if not os.path.exists(options.output_dir):
|
||||
os.makedirs(options.output_dir)
|
||||
|
||||
config_file = os.path.join(options.output_dir, options.config_options_file + '-' + options.output_model)
|
||||
log_file = os.path.join(options.output_dir, options.log_file + '-' + options.output_model)
|
||||
log_file_write = open(log_file, 'w')
|
||||
config_file_write = open(config_file, 'w')
|
||||
config_file = os.path.join(
|
||||
options.output_dir,
|
||||
options.config_options_file + '-' + options.output_model)
|
||||
log_file = os.path.join(
|
||||
options.output_dir, options.log_file + '-' + options.output_model)
|
||||
log_file_write = open(log_file, 'w')
|
||||
config_file_write = open(config_file, 'w')
|
||||
|
||||
config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
|
||||
config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
|
||||
|
||||
in_file = os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + ".numberized")
|
||||
in_file = os.path.join(
|
||||
options.working_dir,
|
||||
os.path.basename(options.corpus_stem) + ".numberized")
|
||||
|
||||
model_prefix = os.path.join(options.output_dir, options.output_model + ".model.nplm")
|
||||
train_args = [options.nplm_home + "/src/trainNeuralNetwork",
|
||||
"--train_file", in_file,
|
||||
"--num_epochs", str(options.epochs),
|
||||
"--model_prefix", model_prefix,
|
||||
"--learning_rate", str(options.learning_rate),
|
||||
"--minibatch_size", str(options.minibatch_size),
|
||||
"--num_noise_samples", str(options.noise),
|
||||
"--num_hidden", str(options.hidden),
|
||||
"--input_embedding_dimension", str(options.input_embedding),
|
||||
"--output_embedding_dimension", str(options.output_embedding),
|
||||
"--num_threads", str(options.threads),
|
||||
"--activation_function", options.activation_fn] + validations_command + vocab_command
|
||||
print("Train model command: ")
|
||||
print(', '.join(train_args))
|
||||
model_prefix = os.path.join(
|
||||
options.output_dir, options.output_model + ".model.nplm")
|
||||
train_args = [
|
||||
options.nplm_home + "/src/trainNeuralNetwork",
|
||||
"--train_file", in_file,
|
||||
"--num_epochs", str(options.epochs),
|
||||
"--model_prefix", model_prefix,
|
||||
"--learning_rate", str(options.learning_rate),
|
||||
"--minibatch_size", str(options.minibatch_size),
|
||||
"--num_noise_samples", str(options.noise),
|
||||
"--num_hidden", str(options.hidden),
|
||||
"--input_embedding_dimension", str(options.input_embedding),
|
||||
"--output_embedding_dimension", str(options.output_embedding),
|
||||
"--num_threads", str(options.threads),
|
||||
"--activation_function",
|
||||
options.activation_fn,
|
||||
] + validations_command + vocab_command
|
||||
print("Train model command: ")
|
||||
print(', '.join(train_args))
|
||||
|
||||
config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
|
||||
config_file_write.close()
|
||||
config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
|
||||
config_file_write.close()
|
||||
|
||||
log_file_write.write("Training output:\n")
|
||||
ret = subprocess.call(train_args, stdout=log_file_write, stderr=log_file_write)
|
||||
if ret:
|
||||
raise Exception("Training failed")
|
||||
log_file_write.write("Training output:\n")
|
||||
ret = subprocess.call(
|
||||
train_args, stdout=log_file_write, stderr=log_file_write)
|
||||
if ret:
|
||||
raise Exception("Training failed")
|
||||
|
||||
log_file_write.close()
|
||||
|
||||
log_file_write.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
options = parser.parse_args()
|
||||
main(options)
|
||||
|
||||
options = parser.parse_args()
|
||||
main(options)
|
||||
|
@@ -2,15 +2,27 @@
# -*- coding: utf-8 -*-
# Author: Rico Sennrich <sennrich [AT] cl.uzh.ch>

# This script creates tables that store phrase pair frequencies rather than probabilities.
# These count tables can be used for a delayed, online computation of the original phrase translation features
# The benefit is that models can be combined quickly, with the same results as if we trained a model on the concatenation of all data (excepting differences in word alignment).
# Also, each model can be given a weight, which is applied to all frequencies of the model for the combination.
# This script creates tables that store phrase pair frequencies rather than
# probabilities.
#
# These count tables can be used for a delayed, online computation of the
# original phrase translation features.
#
# The benefit is that models can be combined quickly, with the same results
# as if we trained a model on the concatenation of all data (excepting
# differences in word alignment).
#
# Also, each model can be given a weight, which is applied to all frequencies
# of the model for the combination.

# Note: the input phrase table must have alignment information;
#       it must be unsmoothed;
#       additionally, the phrase table type PhraseDictionaryMultiModelCounts requires the lexical counts files lex.counts.e2f and lex.counts.f2e (obtained by using the option --write-lexical-counts in train-model.perl)
# The results may differ from training on the concatenation of all data due to differences in word alignment, and rounding errors.
#       additionally, the phrase table type PhraseDictionaryMultiModelCounts
#       requires the lexical counts files lex.counts.e2f and lex.counts.f2e
#       (obtained by using the option --write-lexical-counts in
#       train-model.perl)
# The results may differ from training on the concatenation of all data due
# to differences in word alignment, and rounding errors.


from __future__ import unicode_literals
@ -21,11 +33,15 @@ from tempfile import NamedTemporaryFile
|
||||
from subprocess import Popen, PIPE
|
||||
|
||||
if len(sys.argv) < 3 or len(sys.argv) > 4:
|
||||
sys.stderr.write('Usage: ' + sys.argv[0] + ' in_file out_path [prune_count]\nThis script will create the files out_path/count-table.gz and out_path/count-table-target.gz\n')
|
||||
sys.stderr.write(
|
||||
'Usage: ' +
|
||||
sys.argv[0] + " in_file out_path [prune_count]\n"
|
||||
"This script will create the files out_path/count-table.gz and "
|
||||
"out_path/count-table-target.gz\n")
|
||||
exit()
|
||||
|
||||
|
||||
def handle_file(filename,action,fileobj=None,mode='r'):
|
||||
def handle_file(filename, action, fileobj=None, mode='r'):
|
||||
"""support reading either from stdin, plain file or gzipped file"""
|
||||
|
||||
if action == 'open':
|
||||
@ -33,21 +49,23 @@ def handle_file(filename,action,fileobj=None,mode='r'):
|
||||
if mode == 'r':
|
||||
mode = 'rb'
|
||||
|
||||
if mode == 'rb' and not filename == '-' and not os.path.exists(filename):
|
||||
if os.path.exists(filename+'.gz'):
|
||||
filename = filename+'.gz'
|
||||
if mode == 'rb' and filename != '-' and not os.path.exists(filename):
|
||||
if os.path.exists(filename + '.gz'):
|
||||
filename = filename + '.gz'
|
||||
else:
|
||||
sys.stderr.write('Error: unable to open file. ' + filename + ' - aborting.\n')
|
||||
sys.stderr.write(
|
||||
"Error: unable to open file. " +
|
||||
filename + " - aborting.\n")
|
||||
exit()
|
||||
|
||||
if filename.endswith('.gz'):
|
||||
fileobj = gzip.open(filename,mode)
|
||||
fileobj = gzip.open(filename, mode)
|
||||
|
||||
elif filename == '-':
|
||||
fileobj = sys.stdin
|
||||
|
||||
else:
|
||||
fileobj = open(filename,mode)
|
||||
fileobj = open(filename, mode)
|
||||
|
||||
return fileobj
|
||||
|
||||
@ -59,10 +77,13 @@ def sort_and_uniq(infile, outfile):
|
||||
|
||||
cmd = ['sort', infile]
|
||||
fobj = handle_file(outfile, 'open', mode='w')
|
||||
sys.stderr.write('Executing: LC_ALL=C ' + ' '.join(cmd) + ' | uniq | gzip -c > ' + outfile + '\n')
|
||||
p_sort = Popen(cmd, env={'LC_ALL':'C'}, stdout=PIPE)
|
||||
p_uniq = Popen(['uniq'], stdin = p_sort.stdout, stdout=PIPE)
|
||||
p_compress = Popen(['gzip', '-c'], stdin = p_uniq.stdout, stdout=fobj)
|
||||
sys.stderr.write(
|
||||
"Executing: LC_ALL=C " +
|
||||
' '.join(cmd) +
|
||||
' | uniq | gzip -c > ' + outfile + '\n')
|
||||
p_sort = Popen(cmd, env={'LC_ALL': 'C'}, stdout=PIPE)
|
||||
p_uniq = Popen(['uniq'], stdin=p_sort.stdout, stdout=PIPE)
|
||||
p_compress = Popen(['gzip', '-c'], stdin=p_uniq.stdout, stdout=fobj)
|
||||
p_compress.wait()
|
||||
fobj.close()
|
||||
|
||||
@ -89,9 +110,9 @@ def create_count_lines(fobj, countobj, countobj_target, prune=0):
|
||||
try:
|
||||
fst = comments[2]
|
||||
except IndexError:
|
||||
fst = str(int(round(float(scores[0])*float(ft)))).encode()
|
||||
fst = str(int(round(float(scores[0]) * float(ft)))).encode()
|
||||
|
||||
line[2] = b' '.join([fst,ft,fs])
|
||||
line[2] = b' '.join([fst, ft, fs])
|
||||
|
||||
if prune:
|
||||
if current_source != source:
|
||||
@ -106,8 +127,10 @@ def create_count_lines(fobj, countobj, countobj_target, prune=0):
|
||||
else:
|
||||
countobj.write(b' ||| '.join(line))
|
||||
|
||||
# target count file
|
||||
tline = b' ||| '.join([line[1], b'X', ft]) + b' ||| |||\n' # if you use string formatting to make this look nicer, you may break Python 3 compatibility.
|
||||
# Target count file.
|
||||
# If you use string formatting to make this look nicer, you may break
|
||||
# Python 3 compatibility.
|
||||
tline = b' ||| '.join([line[1], b'X', ft]) + b' ||| |||\n'
|
||||
countobj_target.write(tline)
|
||||
|
||||
if prune:
|
||||
@ -119,7 +142,8 @@ def create_count_lines(fobj, countobj, countobj_target, prune=0):
|
||||
|
||||
def write_batch(store_lines, outfile, prune):
|
||||
top20 = sorted(store_lines, reverse=True)[:prune]
|
||||
for score, original_pos, store_line in sorted(top20, key = lambda x: x[1]): #write in original_order
|
||||
# Write in original_order.
|
||||
for score, original_pos, store_line in sorted(top20, key=lambda x: x[1]):
|
||||
outfile.write(store_line)
|
||||
|
||||
|
||||
@ -130,21 +154,28 @@ if __name__ == '__main__':
|
||||
else:
|
||||
prune = 0
|
||||
|
||||
fileobj = handle_file(sys.argv[1],'open')
|
||||
fileobj = handle_file(sys.argv[1], 'open')
|
||||
out_path = sys.argv[2]
|
||||
|
||||
count_table_file = gzip.open(os.path.join(out_path,'count-table.gz'), 'w')
|
||||
count_table_target_file = os.path.join(out_path,'count-table-target.gz')
|
||||
count_table_file = gzip.open(
|
||||
os.path.join(out_path, 'count-table.gz'), 'w')
|
||||
count_table_target_file = os.path.join(out_path, 'count-table-target.gz')
|
||||
|
||||
count_table_target_file_temp = NamedTemporaryFile(delete=False)
|
||||
try:
|
||||
sys.stderr.write('Creating temporary file for unsorted target counts file: ' + count_table_target_file_temp.name + '\n')
|
||||
sys.stderr.write(
|
||||
"Creating temporary file for unsorted target counts file: " +
|
||||
count_table_target_file_temp.name + '\n')
|
||||
|
||||
create_count_lines(fileobj, count_table_file, count_table_target_file_temp, prune)
|
||||
create_count_lines(
|
||||
fileobj, count_table_file, count_table_target_file_temp, prune)
|
||||
count_table_target_file_temp.close()
|
||||
sys.stderr.write('Finished writing, now re-sorting and compressing target count file\n')
|
||||
sys.stderr.write(
|
||||
"Finished writing, "
|
||||
"now re-sorting and compressing target count file.\n")
|
||||
|
||||
sort_and_uniq(count_table_target_file_temp.name, count_table_target_file)
|
||||
sort_and_uniq(
|
||||
count_table_target_file_temp. name, count_table_target_file)
|
||||
os.remove(count_table_target_file_temp.name)
|
||||
sys.stderr.write('Done\n')
|
||||
|
||||
|
@@ -1,10 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# add flexibility scores to a phrase table half
# you usually don't have to call this script directly; to add flexibility scores to your model, run train-model.perl with the option "--flexibility-score" (will only affect steps 5 and 6)
# usage: python flexibility_score.py extract.context(.inv).sorted [--Inverse] [--Hierarchical] < phrasetable > output_file

# author: Rico Sennrich

"""Add flexibility scores to a phrase table half.

You usually don't have to call this script directly; to add flexibility
scores to your model, run train-model.perl with the option
"--flexibility-score" (will only affect steps 5 and 6).

Usage:
    python flexibility_score.py extract.context(.inv).sorted \
        [--Inverse] [--Hierarchical] < phrasetable > output_file
"""

from __future__ import division
from __future__ import unicode_literals
@ -12,26 +21,28 @@ import sys
|
||||
import gzip
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
class FlexScore:
|
||||
|
||||
def __init__(self, inverted, hierarchical):
|
||||
self.inverted = inverted
|
||||
self.hierarchical = hierarchical
|
||||
|
||||
def store_pt(self, obj):
|
||||
"""Store line in dictionary.
|
||||
|
||||
def store_pt(self,obj):
|
||||
"""store line in dictionary; if we work with inverted phrase table, swap the two phrases"""
|
||||
src,target = obj[0],obj[1]
|
||||
If we work with inverted phrase table, swap the two phrases.
|
||||
"""
|
||||
src, target = obj[0], obj[1]
|
||||
|
||||
if self.inverted:
|
||||
src, target = target, src
|
||||
|
||||
self.phrase_pairs[src][target] = obj
|
||||
|
||||
|
||||
def update_contextcounts(self, obj):
|
||||
"""count the number of contexts a phrase pair occurs in"""
|
||||
src,target = obj[0],obj[1]
|
||||
src, target = obj[0], obj[1]
|
||||
self.context_counts[src][target] += 1
|
||||
if obj[-1].startswith(b'<'):
|
||||
self.context_counts_l[src][target] += 1
|
||||
@ -40,18 +51,21 @@ class FlexScore:
|
||||
elif obj[-1].startswith(b'v'):
|
||||
self.context_counts_d[src][target] += 1
|
||||
else:
|
||||
sys.stderr.write(b'\nERROR in line: {0}\n'.format(b' ||| '.join(obj)))
|
||||
sys.stderr.write(b'ERROR: expecting one of \'<, >, v\' as context marker in context extract file\n')
|
||||
sys.stderr.write(
|
||||
b"\nERROR in line: {0}\n".format(b' ||| '.join(obj)))
|
||||
sys.stderr.write(
|
||||
b"ERROR: expecting one of '<, >, v' as context marker "
|
||||
"in context extract file.\n")
|
||||
raise ValueError
|
||||
|
||||
|
||||
def traverse_incrementally(self,phrasetable,flexfile):
|
||||
"""traverse phrase table and phrase extract file (with context information) incrementally
|
||||
without storing all in memory."""
|
||||
def traverse_incrementally(self, phrasetable, flexfile):
|
||||
"""Traverse phrase table and phrase extract file (with context
|
||||
information) incrementally without storing all in memory.
|
||||
"""
|
||||
|
||||
increment = b''
|
||||
old_increment = 1
|
||||
stack = ['']*2
|
||||
stack = [''] * 2
|
||||
|
||||
# which phrase to use for sorting
|
||||
sort_pt = 0
|
||||
@ -63,10 +77,10 @@ class FlexScore:
|
||||
old_increment = increment
|
||||
|
||||
self.phrase_pairs = defaultdict(dict)
|
||||
self.context_counts = defaultdict(lambda:defaultdict(int))
|
||||
self.context_counts_l = defaultdict(lambda:defaultdict(int))
|
||||
self.context_counts_r = defaultdict(lambda:defaultdict(int))
|
||||
self.context_counts_d = defaultdict(lambda:defaultdict(int))
|
||||
self.context_counts = defaultdict(lambda: defaultdict(int))
|
||||
self.context_counts_l = defaultdict(lambda: defaultdict(int))
|
||||
self.context_counts_r = defaultdict(lambda: defaultdict(int))
|
||||
self.context_counts_d = defaultdict(lambda: defaultdict(int))
|
||||
|
||||
if stack[0]:
|
||||
self.store_pt(stack[0])
|
||||
@ -96,30 +110,32 @@ class FlexScore:
|
||||
|
||||
yield 1
|
||||
|
||||
|
||||
def main(self,phrasetable,flexfile,output_object):
|
||||
def main(self, phrasetable, flexfile, output_object):
|
||||
|
||||
i = 0
|
||||
sys.stderr.write('Incrementally loading phrase table and adding flexibility score...')
|
||||
for block in self.traverse_incrementally(phrasetable,flexfile):
|
||||
sys.stderr.write(
|
||||
"Incrementally loading phrase table "
|
||||
"and adding flexibility score...")
|
||||
for block in self.traverse_incrementally(phrasetable, flexfile):
|
||||
|
||||
self.flexprob_l = normalize(self.context_counts_l)
|
||||
self.flexprob_r = normalize(self.context_counts_r)
|
||||
self.flexprob_d = normalize(self.context_counts_d)
|
||||
|
||||
for src in sorted(self.phrase_pairs, key = lambda x: x + b' |'):
|
||||
for target in sorted(self.phrase_pairs[src], key = lambda x: x + b' |'):
|
||||
# TODO: Why this lambda? It doesn't affect sorting, does it?
|
||||
sortkey = lambda x: x + b' |'
|
||||
for src in sorted(self.phrase_pairs, key=sortkey):
|
||||
for target in sorted(self.phrase_pairs[src], key=sortkey):
|
||||
|
||||
if not i % 1000000:
|
||||
if i % 1000000 == 0:
|
||||
sys.stderr.write('.')
|
||||
i += 1
|
||||
|
||||
outline = self.write_phrase_table(src,target)
|
||||
outline = self.write_phrase_table(src, target)
|
||||
output_object.write(outline)
|
||||
sys.stderr.write('done\n')
|
||||
|
||||
|
||||
def write_phrase_table(self,src,target):
|
||||
def write_phrase_table(self, src, target):
|
||||
|
||||
line = self.phrase_pairs[src][target]
|
||||
flexscore_l = b"{0:.6g}".format(self.flexprob_l[src][target])
|
||||
@ -136,7 +152,6 @@ class FlexScore:
|
||||
return b' ||| '.join(line) + b'\n'
|
||||
|
||||
|
||||
|
||||
def normalize(d):
|
||||
|
||||
out_dict = defaultdict(dict)
|
||||
@ -145,7 +160,7 @@ def normalize(d):
|
||||
total = sum(d[src].values())
|
||||
|
||||
for target in d[src]:
|
||||
out_dict[src][target] = d[src][target]/total
|
||||
out_dict[src][target] = d[src][target] / total
|
||||
|
||||
return out_dict
|
||||
|
||||
@ -153,7 +168,10 @@ def normalize(d):
|
||||
if __name__ == '__main__':
|
||||
|
||||
if len(sys.argv) < 1:
|
||||
sys.stderr.write('Usage: python flexibility_score.py extract.context(.inv).sorted [--Inverse] [--Hierarchical] < phrasetable > output_file\n')
|
||||
sys.stderr.write(
|
||||
"Usage: "
|
||||
"python flexibility_score.py extract.context(.inv).sorted "
|
||||
"[--Inverse] [--Hierarchical] < phrasetable > output_file\n")
|
||||
exit()
|
||||
|
||||
flexfile = sys.argv[1]
|
||||
@ -168,4 +186,4 @@ if __name__ == '__main__':
|
||||
hierarchical = False
|
||||
|
||||
FS = FlexScore(inverted, hierarchical)
|
||||
FS.main(sys.stdin,gzip.open(flexfile,'r'),sys.stdout)
|
||||
FS.main(sys.stdin, gzip.open(flexfile, 'r'), sys.stdout)
|
||||
|