Barry's training scripts with some minor changes by me

Abmayne 2014-09-16 14:23:47 +01:00 committed by Paul Baltescu
parent 97b7c766e4
commit 4af68a0d1a
7 changed files with 476 additions and 0 deletions

9
scripts/nplm-training/README

@@ -0,0 +1,9 @@
Example usage:
# Create the training and test corpora
/home/abmayne/code/deepathon/nnjm/extract_training.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus corpus/europarl.clean.10k --target-language cs --source-language en --align corpus/europarl.clean.10k.align
/home/abmayne/code/deepathon/nnjm/extract_test.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus corpus/europarl.test.10k --target-language cs --source-language en --align corpus/europarl.test.10k.align
# Train and test different language models with basic NPLM training
/home/abmayne/code/deepathon/nnjm/train_nplm.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus europarl.clean.10k --minibatch-size 128 --epochs 40 --nplm-home /home/abmayne/code/deepathon/nplm_one_layer --hidden 0 --threads 1 --output-model europarl.10k.1layer
/home/abmayne/code/deepathon/nnjm/test_nplm.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus europarl.test.10k --train-corpus europarl.10k.1layer --nplm-home /home/abmayne/code/deepathon/nplm_one_layer --threads 1
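# Note: each line of the extracted .ngrams file holds (2m+1) source-window
# tokens followed by n target tokens (n-1 words of target history plus the
# predicted word); with the extract_training.py defaults m=4, n=5 this gives
# the 14-gram size that train_nplm.py and test_nplm.py expect by default.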

116
scripts/nplm-training/extract.py Executable file

@@ -0,0 +1,116 @@
#!/usr/bin/env python
from collections import Counter
import heapq
import logging
import optparse
import sys
LOG = logging.getLogger(__name__)
BOS = "<s>"
EOS = "</s>"
UNK = "<unk>"
def replace_tags(tokens,tags,vocab):
  for i,t in enumerate(tokens):
    if t not in vocab:
      if i < len(tags):
        tokens[i] = tags[i]
      else:
        print >>sys.stderr, "Error: missing tag for token index", i
        print >>sys.stderr, ' '.join(tokens)
        print >>sys.stderr, ' '.join(tags)
        tokens[i] = UNK
def replace_unks(tokens,vocab):
  for i,t in enumerate(tokens):
    if t not in vocab:
      tokens[i] = UNK
def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,tlang, m, n, ofh):
"""
m - source context
n - target context
returns set of tags used
"""
tags = set()
sfh = open(corpus_stem + "." + slang)
tfh = open(corpus_stem + "." + tlang)
afh = open(align_file)
fhs = [sfh,tfh,afh]
if tagged_stem:
fhs.append(open(tagged_stem + "." + slang))
fhs.append(open(tagged_stem + "." + tlang))
count = 0
ngrams = 0
LOG.info("Extracting ngrams")
for lines in zip(*fhs):
stokens = lines[0][:-1].split()
ttokens = lines[1][:-1].split()
if tagged_stem:
stags = lines[3][:-1].split()
ttags = lines[4][:-1].split()
tags.update(stags)
tags.update(ttags)
replace_tags(stokens,stags,svocab)
replace_tags(ttokens,ttags,tvocab)
else:
replace_unks(stokens,svocab)
replace_unks(ttokens,tvocab)
# list aligns for each target
# Note: align specifies source -> target
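    # e.g. the alignment token "0-1" means source word 0 aligns to target word 1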
target_aligns = [[] for t in range(len(ttokens))]
for atoken in lines[2][:-1].split():
spos,tpos = atoken.split("-")
spos,tpos = int(spos), int(tpos)
target_aligns[tpos].append(spos)
for tpos,spos_list in enumerate(target_aligns):
      # Affiliation heuristic - see Devlin et al., p. 1371
if not spos_list:
#tpos has no alignment, look right, then left, then right-right, then left-left etc
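        # e.g. if target words 0 and 2 are aligned but word 1 is not, word 1
        # borrows the alignment of word 2 (the right neighbour is tried before
        # the left one at each distance)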
rpos = tpos+1
lpos = tpos-1
while rpos < len(ttokens) or lpos >= 0:
if rpos < len(ttokens) and target_aligns[rpos]:
spos_list = target_aligns[rpos]
break
if lpos >= 0 and target_aligns[lpos]:
spos_list = target_aligns[lpos]
break
rpos += 1
lpos -= 1
if not spos_list:
raise Exception("No alignments in sentence \nSRC: " + lines[0][:-1] + "\nTGT: " + lines[1][:-1])
      spos = (max(spos_list) + min(spos_list)) / 2 # midpoint of the aligned source positions, rounding down
# source-context, target-context, predicted word
for i in range(max(0,m-spos)):
print>>ofh, BOS,
#print [spos-m/2,spos+m/2+1], stokens[spos-m/2:spos+m/2+1]
print>>ofh, " ".join([s for s in stokens[max(0,spos-m):spos+m+1]]),
for i in range(max(0,spos+m+1-len(stokens))):
print>>ofh, EOS,
for i in range(max(0,n-(tpos+1))):
print>>ofh, BOS,
print>>ofh, " ".join([t for t in ttokens[max(0,tpos+1-n):tpos+1]]),
print>>ofh
ngrams += 1
count += 1
if count % 1000 == 0: sys.stderr.write(".")
if count % 50000 == 0: sys.stderr.write(" [%d]\n" % count)
ofh.close()
sys.stderr.write("\n")
LOG.info("Extracted %d ngrams" % ngrams)
return tags

73
scripts/nplm-training/extract_test.py Executable file

@@ -0,0 +1,73 @@
#!/usr/bin/env python
#
# Create a test corpus, using a previously pruned vocabulary.
#
import logging
import optparse
import os
import os.path
import sys
import extract

LOG = logging.getLogger(__name__)
def read_vocab(filename):
vocab = set()
for line in open(filename):
vocab.add(line[:-1])
return vocab
def main():
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = optparse.OptionParser("%prog [options]")
parser.add_option("-e", "--target-language", type="string", dest="target_language")
parser.add_option("-f", "--source-language", type="string", dest="source_language")
parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
parser.add_option("-a", "--align", type="string", dest="align_file")
parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
parser.set_defaults(
target_language = "en",
source_language = "de",
corpus_stem = "test",
align_file = "test.align",
working_dir = "working",
)
options,args = parser.parse_args(sys.argv)
if not os.path.exists(options.working_dir):
LOG.error("Working directory '%s' not found" % working_dir)
sys.exit(1)
m,n = None,None
for line in open(options.working_dir + "/info"):
name,value = line[:-1].split()
if name == "m": m = int(value)
if name == "n": n = int(value)
  if m is None or n is None:
LOG.error("info file is incomplete")
sys.exit(1)
svocab = read_vocab(options.working_dir + "/vocab.source")
tvocab = read_vocab(options.working_dir + "/vocab.target")
file_stem = os.path.basename(options.corpus_stem)
ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
extract.get_ngrams(options.corpus_stem,
options.align_file,
options.tagged_stem,
svocab,
tvocab,
options.source_language,
options.target_language,
m,
n,
ofh)
if __name__ == "__main__":
main()

115
scripts/nplm-training/extract_training.py Executable file

@@ -0,0 +1,115 @@
#!/usr/bin/env python
from collections import Counter
import logging
import optparse
import os
import os.path
import sys
import extract
LOG = logging.getLogger(__name__)
def get_pruned_vocab(corpus,prune):
counts = Counter()
LOG.info("Reading vocabulary from %s" % corpus)
lines = 0
for line in open(corpus):
for token in line[:-1].split():
counts[token] += 1
lines += 1
if lines % 1000 == 0: sys.stderr.write(".")
if lines % 50000 == 0: sys.stderr.write(" [%d]\n" % lines)
sys.stderr.write("\n")
LOG.info("Vocabulary size: %d" % len(counts))
if prune:
return set([c[0] for c in counts.most_common(prune)])
else:
return set(counts.keys())
def save_vocab(directory, filename, vocab):
  fh = open(directory + "/" + filename, "w")
  for word in vocab:
    print>>fh, word
  fh.close()
def main():
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = optparse.OptionParser("%prog [options]")
parser.add_option("-e", "--target-language", type="string", dest="target_language")
parser.add_option("-f", "--source-language", type="string", dest="source_language")
parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
parser.add_option("-a", "--align", type="string", dest="align_file")
parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
parser.add_option("-n", "--target-context", type="int", dest="n")
parser.add_option("-m", "--source-context", type="int", dest="m")
parser.add_option("-p", "--prune-vocab", type="int", dest="prune")
parser.set_defaults(
target_language = "en",
source_language = "de",
corpus_stem = "train.10k",
align_file = "train.10k.align",
n = 5,
m = 4,
working_dir = "working",
prune=16000
)
options,args = parser.parse_args(sys.argv)
if not os.path.exists(options.working_dir):
os.makedirs(options.working_dir)
else:
LOG.warn("Directory %s already exists, re-using" % options.working_dir)
info_file = options.working_dir + "/info"
if os.path.exists(info_file):
for line in open(info_file):
name,value = line[:-1].split()
if name == "n" and int(value) != options.n or \
name == "m" and int(value) != options.m:
LOG.error("info file exists, but parameters do not match. Delete working directory and rerun")
sys.exit(1)
else:
ifh = open(info_file,"w")
print>>ifh,"m",options.m
print>>ifh,"n",options.n
ifh.close()
scorpus = options.corpus_stem + "." + options.source_language
tcorpus = options.corpus_stem + "." + options.target_language
tvocab,svocab = None,None
# Extract vocabulary, and prune, if required
svocab = get_pruned_vocab(scorpus,options.prune)
tvocab = get_pruned_vocab(tcorpus,options.prune)
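  # Tokens outside the pruned vocabulary are mapped to <unk> (or to their
  # POS tag when --tagged-corpus is given) in extract.get_ngrams.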
file_stem = os.path.basename(options.corpus_stem)
ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
ofh = open(ngram_file, "w")
tags = extract.get_ngrams(options.corpus_stem,
options.align_file,
options.tagged_stem,
svocab,
tvocab,
options.source_language,
options.target_language,
options.m,
options.n,
ofh)
# Save vocabularies
svocab.add(extract.BOS)
tvocab.add(extract.EOS)
save_vocab(options.working_dir, "vocab.source", svocab)
save_vocab(options.working_dir, "vocab.target", tvocab)
vocab = svocab.union(tvocab)
vocab.update(tags)
save_vocab(options.working_dir, "vocab", vocab)
if __name__ == "__main__":
main()

17
scripts/nplm-training/tag.sh Executable file

@@ -0,0 +1,17 @@
#!/bin/sh
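# Produces $stem.tagged.$lang POS-factor files, which can be passed to
# extract_training.py / extract_test.py via --tagged-corpus so that
# out-of-vocabulary words are replaced by their POS tags.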
WRAP_DIR=~/moses.new/scripts/training/wrappers/
tagger=$WRAP_DIR/make-factor-en-pos.mxpost.perl
lang=en
for stem in test train.10k train.100k; do
$tagger -mxpost /home/pkoehn/statmt/project/mxpost $stem.$lang $stem.tagged.$lang /tmp
done
tagger=$WRAP_DIR/make-factor-de-pos.perl
lang=de
for stem in test train.10k train.100k; do
$tagger $stem.$lang $stem.tagged.$lang /tmp
done

57
scripts/nplm-training/test_nplm.py Executable file

@@ -0,0 +1,57 @@
#!/usr/bin/env python
import logging
import optparse
import subprocess
import sys
def main():
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = optparse.OptionParser("%prog [options]")
parser.add_option("-w", "--working-dir", dest="working_dir")
parser.add_option("-c", "--corpus", dest="corpus_stem")
parser.add_option("-r", "--train-corpus", dest="train_stem")
parser.add_option("-l", "--nplm-home", dest="nplm_home")
parser.add_option("-e", "--epoch", dest="epoch", type="int")
parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
parser.add_option("-t", "--threads", dest="threads", type="int")
parser.set_defaults(
working_dir = "working"
,corpus_stem = "test"
,train_stem = "train.10k"
,nplm_home = "/home/bhaddow/tools/nplm"
,epoch=10
,ngram_size = 14
,minibatch_size=1000
,threads=8
)
options,args = parser.parse_args(sys.argv)
model_prefix = options.working_dir + "/" + options.train_stem + ".model.nplm"
model_file = model_prefix + "." + str(options.epoch)
test_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
prep_file = options.working_dir + "/" + options.corpus_stem + ".prepared"
vocab_file = options.working_dir + "/vocab"
#TODO: Get ngram size from info file.
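  # (sketch, not wired in: extract_training.py records m and n in
  # working_dir/info, so the size could be derived as 2*m + 1 + n instead
  # of being passed via --ngram-size)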
prep_args = [options.nplm_home + "/src/prepareNeuralLM", "--train_text", test_file, "--ngram_size",
str(options.ngram_size), "--ngramize", "0", "--words_file", vocab_file, "--train_file", prep_file]
ret = subprocess.call(prep_args)
if ret: raise Exception("Preparation failed")
test_args = [options.nplm_home + "/src/testNeuralNetwork", "--test_file", prep_file, "--model_file",
model_file , "--minibatch_size", str(options.minibatch_size), "--num_threads", str(options.threads)]
ret = subprocess.call(test_args)
if ret: raise Exception("Testing failed")
#$ROOT/src/prepareNeuralLM --train_text $TESTFILE1 --ngram_size $NGRAM_SIZE --ngramize 1 --vocab_size $INPUT_VOCAB_SIZE --words_file $WORKDIR/words --train_file $WORKDIR/ref.ngrams || exit 1
#$ROOT/src/testNeuralNetwork --test_file $WORKDIR/ref.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
if __name__ == "__main__":
main()

89
scripts/nplm-training/train_nplm.py Executable file

@@ -0,0 +1,89 @@
#!/usr/bin/env python
import logging
import optparse
import subprocess
import sys
def main():
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = optparse.OptionParser("%prog [options]")
parser.add_option("-w", "--working-dir", dest="working_dir")
parser.add_option("-c", "--corpus", dest="corpus_stem")
parser.add_option("-l", "--nplm-home", dest="nplm_home")
parser.add_option("-e", "--epochs", dest="epochs", type="int")
parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
parser.add_option("-s", "--noise", dest="noise", type="int")
parser.add_option("-d", "--hidden", dest="hidden", type="int")
parser.add_option("-i", "--input-embedding", dest="input_embedding", type="int")
parser.add_option("-o", "--output-embedding", dest="output_embedding", type="int")
parser.add_option("-t", "--threads", dest="threads", type="int")
parser.add_option("-m", "--output-model", dest="output_model")
parser.set_defaults(
working_dir = "working"
,corpus_stem = "train.10k"
,nplm_home = "/home/bhaddow/tools/nplm"
,epochs = 10
,ngram_size = 14
,minibatch_size=1000
,noise=100
,hidden=750
,input_embedding=150
,output_embedding=150
,threads=8
,output_model = "train.10k"
)
options,args = parser.parse_args(sys.argv)
in_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
vocab_file = options.working_dir + "/vocab"
prep_file = options.working_dir + "/" + options.output_model + ".prepared"
prep_args = [options.nplm_home + "/src/prepareNeuralLM", "--train_text", in_file, "--ngram_size", \
str(options.ngram_size), "--ngramize", "0", "--words_file", vocab_file, "--train_file", prep_file ]
print "Prepare model command: "
print ', '.join(prep_args)
ret = subprocess.call(prep_args)
if ret: raise Exception("Prepare failed")
model_prefix = options.working_dir + "/" + options.output_model + ".model.nplm"
train_args = [options.nplm_home + "/src/trainNeuralNetwork", "--train_file", prep_file, "--num_epochs", str(options.epochs),
"--input_words_file", vocab_file, "--output_words_file", vocab_file, "--model_prefix",
model_prefix, "--learning_rate", "1", "--minibatch_size", str(options.minibatch_size),
"--num_noise_samples", str(options.noise), "--num_hidden", str(options.hidden), "--input_embedding_dimension",
str(options.input_embedding), "--output_embedding_dimension", str(options.output_embedding), "--num_threads",
str(options.threads)]
print "Train model command: "
print ', '.join(train_args)
ret = subprocess.call(train_args)
if ret: raise Exception("Training failed")
if __name__ == "__main__":
main()
#EPOCHS=10
#NGRAM_SIZE=14
#MINIBATCH_SIZE=1000
#NOISE=100
#HIDDEN=750
#INPUT_EMBEDDING=150
#OUTPUT_EMBEDDING=150
#THREADS=8
#
#$ROOT/src/prepareNeuralLM --train_text $INFILE --ngram_size $NGRAM_SIZE --ngramize 0 --words_file $VOCAB --train_file $WORKDIR/train.ngrams || exit 1
#$ROOT/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams \
# --num_epochs $EPOCHS --input_words_file $VOCAB --output_words_file $VOCAB --model_prefix $WORKDIR/$PREFIX \
# --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
# --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1