Barry's training scripts with some minor changes by me
parent 97b7c766e4
commit 4af68a0d1a
9 scripts/nplm-training/README Normal file
@@ -0,0 +1,9 @@
Example usage:

#create training and test corpus
/home/abmayne/code/deepathon/nnjm/extract_training.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus corpus/europarl.clean.10k --target-language cs --source-language en --align corpus/europarl.clean.10k.align
/home/abmayne/code/deepathon/nnjm/extract_test.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus corpus/europarl.test.10k --target-language cs --source-language en --align corpus/europarl.test.10k.align

#Train and test different language models with basic nplm training
/home/abmayne/code/deepathon/nnjm/train_nplm.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus europarl.clean.10k --minibatch-size 128 --epochs 40 --nplm-home /home/abmayne/code/deepathon/nplm_one_layer --hidden 0 --threads 1 --output-model europarl.10k.1layer
/home/abmayne/code/deepathon/nnjm/test_nplm.py --working-dir /home/abmayne/experiments/2014-iwslt/nplm/en-cs10k --corpus europarl.test.10k --train-corpus europarl.10k.1layer --nplm-home /home/abmayne/code/deepathon/nplm_one_layer --threads 1
116 scripts/nplm-training/extract.py Executable file
@@ -0,0 +1,116 @@
#!/usr/bin/env python

import logging
import sys

LOG = logging.getLogger(__name__)

BOS = "<s>"
EOS = "</s>"
UNK = "<unk>"


def replace_tags(tokens, tags, vocab):
  # Replace out-of-vocabulary tokens with their POS tag, falling back to <unk>.
  for i, t in enumerate(tokens):
    if t not in vocab:
      if i < len(tags):
        tokens[i] = tags[i]
      else:
        print "Error: missing tags for index i:", i
        print ' '.join(tokens)
        print ' '.join(tags)
        tokens[i] = UNK


def replace_unks(tokens, vocab):
  # Replace out-of-vocabulary tokens with <unk>.
  for i, t in enumerate(tokens):
    if t not in vocab:
      tokens[i] = UNK


def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang, tlang, m, n, ofh):
  """
  m - source context
  n - target context

  returns set of tags used
  """
  tags = set()
  sfh = open(corpus_stem + "." + slang)
  tfh = open(corpus_stem + "." + tlang)
  afh = open(align_file)
  fhs = [sfh, tfh, afh]
  if tagged_stem:
    fhs.append(open(tagged_stem + "." + slang))
    fhs.append(open(tagged_stem + "." + tlang))

  count = 0
  ngrams = 0
  LOG.info("Extracting ngrams")
  for lines in zip(*fhs):
    stokens = lines[0][:-1].split()
    ttokens = lines[1][:-1].split()
    if tagged_stem:
      stags = lines[3][:-1].split()
      ttags = lines[4][:-1].split()
      tags.update(stags)
      tags.update(ttags)
      replace_tags(stokens, stags, svocab)
      replace_tags(ttokens, ttags, tvocab)
    else:
      replace_unks(stokens, svocab)
      replace_unks(ttokens, tvocab)

    # List source alignments for each target position.
    # Note: align specifies source -> target
    target_aligns = [[] for t in range(len(ttokens))]
    for atoken in lines[2][:-1].split():
      spos, tpos = atoken.split("-")
      spos, tpos = int(spos), int(tpos)
      target_aligns[tpos].append(spos)

    for tpos, spos_list in enumerate(target_aligns):
      # Affiliation heuristic - see Devlin et al. 2014, p1371
      if not spos_list:
        # tpos has no alignment: look right, then left, then right-right, then left-left, etc.
        rpos = tpos + 1
        lpos = tpos - 1
        while rpos < len(ttokens) or lpos >= 0:
          if rpos < len(ttokens) and target_aligns[rpos]:
            spos_list = target_aligns[rpos]
            break
          if lpos >= 0 and target_aligns[lpos]:
            spos_list = target_aligns[lpos]
            break
          rpos += 1
          lpos -= 1

      if not spos_list:
        raise Exception("No alignments in sentence \nSRC: " + lines[0][:-1] + "\nTGT: " + lines[1][:-1])
      # The affiliated source position is the midpoint of the aligned source positions.
      spos = (max(spos_list) + min(spos_list)) / 2

      # Write source-context, target-context, predicted word.
      for i in range(max(0, m - spos)):
        print>>ofh, BOS,
      print>>ofh, " ".join(stokens[max(0, spos - m):spos + m + 1]),
      for i in range(max(0, spos + m + 1 - len(stokens))):
        print>>ofh, EOS,
      for i in range(max(0, n - (tpos + 1))):
        print>>ofh, BOS,
      print>>ofh, " ".join(ttokens[max(0, tpos + 1 - n):tpos + 1]),
      print>>ofh
      ngrams += 1

    count += 1
    if count % 1000 == 0: sys.stderr.write(".")
    if count % 50000 == 0: sys.stderr.write(" [%d]\n" % count)

  ofh.close()
  sys.stderr.write("\n")
  LOG.info("Extracted %d ngrams" % ngrams)
  return tags
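The affiliation heuristic implemented inside get_ngrams() above (Devlin et al. 2014) is easiest to follow in isolation. The following is a minimal, self-contained sketch of the same search, not part of the committed files; the function name and the toy alignment are only illustrative.

def affiliated_source_pos(target_aligns, tpos):
  # target_aligns[t] is the list of source positions aligned to target position t.
  # An unaligned target word inherits the alignment of the nearest aligned
  # neighbour: right, then left, then right-right, then left-left, and so on.
  spos_list = target_aligns[tpos]
  rpos, lpos = tpos + 1, tpos - 1
  while not spos_list and (rpos < len(target_aligns) or lpos >= 0):
    if rpos < len(target_aligns) and target_aligns[rpos]:
      spos_list = target_aligns[rpos]
      break
    if lpos >= 0 and target_aligns[lpos]:
      spos_list = target_aligns[lpos]
      break
    rpos += 1
    lpos -= 1
  if not spos_list:
    return None
  # The affiliated source word is the midpoint of the aligned source positions.
  return (max(spos_list) + min(spos_list)) // 2

# Toy example: target position 2 is unaligned, so it inherits the alignment
# of its right neighbour, which points at source position 4.
print(affiliated_source_pos([[0], [1, 2], [], [4]], 2))  # -> 4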
73 scripts/nplm-training/extract_test.py Executable file
@@ -0,0 +1,73 @@
#!/usr/bin/env python

#
# Create a test corpus, using a previously pruned vocabulary.
#

import logging
import optparse
import os
import os.path
import sys

import extract

LOG = logging.getLogger(__name__)


def read_vocab(filename):
  vocab = set()
  for line in open(filename):
    vocab.add(line[:-1])
  return vocab


def main():
  logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
  parser = optparse.OptionParser("%prog [options]")
  parser.add_option("-e", "--target-language", type="string", dest="target_language")
  parser.add_option("-f", "--source-language", type="string", dest="source_language")
  parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
  parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
  parser.add_option("-a", "--align", type="string", dest="align_file")
  parser.add_option("-w", "--working-dir", type="string", dest="working_dir")

  parser.set_defaults(
    target_language = "en",
    source_language = "de",
    corpus_stem = "test",
    align_file = "test.align",
    working_dir = "working",
  )
  options, args = parser.parse_args(sys.argv)

  if not os.path.exists(options.working_dir):
    LOG.error("Working directory '%s' not found" % options.working_dir)
    sys.exit(1)

  # Read the source/target context sizes recorded by extract_training.py.
  m, n = None, None
  for line in open(options.working_dir + "/info"):
    name, value = line[:-1].split()
    if name == "m": m = int(value)
    if name == "n": n = int(value)
  if m is None or n is None:
    LOG.error("info file is incomplete")
    sys.exit(1)

  svocab = read_vocab(options.working_dir + "/vocab.source")
  tvocab = read_vocab(options.working_dir + "/vocab.target")

  file_stem = os.path.basename(options.corpus_stem)
  ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
  extract.get_ngrams(options.corpus_stem,
                     options.align_file,
                     options.tagged_stem,
                     svocab,
                     tvocab,
                     options.source_language,
                     options.target_language,
                     m,
                     n,
                     ofh)


if __name__ == "__main__":
  main()
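extract_test.py reads the context sizes from the small info file that extract_training.py writes into the working directory; neither script documents its layout, so as an illustration (not part of the commit), with the default --source-context 4 and --target-context 5 the file working/info holds exactly two whitespace-separated name/value lines:

  m 4
  n 5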
115 scripts/nplm-training/extract_training.py Executable file
@@ -0,0 +1,115 @@
#!/usr/bin/env python

from collections import Counter
import logging
import optparse
import os
import os.path
import sys

import extract

LOG = logging.getLogger(__name__)


def get_pruned_vocab(corpus, prune):
  # Count token frequencies and keep the 'prune' most frequent types.
  counts = Counter()
  LOG.info("Reading vocabulary from %s" % corpus)
  lines = 0
  for line in open(corpus):
    for token in line[:-1].split():
      counts[token] += 1
    lines += 1
    if lines % 1000 == 0: sys.stderr.write(".")
    if lines % 50000 == 0: sys.stderr.write(" [%d]\n" % lines)
  sys.stderr.write("\n")
  LOG.info("Vocabulary size: %d" % len(counts))
  if prune:
    return set([c[0] for c in counts.most_common(prune)])
  else:
    return set(counts.keys())


def save_vocab(directory, filename, vocab):
  fh = open(directory + "/" + filename, "w")
  for word in vocab:
    print>>fh, word
  fh.close()


def main():
  logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
  parser = optparse.OptionParser("%prog [options]")
  parser.add_option("-e", "--target-language", type="string", dest="target_language")
  parser.add_option("-f", "--source-language", type="string", dest="source_language")
  parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
  parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
  parser.add_option("-a", "--align", type="string", dest="align_file")
  parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
  parser.add_option("-n", "--target-context", type="int", dest="n")
  parser.add_option("-m", "--source-context", type="int", dest="m")
  parser.add_option("-p", "--prune-vocab", type="int", dest="prune")

  parser.set_defaults(
    target_language = "en",
    source_language = "de",
    corpus_stem = "train.10k",
    align_file = "train.10k.align",
    n = 5,
    m = 4,
    working_dir = "working",
    prune = 16000
  )
  options, args = parser.parse_args(sys.argv)

  if not os.path.exists(options.working_dir):
    os.makedirs(options.working_dir)
  else:
    LOG.warn("Directory %s already exists, re-using" % options.working_dir)

  # Record the context sizes, or check them against an existing info file.
  info_file = options.working_dir + "/info"
  if os.path.exists(info_file):
    for line in open(info_file):
      name, value = line[:-1].split()
      if (name == "n" and int(value) != options.n) or \
         (name == "m" and int(value) != options.m):
        LOG.error("info file exists, but parameters do not match. Delete working directory and rerun")
        sys.exit(1)
  else:
    ifh = open(info_file, "w")
    print>>ifh, "m", options.m
    print>>ifh, "n", options.n
    ifh.close()

  scorpus = options.corpus_stem + "." + options.source_language
  tcorpus = options.corpus_stem + "." + options.target_language

  # Extract vocabulary, and prune, if required
  svocab = get_pruned_vocab(scorpus, options.prune)
  tvocab = get_pruned_vocab(tcorpus, options.prune)

  file_stem = os.path.basename(options.corpus_stem)
  ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
  ofh = open(ngram_file, "w")

  tags = extract.get_ngrams(options.corpus_stem,
                            options.align_file,
                            options.tagged_stem,
                            svocab,
                            tvocab,
                            options.source_language,
                            options.target_language,
                            options.m,
                            options.n,
                            ofh)

  # Save vocabularies
  svocab.add(extract.BOS)
  tvocab.add(extract.EOS)
  save_vocab(options.working_dir, "vocab.source", svocab)
  save_vocab(options.working_dir, "vocab.target", tvocab)
  vocab = svocab.union(tvocab)
  vocab.update(tags)
  save_vocab(options.working_dir, "vocab", vocab)


if __name__ == "__main__":
  main()
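For orientation, a reading of get_ngrams() above rather than anything stated in the commit: each line of the .ngrams file holds a source window of 2m+1 tokens centred on the affiliated source word, padded with <s>/</s>, followed by the last n target tokens ending in the predicted word, padded with <s>. With the defaults m=4 and n=5 that is 14 tokens per line, which matches the --ngram-size default of 14 in train_nplm.py and test_nplm.py below.

  # Sanity check of the n-gram width implied by the defaults above (illustrative only)
  m, n = 4, 5                 # extract_training.py defaults
  print((2 * m + 1) + n)      # -> 14, the default ngram_size passed to nplm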
17 scripts/nplm-training/tag.sh Executable file
@@ -0,0 +1,17 @@
#!/bin/sh

WRAP_DIR=~/moses.new/scripts/training/wrappers/

tagger=$WRAP_DIR/make-factor-en-pos.mxpost.perl
lang=en
for stem in test train.10k train.100k; do
  $tagger -mxpost /home/pkoehn/statmt/project/mxpost $stem.$lang $stem.tagged.$lang /tmp
done

tagger=$WRAP_DIR/make-factor-de-pos.perl
lang=de
for stem in test train.10k train.100k; do
  $tagger $stem.$lang $stem.tagged.$lang /tmp
done
57 scripts/nplm-training/test_nplm.py Executable file
@@ -0,0 +1,57 @@
#!/usr/bin/env python

import logging
import optparse
import subprocess
import sys


def main():
  logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
  parser = optparse.OptionParser("%prog [options]")
  parser.add_option("-w", "--working-dir", dest="working_dir")
  parser.add_option("-c", "--corpus", dest="corpus_stem")
  parser.add_option("-r", "--train-corpus", dest="train_stem")
  parser.add_option("-l", "--nplm-home", dest="nplm_home")
  parser.add_option("-e", "--epoch", dest="epoch", type="int")
  parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
  parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
  parser.add_option("-t", "--threads", dest="threads", type="int")

  parser.set_defaults(
    working_dir = "working",
    corpus_stem = "test",
    train_stem = "train.10k",
    nplm_home = "/home/bhaddow/tools/nplm",
    epoch = 10,
    ngram_size = 14,
    minibatch_size = 1000,
    threads = 8
  )

  options, args = parser.parse_args(sys.argv)

  model_prefix = options.working_dir + "/" + options.train_stem + ".model.nplm"
  model_file = model_prefix + "." + str(options.epoch)
  test_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
  prep_file = options.working_dir + "/" + options.corpus_stem + ".prepared"
  vocab_file = options.working_dir + "/vocab"

  # TODO: Get ngram size from info file.
  prep_args = [options.nplm_home + "/src/prepareNeuralLM", "--train_text", test_file, "--ngram_size",
               str(options.ngram_size), "--ngramize", "0", "--words_file", vocab_file, "--train_file", prep_file]
  ret = subprocess.call(prep_args)
  if ret: raise Exception("Preparation failed")

  test_args = [options.nplm_home + "/src/testNeuralNetwork", "--test_file", prep_file, "--model_file",
               model_file, "--minibatch_size", str(options.minibatch_size), "--num_threads", str(options.threads)]
  ret = subprocess.call(test_args)
  if ret: raise Exception("Testing failed")

#$ROOT/src/prepareNeuralLM --train_text $TESTFILE1 --ngram_size $NGRAM_SIZE --ngramize 1 --vocab_size $INPUT_VOCAB_SIZE --words_file $WORKDIR/words --train_file $WORKDIR/ref.ngrams || exit 1
#$ROOT/src/testNeuralNetwork --test_file $WORKDIR/ref.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1


if __name__ == "__main__":
  main()
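One detail about how the two nplm wrappers fit together, inferred from the path handling in test_nplm.py above and train_nplm.py below rather than stated anywhere in the commit: test_nplm.py expects the per-epoch model files written as <model_prefix>.<epoch>, where the prefix is built from --train-corpus here and from --output-model in train_nplm.py, so the two options must name the same stem (as in the README, europarl.10k.1layer). A hypothetical example with the defaults:

  # Illustrative only; mirrors the path construction in the two scripts
  working_dir, stem, epoch = "working", "train.10k", 10
  model_prefix = working_dir + "/" + stem + ".model.nplm"
  model_file = model_prefix + "." + str(epoch)
  print(model_file)           # -> working/train.10k.model.nplm.10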
89 scripts/nplm-training/train_nplm.py Executable file
@@ -0,0 +1,89 @@
#!/usr/bin/env python

import logging
import optparse
import subprocess
import sys


def main():
  logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
  parser = optparse.OptionParser("%prog [options]")
  parser.add_option("-w", "--working-dir", dest="working_dir")
  parser.add_option("-c", "--corpus", dest="corpus_stem")
  parser.add_option("-l", "--nplm-home", dest="nplm_home")
  parser.add_option("-e", "--epochs", dest="epochs", type="int")
  parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
  parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
  parser.add_option("-s", "--noise", dest="noise", type="int")
  parser.add_option("-d", "--hidden", dest="hidden", type="int")
  parser.add_option("-i", "--input-embedding", dest="input_embedding", type="int")
  parser.add_option("-o", "--output-embedding", dest="output_embedding", type="int")
  parser.add_option("-t", "--threads", dest="threads", type="int")
  parser.add_option("-m", "--output-model", dest="output_model")

  parser.set_defaults(
    working_dir = "working",
    corpus_stem = "train.10k",
    nplm_home = "/home/bhaddow/tools/nplm",
    epochs = 10,
    ngram_size = 14,
    minibatch_size = 1000,
    noise = 100,
    hidden = 750,
    input_embedding = 150,
    output_embedding = 150,
    threads = 8,
    output_model = "train.10k"
  )

  options, args = parser.parse_args(sys.argv)

  in_file = options.working_dir + "/" + options.corpus_stem + ".ngrams"
  vocab_file = options.working_dir + "/vocab"
  prep_file = options.working_dir + "/" + options.output_model + ".prepared"

  prep_args = [options.nplm_home + "/src/prepareNeuralLM", "--train_text", in_file, "--ngram_size",
               str(options.ngram_size), "--ngramize", "0", "--words_file", vocab_file, "--train_file", prep_file]
  print "Prepare model command: "
  print ' '.join(prep_args)

  ret = subprocess.call(prep_args)
  if ret: raise Exception("Prepare failed")

  model_prefix = options.working_dir + "/" + options.output_model + ".model.nplm"
  train_args = [options.nplm_home + "/src/trainNeuralNetwork", "--train_file", prep_file, "--num_epochs", str(options.epochs),
                "--input_words_file", vocab_file, "--output_words_file", vocab_file, "--model_prefix",
                model_prefix, "--learning_rate", "1", "--minibatch_size", str(options.minibatch_size),
                "--num_noise_samples", str(options.noise), "--num_hidden", str(options.hidden), "--input_embedding_dimension",
                str(options.input_embedding), "--output_embedding_dimension", str(options.output_embedding), "--num_threads",
                str(options.threads)]
  print "Train model command: "
  print ' '.join(train_args)

  ret = subprocess.call(train_args)
  if ret: raise Exception("Training failed")


if __name__ == "__main__":
  main()


# Original shell-script settings and invocations, kept for reference:
#EPOCHS=10
#NGRAM_SIZE=14
#MINIBATCH_SIZE=1000
#NOISE=100
#HIDDEN=750
#INPUT_EMBEDDING=150
#OUTPUT_EMBEDDING=150
#THREADS=8
#
#$ROOT/src/prepareNeuralLM --train_text $INFILE --ngram_size $NGRAM_SIZE --ngramize 0 --words_file $VOCAB --train_file $WORKDIR/train.ngrams || exit 1
#$ROOT/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams \
#  --num_epochs $EPOCHS --input_words_file $VOCAB --output_words_file $VOCAB --model_prefix $WORKDIR/$PREFIX \
#  --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
#  --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1