RDLM training without editing bash scripts

Rico Sennrich 2015-03-20 14:07:53 +00:00
parent 2271f295e6
commit b8ca33c34e
6 changed files with 243 additions and 193 deletions

View File: scripts/training/rdlm/README

@ -1,10 +1,11 @@
RDLM: relational dependency language model
------------------------------------------
This is a language model for the string-to-tree decoder with a dependency
grammar. It should work with any corpus with projective dependency annotation
in CoNLL format, converted into the Moses format with the script
mosesdecoder/scripts/training/wrappers/conll2mosesxml.py. It depends on NPLM
for neural network training and querying.
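The conversion is a one-step filter; assuming the script reads CoNLL from
standard input and writes Moses XML to standard output (file names below are
placeholders), the call looks roughly like:

mosesdecoder/scripts/training/wrappers/conll2mosesxml.py < corpus.conll > corpus.xml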
Prerequisites
-------------
@ -16,20 +17,27 @@ Install NPLM and compile moses with it. See the instructions in the Moses documentation.
Training
--------
RDLM is designed for string-to-tree decoding with dependency annotation on the
target side. If you have such a system, you can train RDLM on the target side
of the same parallel corpus that is used for training the translation model.
To train the model on additional monolingual data, or test it on some held-out
test/dev data, parse and process it in the same way that the parallel corpus
has been processed. This includes tokenization, parsing, truecasing, compound
splitting, etc.
RDLM is split into two neural network models, which can be trained with
`train_rdlm.py`. An example command for training follows:
mkdir working_dir_head
mkdir working_dir_label
./train_rdlm.py --nplm-home /path/to/nplm --corpus /path/to/training_corpus --working-dir working_dir_head --output-dir /path/to/output_directory --output-model rdlm_head --mode head --output-vocab-size 500000 --noise 100
./train_rdlm.py --nplm-home /path/to/nplm --corpus /path/to/training_corpus --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise 50
For more options, run `train_rdlm.py --help`. Parameters you may want to
adjust include the vocabulary size of the label model (depending on the number
of dependency relations in the grammar), the size of the models, and the
number of training epochs.
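To check a trained model on held-out data, you can numberize the held-out set
with extract_syntactic_ngrams.py and score it with NPLM, as the old training
scripts did. A rough sketch, assuming the vocabulary files that train_rdlm.py
leaves in its working directory and the default context sizes:

./extract_syntactic_ngrams.py --mode head --vocab working_dir_head/vocab.input --output_vocab working_dir_head/vocab.output --input heldout.xml --output heldout.ngrams
/path/to/nplm/src/testNeuralNetwork --test_file heldout.ngrams --model_file /path/to/output_directory/rdlm_head.model.nplm --minibatch_size 1000 --num_threads 4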
Decoding
--------
@ -37,7 +45,7 @@ Decoding
To use RDLM during decoding, add the following line to your moses.ini config:
[feature]
RDLM path_head_lm=/path/to/output_directory/rdlm_head.model.nplm path_label_lm=/path/to/output_directory/rdlm_label.model.nplm context_up=2 context_left=3 context_right=0
[weight]
RDLM 0.1 0.1

View File: scripts/training/rdlm/extract_syntactic_ngrams.py

@ -9,17 +9,24 @@
from __future__ import print_function, unicode_literals, division
import sys
import codecs
import argparse
# hack for python2/3 compatibility
from io import open
argparse.open = open
try:
from lxml import etree as ET
except ImportError:
from xml.etree import cElementTree as ET
def create_parser():
parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM")
parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
help='input file (default: standard input).')
parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout, metavar='PATH',
help='output file (default: standard output).')
parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)',
choices=['label', 'head'], required=True)
parser.add_argument('--vocab', metavar='PATH', type=str, required=True,
@ -40,7 +47,7 @@ def parse_arguments():
help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
parser.add_argument('--ptkvz', action='store_true',
help='special rule for German dependency trees: concatenate separable verb prefix and verb')
return parser
def escape_text(s):
@ -203,7 +210,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
int_list.append(vocab.get(labels[i], 0))
int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0)))
options.output.write(' '.join(map(str, int_list)) + '\n')
parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0)))
parent_labels.append(vocab.get(labels[i], 0))
@ -216,18 +223,11 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
def load_vocab(path):
v = {}
for i,line in enumerate(open(path, encoding="UTF-8")):
v[line.strip()] = i
return v
def main(options):
vocab = load_vocab(options.vocab)
if options.output_vocab is None:
@ -236,13 +236,17 @@ if __name__ == '__main__':
else:
output_vocab = load_vocab(options.output_vocab)
global start_head_idx
global start_label_idx
global stop_head_idx
global stop_label_idx
start_head_idx = vocab.get("<start_head>", 0)
start_label_idx = vocab.get("<start_label>", 0)
stop_head_idx = vocab.get("<stop_head>", 0)
stop_label_idx = vocab.get("<stop_label>", 0)
i = 0
for line in options.input:
if i and not i % 50000:
sys.stderr.write('.')
if i and not i % 1000000:
@ -260,3 +264,14 @@ if __name__ == '__main__':
xml = ET.fromstring(line)
get_syntactic_ngrams(xml, options, vocab, output_vocab)
i += 1
if __name__ == '__main__':
if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
parser = create_parser()
options = parser.parse_args()
main(options)
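For reference, with the --input/--output defaults the script still behaves as
a stdin-to-stdout filter, so extraction can also be run by hand, roughly like
this (vocabulary files as produced by extract_vocab.py):

./extract_syntactic_ngrams.py --mode head --vocab vocab.input --output_vocab vocab.output < corpus.xml > train.ngrams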

View File: scripts/training/rdlm/extract_vocab.py

@ -7,16 +7,19 @@
from __future__ import print_function, unicode_literals, division
import sys
import codecs
import argparse
from collections import Counter
# hack for python2/3 compatibility
from io import open
argparse.open = open
try:
from lxml import etree as ET
except ImportError:
from xml.etree import cElementTree as ET
def create_parser():
help_text = "generate 5 vocabulary files from parsed corpus in moses XML format\n"
help_text += " [PREFIX].special: around 40 symbols reserved for RDLM\n"
@ -34,9 +37,7 @@ def parse_arguments():
parser.add_argument('--ptkvz', action="store_true",
help='special rule for German dependency trees: attach separable verb prefixes to verb')
return parser
def escape_text(s):
@ -48,7 +49,7 @@ def escape_text(s):
return s
# deterministic heuristic to get head of subtree
def get_head(xml, args):
head = None
preterminal = None
for child in xml:
@ -70,11 +71,11 @@ def get_head(xml):
return head, preterminal
def get_vocab(xml, args):
if len(xml):
head, preterminal = get_head(xml, args)
if not head:
head = '<null>'
preterminal = '<null>'
@ -89,18 +90,13 @@ def get_vocab(xml):
for child in xml:
if not len(child):
continue
get_vocab(child, args)
def main(args):
global heads
global preterminals
global nonterminals
heads = Counter()
preterminals = Counter()
@ -115,11 +111,8 @@ if __name__ == '__main__':
if line == '\n':
continue
xml = ET.fromstring(line)
get_vocab(xml, args)
i += 1
special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']
@ -127,27 +120,27 @@ if __name__ == '__main__':
for i in range(30):
special_tokens.append('<null_{0}>'.format(i))
f = open(args.output + '.special', 'w', encoding='UTF-8')
for item in special_tokens:
f.write(item + '\n')
f.close()
f = open(args.output + '.preterminals', 'w', encoding='UTF-8')
for item in sorted(preterminals, key=preterminals.get, reverse=True):
f.write(item + '\n')
f.close()
f = open(args.output + '.nonterminals', 'w', encoding='UTF-8')
for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
f.write(item + '\n')
f.close()
f = open(args.output + '.terminals', 'w', encoding='UTF-8')
for item in sorted(heads, key=heads.get, reverse=True):
f.write(item + '\n')
f.close()
f = open(args.output + '.all', 'w', encoding='UTF-8')
special_tokens_set = set(special_tokens)
for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
if item not in special_tokens_set:
@ -167,3 +160,16 @@ if __name__ == '__main__':
i += 1
f.write(item + '\n')
f.close()
if __name__ == '__main__':
if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
parser = create_parser()
args = parser.parse_args()
main(args)
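For reference, vocabulary extraction can be run stand-alone; assuming input on
standard input (as in the old training scripts), something like the following
writes the five files vocab.special, vocab.preterminals, vocab.nonterminals,
vocab.terminals and vocab.all:

./extract_vocab.py --output vocab < corpus.xml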

View File: scripts/training/rdlm/train_model_head.sh

@ -1,65 +0,0 @@
#!/bin/bash
if [ $# -eq 2 ]; then
OUTFILE=$1
WORKDIR=$2
else
echo "usage: $0 <outfile> <working_directory>"
exit 1
fi
NPLM=/path/to/nplm
MOSES_ROOT=/path/to/mosesdecoder
INFILE=/path/to/file/in/moses/xml/format
VALIDATIONFILE=/path/to/file/in/moses/xml/format
#TESTFILE1=/path/to/file/in/moses/xml/format
#TESTFILE2=/path/to/file/in/moses/xml/format
PREFIX=$(basename $OUTFILE)
EPOCHS=2
INPUT_VOCAB_SIZE=500000
OUTPUT_VOCAB_SIZE=500000
MINIBATCH_SIZE=1000
NOISE=100
HIDDEN=0
INPUT_EMBEDDING=150
OUTPUT_EMBEDDING=750
THREADS=4
MODE=head
UP_CONTEXT=2
LEFT_CONTEXT=3
RIGHT_CONTEXT=0
mkdir -p $WORKDIR
python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
head -n $OUTPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.output
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1
$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
--num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
--input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
--learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
--input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1
if [[ $TESTFILE1 ]]; then
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
$NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi
if [[ $TESTFILE2 ]]; then
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
$NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi

View File: scripts/training/rdlm/train_model_label.sh

@ -1,72 +0,0 @@
#!/bin/bash
if [ $# -eq 2 ]; then
OUTFILE=$1
WORKDIR=$2
else
echo "usage: $0 <outfile> <working_directory>"
exit 1
fi
NPLM=/path/to/nplm
MOSES_ROOT=/path/to/mosesdecoder
INFILE=/path/to/file/in/moses/xml/format
VALIDATIONFILE=/path/to/file/in/moses/xml/format
#TESTFILE1=/path/to/file/in/moses/xml/format
#TESTFILE2=/path/to/file/in/moses/xml/format
PREFIX=$(basename $OUTFILE)
EPOCHS=1
INPUT_VOCAB_SIZE=500000
OUTPUT_VOCAB_SIZE=75
MINIBATCH_SIZE=1000
NOISE=50
HIDDEN=0
INPUT_EMBEDDING=150
OUTPUT_EMBEDDING=750
THREADS=4
MODE=label
UP_CONTEXT=2
LEFT_CONTEXT=3
RIGHT_CONTEXT=0
mkdir -p $WORKDIR
python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
cat $WORKDIR/vocab.special $WORKDIR/vocab.nonterminals |
grep -v "^<null" |
grep -v "^<root" |
grep -v "^<start_head" |
grep -v "^<dummy" |
grep -v "^<head_head" |
grep -v "^<stop_head" |
head -n $OUTPUT_VOCAB_SIZE > $WORKDIR/vocab.output
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1
$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
--num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
--input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
--learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
--input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1
if [[ $TESTFILE1 ]]; then
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
$NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi
if [[ $TESTFILE2 ]]; then
python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
--mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
$NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
fi
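The hard-coded settings in these deleted scripts map directly onto
train_rdlm.py options. As an untested sketch, the label-model configuration
above corresponds roughly to:

./train_rdlm.py --nplm-home /path/to/nplm --corpus /path/to/file/in/moses/xml/format --validation-corpus /path/to/file/in/moses/xml/format --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --epochs 1 --minibatch-size 1000 --noise 50 --hidden 0 --input-embedding 150 --output-embedding 750 --threads 4 --learning-rate 1 --input-vocab-size 500000 --output-vocab-size 75 --up-context-size 2 --left-context-size 3 --right-context-size 0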

View File: scripts/training/rdlm/train_rdlm.py

@ -0,0 +1,158 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
import logging
import argparse
import subprocess
import sys
import os
import codecs
import copy
# ../bilingual-lm
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), 'bilingual-lm'))
import train_nplm
import extract_vocab
import extract_syntactic_ngrams
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = argparse.ArgumentParser()
parser.add_argument("--working-dir", dest="working_dir", metavar="PATH")
parser.add_argument("--corpus", dest="corpus_stem", metavar="PATH", help="input file")
parser.add_argument("--nplm-home", dest="nplm_home", metavar="PATH", help="location of NPLM", required=True)
parser.add_argument("--epochs", dest="epochs", type=int, metavar="INT", help="number of training epochs (default: %(default)s)")
parser.add_argument("--up-context-size", dest="up_context_size", type=int, metavar="INT", help="size of ancestor context (default: %(default)s)")
parser.add_argument("--left-context-size", dest="left_context_size", type=int, metavar="INT", help="size of sibling context (left) (default: %(default)s)")
parser.add_argument("--right-context-size", dest="right_context_size", type=int, metavar="INT", help="size of sibling context (right) (default: %(default)s)")
parser.add_argument("--mode", dest="mode", choices=['head', 'label'], help="type of RDLM to train (both are required for decoding)", required=True)
parser.add_argument("--minibatch-size", dest="minibatch_size", type=int, metavar="INT", help="minibatch size (default: %(default)s)")
parser.add_argument("--noise", dest="noise", type=int, metavar="INT", help="number of noise samples for NCE (default: %(default)s)")
parser.add_argument("--hidden", dest="hidden", type=int, metavar="INT", help="size of hidden layer (0 for single hidden layer) (default: %(default)s)")
parser.add_argument("--input-embedding", dest="input_embedding", type=int, metavar="INT", help="size of input embedding layer (default: %(default)s)")
parser.add_argument("--output-embedding", dest="output_embedding", type=int, metavar="INT", help="size of output embedding layer (default: %(default)s)")
parser.add_argument("--threads", "-t", dest="threads", type=int, metavar="INT", help="number of threads (default: %(default)s)")
parser.add_argument("--output-model", dest="output_model", metavar="PATH", help="name of output model (default: %(default)s)")
parser.add_argument("--output-dir", dest="output_dir", metavar="PATH", help="output directory (default: same as working-dir)")
parser.add_argument("--config-options-file", dest="config_options_file", metavar="PATH")
parser.add_argument("--log-file", dest="log_file", metavar="PATH", help="log file to write to (default: %(default)s)")
parser.add_argument("--validation-corpus", dest="validation_corpus", metavar="PATH", help="validation file (default: %(default)s)")
parser.add_argument("--activation-function", dest="activation_fn", choices=['identity', 'rectifier', 'tanh', 'hardtanh'], help="activation function (default: %(default)s)")
parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar="FLOAT", help="learning rate (default: %(default)s)")
parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)")
parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)")
parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)")
parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)")
parser.set_defaults(
working_dir = "working"
,corpus_stem = "train"
,nplm_home = "/home/bhaddow/tools/nplm"
,epochs = 2
,up_context_size = 2
,left_context_size = 3
,right_context_size = 0
,minibatch_size=1000
,noise=100
,hidden=0
,mode='head'
,input_embedding=150
,output_embedding=750
,threads=4
,output_model = "train"
,output_dir = None
,config_options_file = "config"
,log_file = "log"
,validation_corpus = None
,activation_fn = "rectifier"
,learning_rate = 1
,input_words_file = None
,output_words_file = None
,input_vocab_size = 500000
,output_vocab_size = 500000
)
def prepare_vocabulary(options):
vocab_prefix = os.path.join(options.working_dir, 'vocab')
extract_vocab_options = extract_vocab.create_parser().parse_args(['--input', options.corpus_stem, '--output', vocab_prefix])
extract_vocab.main(extract_vocab_options)
if options.input_words_file is None:
options.input_words_file = vocab_prefix + '.input'
orig = vocab_prefix + '.all'
filtered_vocab = open(orig).readlines()
if options.input_vocab_size:
filtered_vocab = filtered_vocab[:options.input_vocab_size]
open(options.input_words_file,'w').writelines(filtered_vocab)
if options.output_words_file is None:
options.output_words_file = vocab_prefix + '.output'
if options.mode == 'label':
blacklist = ['<null', '<root', '<start_head', '<dummy', '<head_head', '<stop_head']
orig = vocab_prefix + '.special'
filtered_vocab = open(orig).readlines()
orig = vocab_prefix + '.nonterminals'
filtered_vocab += open(orig).readlines()
filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)]
if options.output_vocab_size:
filtered_vocab = filtered_vocab[:options.output_vocab_size]
else:
orig = vocab_prefix + '.all'
filtered_vocab = open(orig).readlines()[:options.output_vocab_size]
open(options.output_words_file,'w').writelines(filtered_vocab)
def main(options):
if options.output_dir is None:
options.output_dir = options.working_dir
# n-gram size: two input units (head and label) per context position, plus
# the prediction site (label and head in head mode; label only in label mode)
options.ngram_size = 2*options.up_context_size + 2*options.left_context_size + 2*options.right_context_size
if options.mode == 'head':
options.ngram_size += 2
elif options.mode == 'label':
options.ngram_size += 1
if options.input_words_file is None or options.output_words_file is None:
sys.stderr.write('either input vocabulary or output vocabulary not specified: extracting vocabulary from training text\n')
prepare_vocabulary(options)
extract_options = extract_syntactic_ngrams.create_parser().parse_args(['--input', options.corpus_stem,
'--output', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
'--vocab', options.input_words_file,
'--output_vocab', options.output_words_file,
'--right_context', str(options.right_context_size),
'--left_context', str(options.left_context_size),
'--up_context', str(options.up_context_size),
'--mode', options.mode
])
sys.stderr.write('extracting syntactic n-grams\n')
extract_syntactic_ngrams.main(extract_options)
if options.validation_corpus:
# the extraction parser opens its --input/--output arguments as files,
# so open the validation files here before re-using the options object
extract_options.input = open(options.validation_corpus)
options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized')
extract_options.output = open(options.validation_file, 'w')
sys.stderr.write('extracting syntactic n-grams (validation file)\n')
extract_syntactic_ngrams.main(extract_options)
extract_options.output.close()
sys.stderr.write('training neural network\n')
train_nplm.main(options)
sys.stderr.write('averaging null words\n')
ret = subprocess.call([sys.executable, os.path.join(sys.path[0], 'average_null_embedding.py'),
options.nplm_home,
os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
os.path.join(options.output_dir, options.output_model + '.model.nplm')
])
if ret:
raise Exception("averaging null words failed")
if __name__ == "__main__":
if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
options = parser.parse_args()
main(options)