mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 15:04:05 +03:00
227 lines
7.9 KiB
Python
Executable File
227 lines
7.9 KiB
Python
Executable File
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

"""Train feed-forward neural network LM with NPLM tool.

The resulting model can be used in Moses as feature function NeuralLM.
"""

from __future__ import print_function, unicode_literals
|
|
|
|
import logging
|
|
import argparse
|
|
import subprocess
|
|
import sys
|
|
import os
|
|
import codecs
|
|
|
|
# ./bilingual-lm
|
|
sys.path.append(os.path.join(sys.path[0], 'bilingual-lm'))
|
|
import train_nplm
|
|
import averageNullEmbedding
|
|
|
|
|
|
# Configure root logging: timestamped messages, DEBUG level.
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)

# Command-line interface.  Default values are assigned in one place by
# parser.set_defaults() further down; argparse copies them onto the matching
# actions, which is what the "%(default)s" placeholders in the help strings
# expand to.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--working-dir", dest="working_dir", metavar="PATH")
parser.add_argument(
    "--corpus", '-text', dest="corpus_stem", metavar="PATH",
    help="Input file.")
parser.add_argument(
    "--nplm-home", dest="nplm_home", metavar="PATH", required=True,
    help="Location of NPLM.")
parser.add_argument(
    "--epochs", dest="epochs", type=int, metavar="INT",
    help="Number of training epochs (default: %(default)s).")
parser.add_argument(
    "--order", dest="order", type=int, metavar="INT",
    help="N-gram order of language model (default: %(default)s).")
parser.add_argument(
    "--minibatch-size", dest="minibatch_size", type=int, metavar="INT",
    help="Minibatch size (default: %(default)s).")
parser.add_argument(
    "--noise", dest="noise", type=int, metavar="INT",
    help="Number of noise samples for NCE (default: %(default)s).")
parser.add_argument(
    "--hidden", dest="hidden", type=int, metavar="INT",
    help=(
        "Size of hidden layer (0 for single hidden layer) "
        "(default: %(default)s)"))
parser.add_argument(
    "--input-embedding", dest="input_embedding", type=int, metavar="INT",
    help="Size of input embedding layer (default: %(default)s).")
parser.add_argument(
    "--output-embedding", dest="output_embedding", type=int, metavar="INT",
    help="Size of output embedding layer (default: %(default)s).")
parser.add_argument(
    "--threads", "-t", dest="threads", type=int, metavar="INT",
    help="Number of threads (default: %(default)s).")
parser.add_argument(
    "--output-model", dest="output_model", metavar="PATH",
    help="Name of output model (default: %(default)s).")
parser.add_argument(
    "--output-dir", dest="output_dir", metavar="PATH",
    help="Output directory (default: same as working-dir).")
parser.add_argument(
    "--config-options-file", dest="config_options_file", metavar="PATH")
parser.add_argument(
    "--log-file", dest="log_file", metavar="PATH",
    help="Log file to write to (default: %(default)s).")
parser.add_argument(
    "--validation-corpus", dest="validation_corpus", metavar="PATH",
    help="Validation file (default: %(default)s).")
parser.add_argument(
    "--activation-function", dest="activation_fn",
    choices=['identity', 'rectifier', 'tanh', 'hardtanh'],
    help="Activation function (default: %(default)s).")
parser.add_argument(
    "--learning-rate", dest="learning_rate", type=float, metavar="FLOAT",
    help="Learning rate (default: %(default)s).")
parser.add_argument(
    "--words-file", dest="words_file", metavar="PATH",
    help="Output vocabulary file (default: %(default)s).")
parser.add_argument(
    "--vocab-size", dest="vocab_size", type=int, metavar="INT",
    help="Vocabulary size (default: %(default)s).")
parser.add_argument(
    "--mmap", dest="mmap", action="store_true",
    help="Use memory-mapped file (for lower memory consumption).")

parser.set_defaults(
    working_dir="working",
    corpus_stem="train",
    # --nplm-home is declared required=True above, so parsing always demands
    # an explicit value and this default is never the one that takes effect.
    nplm_home="/home/bhaddow/tools/nplm",
    epochs=2,
    order=5,
    minibatch_size=1000,
    noise=100,
    hidden=0,
    input_embedding=150,
    output_embedding=750,
    threads=4,
    output_model="train",
    # None means "use working_dir"; main() resolves it.
    output_dir=None,
    config_options_file="config",
    log_file="log",
    validation_corpus=None,
    activation_fn="rectifier",
    learning_rate=1,
    words_file='vocab',
    vocab_size=500000)


def _ensure_dir(path):
    """Create directory *path* (with parents) unless it already exists."""
    if not path:
        # An empty path refers to the current directory; nothing to create.
        return
    try:
        os.makedirs(path)
    except OSError:
        # Python 2 compatible stand-in for ``exist_ok=True``: only ignore
        # the error when the directory is really there afterwards.
        if not os.path.isdir(path):
            raise


def _run_step(description, command, failure_message):
    """Announce and run one external command; raise if it exits non-zero."""
    sys.stderr.write(description + '\n')
    sys.stderr.write('executing: ' + ', '.join(command) + '\n')
    if subprocess.call(command):
        raise Exception(failure_message)


def main(options):
    """Prepare the data, train an NPLM model and average its null embedding.

    Steps, in order:

    1. Run ``prepareNeuralLM`` (from ``<nplm_home>/src``) on
       ``options.corpus_stem``, writing the vocabulary file and the
       ``<corpus basename>.numberized`` file into ``options.working_dir``.
    2. If ``options.mmap`` is set, run ``createMmap`` on the numberized file
       to produce ``<corpus basename>.numberized.mmap``.
    3. If ``options.validation_corpus`` is set, run ``prepareNeuralLM`` on
       it as well, passing the vocabulary path from step 1 via
       ``--words_file``.
    4. Call ``train_nplm.main(options)``.
    5. Call ``averageNullEmbedding.main`` with the model file suffixed by the
       epoch count as input and ``<output_model>.model.nplm`` as output.

    ``options`` is modified in place: ``ngram_size``, ``input_words_file``,
    ``output_words_file``, ``input_vocab_size`` and ``output_vocab_size`` are
    set, ``output_dir`` is filled in from ``working_dir`` when it is None,
    and ``validation_file`` is set to None when no validation corpus is given.

    Args:
        options: argparse namespace as produced by this script's ``parser``.

    Raises:
        Exception: if one of the external NPLM tools exits with a non-zero
            status.
        OSError: if a directory cannot be created or a tool cannot be
            started.
    """
    options.ngram_size = options.order

    if options.output_dir is None:
        options.output_dir = options.working_dir
    # The NPLM tools below are told to write into working_dir, so it has to
    # exist before the first one runs; output_dir receives the models.
    _ensure_dir(options.working_dir)
    _ensure_dir(options.output_dir)

    tool_dir = os.path.join(options.nplm_home, 'src')
    vocab_path = os.path.join(options.working_dir, options.words_file)

    numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
    numberized_path = os.path.join(options.working_dir, numberized_file)
    train_file = numberized_file
    if options.mmap:
        train_file += '.mmap'

    _run_step(
        'extracting n-grams',
        [
            os.path.join(tool_dir, 'prepareNeuralLM'),
            '--train_text', options.corpus_stem,
            '--ngramize', '1',
            '--ngram_size', str(options.ngram_size),
            '--vocab_size', str(options.vocab_size),
            '--write_words_file', vocab_path,
            '--train_file', numberized_path,
        ],
        "preparing neural LM failed")

    if options.mmap:
        mmap_path = os.path.join(options.working_dir, train_file)
        # Best effort: drop a memory-mapped file left over from an earlier
        # run; it is fine if there is none.
        try:
            os.remove(mmap_path)
        except OSError:
            pass
        _run_step(
            'creating memory-mapped file',
            [
                os.path.join(tool_dir, 'createMmap'),
                '--input_file', numberized_path,
                '--output_file', mmap_path,
            ],
            "creating memory-mapped file failed")

    if options.validation_corpus:
        validation_numberized = (
            os.path.basename(options.validation_corpus) + '.numberized')
        _run_step(
            'extracting n-grams (validation file)',
            [
                os.path.join(tool_dir, 'prepareNeuralLM'),
                '--train_text', options.validation_corpus,
                '--ngramize', '1',
                '--ngram_size', str(options.ngram_size),
                '--vocab_size', str(options.vocab_size),
                '--words_file', vocab_path,
                '--train_file', os.path.join(
                    options.working_dir, validation_numberized),
            ],
            "preparing neural LM failed")
    else:
        # NOTE(review): validation_file is only assigned in this branch;
        # presumably train_nplm derives the numberized validation path from
        # validation_corpus itself -- confirm against train_nplm.
        options.validation_file = None

    # A single vocabulary serves as both input and output vocabulary.
    options.input_words_file = options.words_file
    options.output_words_file = options.words_file
    options.input_vocab_size = options.vocab_size
    options.output_vocab_size = options.vocab_size

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    model_stem = os.path.join(
        options.output_dir, options.output_model + '.model.nplm')
    average_options = averageNullEmbedding.parser.parse_args([
        '-i', model_stem + '.' + str(options.epochs),
        '-o', model_stem,
        '-t', numberized_path,
        '-p', os.path.join(options.nplm_home, 'python'),
    ])
    averageNullEmbedding.main(average_options)


if __name__ == "__main__":
    if sys.version_info < (3, 0):
        # Python 2 only: wrap the standard streams so that text written to
        # or read from them is encoded/decoded as UTF-8.
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    # Parse once; unrecognised arguments are reported but do not abort.
    options, unknown_args = parser.parse_known_args()
    if unknown_args:
        sys.stderr.write(
            "Warning: unknown arguments: {0}\n".format(unknown_args))
    main(options)