Added arrow based Moses training pipeline demonstration program to contrib.

This commit is contained in:
Ian Johnson 2013-03-06 13:37:41 +00:00
parent 0afd06cdbd
commit f2536cddff
25 changed files with 1005 additions and 0 deletions

.gitmodules

@@ -0,0 +1,3 @@
[submodule "contrib/arrow-pipelines/python/libs/pypeline"]
	path = contrib/arrow-pipelines/python/libs/pypeline
	url = git://github.com/ianj-als/pypeline.git

@@ -0,0 +1,32 @@
Arrow Based Moses Training Pipeline
===================================
To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following commands:
$ git submodule init
$ git submodule update
This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline:
$ cd libs/pypeline
$ python setup.py install
Alternatively, you can set an appropriate PYTHONPATH environment variable so that the Pypeline library can be found.
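For example, assuming the Pypeline package can be imported directly from the submodule checkout (an illustrative path):
$ export PYTHONPATH=$PWD/libs/pypeline:$PYTHONPATH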
This demonstration implements the training pipeline shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia.
Three environment variables need to be set before the manager.py script can be run (example settings are shown after the list):
- MOSES_HOME : The directory where Moses has been cloned, or installed,
- IRSTLM : The installation directory of your IRSTLM, and
- GIZA_HOME : The installation directory of GIZA++.
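For example (these paths are purely illustrative and depend on where you built or installed each tool):
$ export MOSES_HOME=/opt/mosesdecoder
$ export IRSTLM=/opt/irstlm
$ export GIZA_HOME=/opt/giza-pp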
The manager.py script takes four positional command-line arguments:
- The source language code,
- The target language code,
- The source corpus file. This file *must* be cleaned prior to use, and
- The target corpus file. This file *must* be cleaned prior to use.
For example, run the manager.py script with:
$ python manager.py en lt cleantrain.en cleantrain.lt

@@ -0,0 +1 @@
Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb

@@ -0,0 +1,192 @@
import logging
import os

from concurrent.futures import Future, ThreadPoolExecutor
from functools import partial

from pypeline.helpers.parallel_helpers import eval_pipeline, \
    cons_function_component, \
    cons_wire, \
    cons_split_wire, \
    cons_unsplit_wire, \
    cons_dictionary_wire


#
# Some logging please
#
FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s'
logging.basicConfig(format = FORMAT, level = logging.DEBUG)
logger = logging.getLogger("manager")


# Build the pipeline components
def build_components(components, configuration, executor):
    pipeline_components = dict()
    pipeline_configuration = dict()

    for component_id, module_name in components.items():
        logger.info("Loading [%s] component from [%s]..." % (component_id, module_name))

        module = __import__(module_name, fromlist = ['configure', 'initialise'])

        # Component builds its own configuration object
        config_func = getattr(module, 'configure')
        component_config = config_func(configuration)
        pipeline_configuration.update(component_config)

        # Now build the component
        init_func = getattr(module, 'initialise')
        component_function = init_func(component_config)

        # A wrapper for the component's function that logs each invocation
        def get_component_function_wrapper(inner_function, comp_id, mod_name):
            def component_function_wrapper(a, s):
                logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \
                    (comp_id, mod_name, a, s))
                return inner_function(a, s)
            return component_function_wrapper

        # Arrowize the component
        component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name))

        # And store
        pipeline_components[component_id] = component

    return pipeline_components, pipeline_configuration


# Go!
def main(src_lang, trg_lang, src_filename, trg_filename):
    # Global configuration
    # One day, this configuration shall be constructed from
    # command line options, or a properties file.
    configuration = {
        'moses_installation_dir': os.environ['MOSES_HOME'],
        'irstlm_installation_dir': os.environ['IRSTLM'],
        'giza_installation_dir': os.environ['GIZA_HOME'],
        'src_lang': src_lang,
        'src_tokenisation_dir': './tokenisation',
        'trg_lang': trg_lang,
        'trg_tokenisation_dir': './tokenisation',
        'segment_length_limit': 60,
        'irstlm_smoothing_method': 'improved-kneser-ney',
        'language_model_directory': './language-model',
        'translation_model_directory': './translation-model',
        'mert_working_directory': './mert',
        'evaluation_data_size': 100,
        'development_data_size': 100
    }

    # The modules to load
    # In the future, the components shall be specified in some kind of
    # pipeline description file.
    component_modules = {
        'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
        'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
        'cleanup': 'training.components.cleanup.cleanup',
        'data_split': 'training.components.data_split.data_split',
        'irstlm_build': 'training.components.irstlm_build.irstlm_build',
        'model_training': 'training.components.model_training.model_training',
        'mert': 'training.components.mert.mert'
    }

    # The thread pool
    executor = ThreadPoolExecutor(max_workers = 3)

    # Phew, build the required components
    components, component_config = build_components(component_modules, configuration, executor)

    #
    # Wire up components
    # Description of wiring should be, in the future, alongside the component
    # specification in some kind of configuration file. Components shall be
    # declared then used, i.e., bind a component instance to a unique component
    # identifier, then wire component instances together by identifier.
    #
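    #
    # A note on the pypeline combinators used below (semantics as assumed by
    # this script; see the pypeline library for the authoritative definitions):
    #   c1 >> c2              : composition, feed c1's output into c2
    #   c1 & c2               : fan-out, run c1 and c2 over copies of the same
    #                           input, yielding a (top, bottom) pair
    #   cons_split_wire()     : duplicate a single value into a (top, bottom) pair
    #   cons_unsplit_wire(f)  : merge a (top, bottom) pair into one value with f(t, b)
    #   c.first()/c.second()  : apply c to only the first/second element of a
    #                           pair, passing the other element through unchanged
    #   cons_wire(f)          : a plain wire that maps the value with f(a, s)
    #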
    #
    # Tokenisation of source and target...
    #

    # IRSTLM Build components
    irstlm_build_component = cons_split_wire() >> \
        (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \
         components['irstlm_build']).second() >> \
        cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'],
                                        'trg_language_model_filename': b['compiled_lm_filename']})

    # The complete tokenisation component
    tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \
        irstlm_build_component.second() >> \
        cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'],
                                        'trg_filename': b['tokenised_trg_filename'],
                                        'trg_language_model_filename': b['trg_language_model_filename']})

    #
    # Cleanup and Data Splitting...
    #

    #
    # A function that clips off the last '.' delimited string
    #
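    # e.g. clip_last_bit("corpus.clean.train.en") -> "corpus.clean.train"
    # (illustrative filename; this strips the trailing language code so that
    # tools such as train-model.perl can append the language codes themselves)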
    def clip_last_bit(filename):
        bn = os.path.basename(filename)
        directory = os.path.dirname(filename)
        bits = bn.split(".")
        bits.pop()
        return os.path.join(directory, ".".join(bits))

    cleanup_datasplit_component = components['cleanup'] >> \
        cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'],
                                'trg_filename': a['cleaned_trg_filename']}) >> \
        components['data_split'] >> \
        cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']),
                                'eval_src_filename': a['eval_src_filename'],
                                'eval_trg_filename': a['eval_trg_filename']})

    #
    # Translation model training
    #
    translation_model_component = cons_split_wire() >> \
        components['model_training'].first() >> \
        cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
                                        'development_data_filename': b['eval_src_filename']})

    #
    # The whole pipeline
    #
    pipeline = tokenisation_component >> \
        cons_split_wire() >> \
        (cleanup_datasplit_component >> translation_model_component).first() >> \
        cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
                                        'development_data_filename': clip_last_bit(t['development_data_filename']),
                                        'trg_language_model_filename': b['trg_language_model_filename'],
                                        'trg_language_model_order': 3,
                                        'trg_language_model_type': 9}) >> \
        components['mert']

    #
    # The input to the pipeline
    #
    value = {'src_filename': src_filename,
             'trg_filename': trg_filename}

    #
    # Evaluate the pipeline
    #
    logger.info("Evaluating pipeline with input [%s]..." % value)
    new_value = eval_pipeline(executor, pipeline, value, component_config)

    #
    # Wait for all components to finish
    #
    executor.shutdown(True)

    logger.info("Pipeline evaluated to %s" % new_value)


if __name__ == '__main__':
    import sys

    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])

@@ -0,0 +1,11 @@
import subprocess


def cat(filename, content):
    fh = open(filename, "w")
    for line in content:
        #print(line, file=fh)
        print >> fh, line
    fh.close()


def diff(filename1, filename2):
    # check_output raises CalledProcessError if diff finds any differences
    subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT)

@@ -0,0 +1,125 @@
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['segment_length'] = args['segment_length_limit']
    return result


def initialise(config):
    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        def _short(line):
            n = 0
            for c in line:
                if c == " ":
                    n += 1
            #print(line, ":", n)
            return n < limit

        for (l1, l2) in zip(ifh1, ifh2):
            if _short(l1) and _short(l2):
                print >>ofh1, l1,
                print >>ofh2, l2,

    def _make_cleaned_filename(filename):
        bits = filename.split(".")
        bits[-1] = "clean"
        return ".".join(bits)

    def _filter_main(value, config):
        limit = config['segment_length']
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            input_src_filename = value['src_filename']
            input_trg_filename = value['trg_filename']

            print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)

            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")

            cleaned_src_filename = _make_cleaned_filename(input_src_filename)
            cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)

            ofh1 = open(cleaned_src_filename, "w")
            ofh2 = open(cleaned_trg_filename, "w")

            _filter(limit, ifh1, ofh1, ifh2, ofh2)

            return {'cleaned_src_filename': cleaned_src_filename,
                    'cleaned_trg_filename': cleaned_trg_filename}
        finally:
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return _filter_main


if __name__ == '__main__':
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline

    def _test_main():
        configuration = {'segment_length_limit': 20}

        src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'cleaned_src_file_expected': src_filename[1] + ".expected",
            'cleaned_trg_file_expected': trg_filename[1] + ".expected"
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        try:
            thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
            thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
        finally:
            os.unlink(output['cleaned_src_filename'])
            os.unlink(output['cleaned_trg_filename'])

    def _line(line_lengths):
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
        #expected output:
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))

    def _cleanup_files(box_eval):
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass

    _test_main()

@@ -0,0 +1,109 @@
# This module uses the Python 3 style print function, so make that syntax
# valid under Python 2 as well
from __future__ import print_function

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['segment_length'] = args['segment_length_limit']
    return result


def initialise(config):
    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        def _short(line):
            n = 0
            for c in line:
                if c == " ":
                    n += 1
            #print(line, ":", n)
            return n < limit

        for (l1, l2) in zip(ifh1, ifh2):
            if _short(l1) and _short(l2):
                print(l1, end='', file=ofh1)
                print(l2, end='', file=ofh2)

    def _filter_main(config, value):
        limit = config['segment_length']
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            ifh1 = open(value['src_filename'], "r")
            ifh2 = open(value['trg_filename'], "r")
            ofh1 = open(value['cleaned_src_filename'], "w")
            ofh2 = open(value['cleaned_trg_filename'], "w")

            _filter(limit, ifh1, ofh1, ifh2, ofh2)

            return {'cleaned_src_filename': value['cleaned_src_filename'],
                    'cleaned_trg_filename': value['cleaned_trg_filename']}
        finally:
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return cons_function_component(_filter_main)


if __name__ == '__main__':
    import os
    import tempfile
    import training.components.shared.test as thelp

    def _test_main():
        configuration = {'segment_length_limit': 20}

        src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'cleaned_src_filename': src_filename[1] + ".clean",
            'cleaned_trg_filename': trg_filename[1] + ".clean",
            'cleaned_src_file_expected': src_filename[1] + ".expected",
            'cleaned_trg_file_expected': trg_filename[1] + ".expected"
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        from pypeline.helpers.helpers import run_pipeline

        box_config = configure(configuration)
        box = initialise(box_config)

        run_pipeline(box, box_config, box_eval)
        thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename'])
        thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename'])

    def _line(line_lengths):
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
        #expected output:
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))

    def _cleanup_files(box_eval):
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass

    _test_main()

@@ -0,0 +1,146 @@
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['evaluate_size'] = args['evaluation_data_size']
    result['development_size'] = args['development_data_size']
    return result


def initialise(config):
    def _copy(size, inp, ofh1, ofh2):
        try:
            while size != 0:
                (l1, l2) = inp.next()
                print >>ofh1, l1,
                print >>ofh2, l2,
                size -= 1
        except StopIteration:
            pass
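    # An illustrative example of the renaming performed below, assuming the
    # language code is the second-to-last '.'-separated component:
    #   _make_split_filename("corpus.en.clean", "devel") -> "corpus.clean.devel.en"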
    def _make_split_filename(filename, data_set):
        bits = filename.split(".")
        last = bits.pop()
        lang_code = bits.pop()

        bits.append(last)
        bits.append(data_set)
        bits.append(lang_code)

        new_filename = ".".join(bits)
        return new_filename

    def _splitter_main(value, config):
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            input_src_filename = value['src_filename']
            input_trg_filename = value['trg_filename']

            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")
            inp = iter(zip(ifh1, ifh2))

            result = {}
            for (data_set, size) in [
                ('devel', config['development_size']),
                ('eval', config['evaluate_size']),
                ('train', -1)
            ]:
                output_src_filename = _make_split_filename(input_src_filename, data_set)
                output_trg_filename = _make_split_filename(input_trg_filename, data_set)
                ofh1 = open(output_src_filename, "w")
                ofh2 = open(output_trg_filename, "w")

                _copy(size, inp, ofh1, ofh2)

                result[data_set + '_src_filename'] = output_src_filename
                result[data_set + '_trg_filename'] = output_trg_filename

            return result
        finally:
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return _splitter_main


if __name__ == '__main__':
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline

    def _test_main():
        configuration = {
            'evaluation_data_size': 7,
            'development_data_size': 13,
        }

        src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'devel_src_expected': src_filename[1] + ".devel.expected",
            'devel_trg_expected': trg_filename[1] + ".devel.expected",
            'eval_src_expected': src_filename[1] + ".eval.expected",
            'eval_trg_expected': trg_filename[1] + ".eval.expected",
            'train_src_expected': src_filename[1] + ".train.expected",
            'train_trg_expected': trg_filename[1] + ".train.expected",
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        for data_set in ['devel', 'eval', 'train']:
            for lang in ['src', 'trg']:
                filename = output[data_set + '_' + lang + '_filename']
                filename_expected = box_eval[data_set + '_' + lang + '_expected']
                thelp.diff(filename_expected, filename)

    def _line(line_lengths):
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line(range(50)))
        thelp.cat(box_eval['trg_filename'], _line(range(50)))
        #expected output:
        thelp.cat(box_eval['devel_src_expected'], _line(range(0, 13)))
        thelp.cat(box_eval['devel_trg_expected'], _line(range(0, 13)))
        thelp.cat(box_eval['eval_src_expected'], _line(range(13, 20)))
        thelp.cat(box_eval['eval_trg_expected'], _line(range(13, 20)))
        thelp.cat(box_eval['train_src_expected'], _line(range(20, 50)))
        thelp.cat(box_eval['train_trg_expected'], _line(range(20, 50)))

    def _cleanup_files(box_eval):
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass

    _test_main()

@@ -0,0 +1,106 @@
import os
import shutil
import subprocess
import tempfile

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    config = dict()
    config['irstlm_install_directory'] = args['irstlm_installation_dir']
    config['smoothing_method'] = args['irstlm_smoothing_method']
    config['lm_directory'] = args['language_model_directory']
    return config


def initialise(config):
    def process(a, s):
        # Create the LM directory if we need to
        if os.path.exists(s['lm_directory']) is False:
            os.makedirs(s['lm_directory'])

        # The filename of the file to chew through
        start_end_input_filename = a['input_filename']
        if os.path.exists(start_end_input_filename) is False:
            raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename)

        # Derive the output file name for the add start-end marker processor
        filename_bits = os.path.basename(start_end_input_filename).split(".")
        filename_bits[2] = "sb"
        start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))

        # Derive the output file name of the LM build
        filename_bits[2] = "lm"
        lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))

        # Derive the compiled LM file name
        filename_bits[2] = "arpa"
        compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
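        # An illustrative example of the three names derived above, assuming
        # an input named like the test at the bottom of this file:
        #   news-commentary-v7.fr-en.tok.en -> news-commentary-v7.fr-en.sb.en
        #                                      news-commentary-v7.fr-en.lm.en
        #                                      news-commentary-v7.fr-en.arpa.en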
        # First thing to do is add start and end markers
        start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")]
        infile = open(start_end_input_filename, 'r')
        outfile = open(start_end_output_filename, 'w')
        print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline)
        return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile)
        if return_code:
            raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \
                (start_end_input_filename, start_end_output_filename, return_code))

        # Next build the language model
        tmp_dir = tempfile.mkdtemp(dir = "/tmp")
        try:
            build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"),
                                "-i", start_end_output_filename,
                                "-t", tmp_dir,
                                "-p",
                                "-s", s['smoothing_method'],
                                "-o", lm_filename]
            print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline)
            return_code = subprocess.check_call(build_lm_cmdline)
            if return_code:
                raise Exception("IRST language model failed to build: return code = [%d]" % return_code)
        finally:
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)

        # Compile the LM: build-lm.sh gzips its output, so the LM filename
        # gains a ".gz" suffix
        lm_filename = lm_filename + ".gz"
        compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"),
                              "--text", "yes",
                              lm_filename,
                              compiled_lm_filename]
        print "IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline)
        return_code = subprocess.check_call(compile_lm_cmdline)
        if return_code:
            raise Exception("IRST language model compilation failed: return code = [%d]" % return_code)

        output = {'add_start_end_filename': start_end_output_filename,
                  'lm_filename': lm_filename,
                  'compiled_lm_filename': compiled_lm_filename}

        print "IRSTLM Build: Output = %s" % output

        return output

    return process


if __name__ == '__main__':
    from pypeline.helpers.helpers import eval_pipeline

    lm_dir = os.environ["PWD"]
    configuration = {'irstlm_installation_dir': os.environ["IRSTLM"],
                     'irstlm_smoothing_method': 'improved-kneser-ney',
                     'language_model_directory': lm_dir}
    component_config = configure(configuration)
    component = initialise(component_config)

    value = eval_pipeline(component,
                          {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
                          component_config)
    target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
              'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'),
              'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')}
    print "Target: %s" % target
    if value != target:
        raise Exception("Massive fail!")

@@ -0,0 +1,83 @@
#!/usr/bin/env python

import os, shutil, subprocess

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['trg_lang'] = args['trg_lang']
    result['moses_installation_dir'] = args['moses_installation_dir']
    result['mert_working_dir'] = args['mert_working_directory']
    return result


def initialise(config):
    def process(a, s):
        infilename = os.path.abspath(a['development_data_filename'])
        lm_file = os.path.abspath(a['trg_language_model_filename'])
        lm_order = int(a['trg_language_model_order'])
        lm_type = int(a['trg_language_model_type'])
        orig_moses_ini = os.path.abspath(a['moses_ini_file'])

        if not os.path.exists(orig_moses_ini):
            raise Exception, "Error: Input moses.ini does not exist"

        workdir = os.path.abspath(config['mert_working_dir'])
        #simply call the training perl script
        #remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        #local vars
        moses_install_dir = os.path.abspath(config['moses_installation_dir'])
        mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl')
        bin_dir = os.path.join(moses_install_dir, 'bin')
        moses_bin = os.path.join(moses_install_dir, 'bin', 'moses')
        src_file = infilename + '.' + config['src_lang']
        ref_file = infilename + '.' + config['trg_lang']
        logfile = os.path.join(workdir, 'log')

        #change the lm configuration in the moses ini: replace the
        #[lmodel-file] section with the supplied language model details
        moses_ini = os.path.join(workdir, 'trained-moses.ini')
        cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s"
        cmd = cmd % locals()
        os.system(cmd)

        #the command
        cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s'
        cmd = cmd % locals()
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()

        #check the moses ini
        new_mosesini = os.path.join(workdir, 'moses.ini')
        if not os.path.exists(new_mosesini):
            raise Exception, 'Failed MERT'

        return {'moses_ini_file': new_mosesini}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'src_lang': 'en',
                         'trg_lang': 'lt',
                         'moses_installation_dir': os.path.abspath('../../../../'),
                         'mert_working_directory': '../../../../../tuning'}
        values = {'development_data_filename': '../../../../../corpus/tune',
                  'moses_ini_file': '../../../../../model/model/moses.ini',
                  'trg_language_model_filename': '../../../../../corpus/train.lt.lm',
                  'trg_language_model_type': 9,
                  'trg_language_model_order': 4}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()

@@ -0,0 +1,72 @@
#!/usr/bin/env python

import os, shutil, subprocess

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['trg_lang'] = args['trg_lang']
    result['moses_installation_dir'] = args['moses_installation_dir']
    result['external_bin_dir'] = args['giza_installation_dir']
    result['model_directory'] = args['translation_model_directory']
    return result


def initialise(config):
    def process(a, s):
        infilename = os.path.abspath(a['training_data_filename'])
        workdir = os.path.abspath(config['model_directory'])
        #simply call the training perl script
        #remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        #local vars
        train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl'
        src_lang = config['src_lang'].lower()
        trg_lang = config['trg_lang'].lower()
        external_bin = os.path.abspath(config['external_bin_dir'])

        #create a dummy lm file: train-model.perl requires an LM entry here,
        #but the real language model is substituted later by the MERT component
        dummy_lmfile = workdir + os.sep + 'dummy.lm'
        f = open(dummy_lmfile, 'w')
        print >> f, "dummy lm file"
        f.close()
        logfile = workdir + os.sep + 'log'

        #the command
        cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s'
        cmd = cmd % locals()

        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()

        #check the moses ini
        mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini'
        if not os.path.exists(mosesini):
            raise Exception, 'Failed training model'

        return {'moses_ini_file': mosesini}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'src_lang': 'en',
                         'trg_lang': 'lt',
                         'moses_installation_dir': os.environ['MOSES_HOME'],
                         'giza_installation_dir': os.environ['GIZA_HOME'],
                         'translation_model_directory': 'model-dir'}
        values = {'training_data_filename': '/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()

@@ -0,0 +1,43 @@
#!/usr/bin/env python

import os

from tokenizer import Tokenizer
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['src_tokenisation_dir'] = args['src_tokenisation_dir']
    result['moses_installation_dir'] = args['moses_installation_dir']
    return result


def initialise(config):
    def process(a, s):
        infilename = a['src_filename']
        outfilename = Tokenizer.batch_tokenise(
            config['src_lang'],
            config['moses_installation_dir'],
            infilename,
            config['src_tokenisation_dir'])
        return {'tokenised_src_filename': outfilename}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'src_lang': 'de',
                         'src_tokenisation_dir': 'tmptok',
                         'moses_installation_dir': os.path.abspath('../../../../')}
        values = {'src_filename': 'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()

@@ -0,0 +1,3 @@
asdfweoih
awfwoeijf awefo
what's this

@@ -0,0 +1,36 @@
#!/usr/bin/env python

import sys, os, subprocess


class Tokenizer:
    @staticmethod
    def batch_tokenise(lang, mosesdir, infilename, workdir):
        print "Tokenizing [%s] in working directory [%s]..." % (infilename, workdir)
        if not os.path.exists(workdir):
            os.makedirs(workdir)
        tok = Tokenizer(lang, mosesdir)
        basefilename = os.path.basename(infilename)
        outfilename = workdir + os.sep + basefilename + '.tok'
        tok.file_tokenise(infilename, outfilename)
        return outfilename

    def __init__(self, lang, mosesdir):
        self.arrows = None
        self.lang = lang
        #check the perl tokenizer is here
        #path = os.path.dirname(os.path.abspath(__file__))
        path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer'
        self.perltok = path + os.sep + 'tokenizer.perl'
        if not os.path.exists(path):
            raise Exception, "Perl tokenizer does not exist"

    def file_tokenise(self, infilename, outfilename):
        cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename)
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()


if __name__ == '__main__':
    #do some test
    pass

@@ -0,0 +1,43 @@
#!/usr/bin/env python

import os

from tokenizer import Tokenizer
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['trg_lang'] = args['trg_lang']
    result['trg_tokenisation_dir'] = args['trg_tokenisation_dir']
    result['moses_installation_dir'] = args['moses_installation_dir']
    return result


def initialise(config):
    def process(a, s):
        infilename = a['trg_filename']
        outfilename = Tokenizer.batch_tokenise(
            config['trg_lang'],
            config['moses_installation_dir'],
            infilename,
            config['trg_tokenisation_dir'])
        return {'tokenised_trg_filename': outfilename}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'trg_lang': 'de',
                         'trg_tokenisation_dir': 'tmptoktrg',
                         'moses_installation_dir': os.path.abspath('../../../../')}
        values = {'trg_filename': 'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()