Added arrow based Moses training pipeline demonstration program to contrib.

This commit is contained in:
Ian Johnson 2013-03-06 13:37:41 +00:00
parent 0afd06cdbd
commit f2536cddff
25 changed files with 1005 additions and 0 deletions

.gitmodules

@@ -0,0 +1,3 @@
[submodule "contrib/arrow-pipelines/python/libs/pypeline"]
	path = contrib/arrow-pipelines/python/libs/pypeline
	url = git://github.com/ianj-als/pypeline.git

@@ -0,0 +1,32 @@
Arrow Based Moses Training Pipeline
===================================
To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following commands:
$ git submodule init
$ git submodule update
This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline:
$ cd libs/pypeline
$ python setup.py install
Alternatively, you can set an appropriate PYTHONPATH environment variable so that the Pypeline library can be found.
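For example, assuming the Pypeline package can be imported directly from the submodule checkout (an illustrative path):
$ export PYTHONPATH=$PWD/libs/pypeline:$PYTHONPATH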
This demonstration implements the training pipeline shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia.
Three environment variables need to be set before the manager.py script can be run (example settings are shown after the list):
- MOSES_HOME : The directory where Moses has been cloned, or installed,
- IRSTLM : The installation directory of your IRSTLM, and
- GIZA_HOME : The installation directory of GIZA++.
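For example (these paths are purely illustrative and depend on where you built or installed each tool):
$ export MOSES_HOME=/opt/mosesdecoder
$ export IRSTLM=/opt/irstlm
$ export GIZA_HOME=/opt/giza-pp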
The manager.py script takes four positional command-line arguments:
- The source language code,
- The target language code,
- The source corpus file. This file *must* be cleaned prior to use, and
- The target corpus file. This file *must* be cleaned prior to use.
For example, run the manager.py script with:
$ python manager.py en lt cleantrain.en cleantrain.lt

@@ -0,0 +1 @@
Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb

@@ -0,0 +1,192 @@
import logging
import os

from concurrent.futures import Future, ThreadPoolExecutor
from functools import partial

from pypeline.helpers.parallel_helpers import eval_pipeline, \
    cons_function_component, \
    cons_wire, \
    cons_split_wire, \
    cons_unsplit_wire, \
    cons_dictionary_wire


#
# Some logging please
#
FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s'
logging.basicConfig(format = FORMAT, level = logging.DEBUG)
logger = logging.getLogger("manager")


# Build the pipeline components
def build_components(components, configuration, executor):
    pipeline_components = dict()
    pipeline_configuration = dict()

    for component_id, module_name in components.items():
        logger.info("Loading [%s] component from [%s]..." % (component_id, module_name))

        module = __import__(module_name, fromlist = ['configure', 'initialise'])

        # Component builds its own configuration object
        config_func = getattr(module, 'configure')
        component_config = config_func(configuration)
        pipeline_configuration.update(component_config)

        # Now build the component
        init_func = getattr(module, 'initialise')
        component_function = init_func(component_config)

        # A wrapper for the component's function that logs each invocation
        def get_component_function_wrapper(inner_function, comp_id, mod_name):
            def component_function_wrapper(a, s):
                logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \
                    (comp_id, mod_name, a, s))
                return inner_function(a, s)
            return component_function_wrapper

        # Arrowize the component
        component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name))

        # And store
        pipeline_components[component_id] = component

    return pipeline_components, pipeline_configuration


# Go!
def main(src_lang, trg_lang, src_filename, trg_filename):
    # Global configuration
    # One day, this configuration shall be constructed from
    # command line options, or a properties file.
    configuration = {
        'moses_installation_dir': os.environ['MOSES_HOME'],
        'irstlm_installation_dir': os.environ['IRSTLM'],
        'giza_installation_dir': os.environ['GIZA_HOME'],
        'src_lang': src_lang,
        'src_tokenisation_dir': './tokenisation',
        'trg_lang': trg_lang,
        'trg_tokenisation_dir': './tokenisation',
        'segment_length_limit': 60,
        'irstlm_smoothing_method': 'improved-kneser-ney',
        'language_model_directory': './language-model',
        'translation_model_directory': './translation-model',
        'mert_working_directory': './mert',
        'evaluation_data_size': 100,
        'development_data_size': 100
    }

    # The modules to load
    # In the future, the components shall be specified in some kind of
    # pipeline description file.
    component_modules = {
        'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
        'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
        'cleanup': 'training.components.cleanup.cleanup',
        'data_split': 'training.components.data_split.data_split',
        'irstlm_build': 'training.components.irstlm_build.irstlm_build',
        'model_training': 'training.components.model_training.model_training',
        'mert': 'training.components.mert.mert'
    }

    # The thread pool
    executor = ThreadPoolExecutor(max_workers = 3)

    # Phew, build the required components
    components, component_config = build_components(component_modules, configuration, executor)

    #
    # Wire up components
    # Description of wiring should be, in the future, alongside the component
    # specification in some kind of configuration file. Components shall be
    # declared then used, i.e., bind a component instance to a unique component
    # identifier, then wire component instances together by identifier.
    #
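    #
    # A note on the pypeline combinators used below (semantics as assumed by
    # this script; see the pypeline library for the authoritative definitions):
    #   c1 >> c2              : composition, feed c1's output into c2
    #   c1 & c2               : fan-out, run c1 and c2 over copies of the same
    #                           input, yielding a (top, bottom) pair
    #   cons_split_wire()     : duplicate a single value into a (top, bottom) pair
    #   cons_unsplit_wire(f)  : merge a (top, bottom) pair into one value with f(t, b)
    #   c.first()/c.second()  : apply c to only the first/second element of a
    #                           pair, passing the other element through unchanged
    #   cons_wire(f)          : a plain wire that maps the value with f(a, s)
    #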
    #
    # Tokenisation of source and target...
    #

    # IRSTLM Build components
    irstlm_build_component = cons_split_wire() >> \
        (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \
         components['irstlm_build']).second() >> \
        cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'],
                                        'trg_language_model_filename': b['compiled_lm_filename']})

    # The complete tokenisation component
    tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \
        irstlm_build_component.second() >> \
        cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'],
                                        'trg_filename': b['tokenised_trg_filename'],
                                        'trg_language_model_filename': b['trg_language_model_filename']})

    #
    # Cleanup and Data Splitting...
    #

    #
    # A function that clips off the last '.' delimited string
    #
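    # e.g. clip_last_bit("corpus.clean.train.en") -> "corpus.clean.train"
    # (illustrative filename; this strips the trailing language code so that
    # tools such as train-model.perl can append the language codes themselves)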
    def clip_last_bit(filename):
        bn = os.path.basename(filename)
        directory = os.path.dirname(filename)
        bits = bn.split(".")
        bits.pop()
        return os.path.join(directory, ".".join(bits))

    cleanup_datasplit_component = components['cleanup'] >> \
        cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'],
                                'trg_filename': a['cleaned_trg_filename']}) >> \
        components['data_split'] >> \
        cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']),
                                'eval_src_filename': a['eval_src_filename'],
                                'eval_trg_filename': a['eval_trg_filename']})

    #
    # Translation model training
    #
    translation_model_component = cons_split_wire() >> \
        components['model_training'].first() >> \
        cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
                                        'development_data_filename': b['eval_src_filename']})

    #
    # The whole pipeline
    #
    pipeline = tokenisation_component >> \
        cons_split_wire() >> \
        (cleanup_datasplit_component >> translation_model_component).first() >> \
        cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
                                        'development_data_filename': clip_last_bit(t['development_data_filename']),
                                        'trg_language_model_filename': b['trg_language_model_filename'],
                                        'trg_language_model_order': 3,
                                        'trg_language_model_type': 9}) >> \
        components['mert']

    #
    # The input to the pipeline
    #
    value = {'src_filename': src_filename,
             'trg_filename': trg_filename}

    #
    # Evaluate the pipeline
    #
    logger.info("Evaluating pipeline with input [%s]..." % value)
    new_value = eval_pipeline(executor, pipeline, value, component_config)

    #
    # Wait for all components to finish
    #
    executor.shutdown(True)

    logger.info("Pipeline evaluated to %s" % new_value)


if __name__ == '__main__':
    import sys

    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])

@@ -0,0 +1,11 @@
import subprocess


def cat(filename, content):
    fh = open(filename, "w")
    for line in content:
        #print(line, file=fh)
        print >> fh, line
    fh.close()


def diff(filename1, filename2):
    # check_output raises CalledProcessError if diff finds any differences
    subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT)

@@ -0,0 +1,125 @@
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['segment_length'] = args['segment_length_limit']
    return result


def initialise(config):
    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        def _short(line):
            n = 0
            for c in line:
                if c == " ":
                    n += 1
            #print(line, ":", n)
            return n < limit

        for (l1, l2) in zip(ifh1, ifh2):
            if _short(l1) and _short(l2):
                print >>ofh1, l1,
                print >>ofh2, l2,

    def _make_cleaned_filename(filename):
        bits = filename.split(".")
        bits[-1] = "clean"
        return ".".join(bits)

    def _filter_main(value, config):
        limit = config['segment_length']
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            input_src_filename = value['src_filename']
            input_trg_filename = value['trg_filename']

            print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)

            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")

            cleaned_src_filename = _make_cleaned_filename(input_src_filename)
            cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)

            ofh1 = open(cleaned_src_filename, "w")
            ofh2 = open(cleaned_trg_filename, "w")

            _filter(limit, ifh1, ofh1, ifh2, ofh2)

            return {'cleaned_src_filename': cleaned_src_filename,
                    'cleaned_trg_filename': cleaned_trg_filename}
        finally:
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return _filter_main


if __name__ == '__main__':
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline

    def _test_main():
        configuration = {'segment_length_limit': 20}

        src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'cleaned_src_file_expected': src_filename[1] + ".expected",
            'cleaned_trg_file_expected': trg_filename[1] + ".expected"
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        try:
            thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
            thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
        finally:
            os.unlink(output['cleaned_src_filename'])
            os.unlink(output['cleaned_trg_filename'])

    def _line(line_lengths):
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
        #expected output:
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))

    def _cleanup_files(box_eval):
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass

    _test_main()

@@ -0,0 +1,109 @@
# This module uses the Python 3 style print function, so make that syntax
# valid under Python 2 as well
from __future__ import print_function

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['segment_length'] = args['segment_length_limit']
    return result


def initialise(config):
    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        def _short(line):
            n = 0
            for c in line:
                if c == " ":
                    n += 1
            #print(line, ":", n)
            return n < limit

        for (l1, l2) in zip(ifh1, ifh2):
            if _short(l1) and _short(l2):
                print(l1, end='', file=ofh1)
                print(l2, end='', file=ofh2)

    def _filter_main(config, value):
        limit = config['segment_length']
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            ifh1 = open(value['src_filename'], "r")
            ifh2 = open(value['trg_filename'], "r")
            ofh1 = open(value['cleaned_src_filename'], "w")
            ofh2 = open(value['cleaned_trg_filename'], "w")

            _filter(limit, ifh1, ofh1, ifh2, ofh2)

            return {'cleaned_src_filename': value['cleaned_src_filename'],
                    'cleaned_trg_filename': value['cleaned_trg_filename']}
        finally:
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return cons_function_component(_filter_main)


if __name__ == '__main__':
    import os
    import tempfile
    import training.components.shared.test as thelp

    def _test_main():
        configuration = {'segment_length_limit': 20}

        src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'cleaned_src_filename': src_filename[1] + ".clean",
            'cleaned_trg_filename': trg_filename[1] + ".clean",
            'cleaned_src_file_expected': src_filename[1] + ".expected",
            'cleaned_trg_file_expected': trg_filename[1] + ".expected"
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        from pypeline.helpers.helpers import run_pipeline

        box_config = configure(configuration)
        box = initialise(box_config)

        run_pipeline(box, box_config, box_eval)
        thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename'])
        thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename'])

    def _line(line_lengths):
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
        #expected output:
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))

    def _cleanup_files(box_eval):
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass

    _test_main()

@@ -0,0 +1,146 @@
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['evaluate_size'] = args['evaluation_data_size']
    result['development_size'] = args['development_data_size']
    return result


def initialise(config):
    def _copy(size, inp, ofh1, ofh2):
        try:
            while size != 0:
                (l1, l2) = inp.next()
                print >>ofh1, l1,
                print >>ofh2, l2,
                size -= 1
        except StopIteration:
            pass
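    # An illustrative example of the renaming performed below, assuming the
    # language code is the second-to-last '.'-separated component:
    #   _make_split_filename("corpus.en.clean", "devel") -> "corpus.clean.devel.en"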
    def _make_split_filename(filename, data_set):
        bits = filename.split(".")
        last = bits.pop()
        lang_code = bits.pop()

        bits.append(last)
        bits.append(data_set)
        bits.append(lang_code)

        new_filename = ".".join(bits)
        return new_filename

    def _splitter_main(value, config):
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            input_src_filename = value['src_filename']
            input_trg_filename = value['trg_filename']

            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")
            inp = iter(zip(ifh1, ifh2))

            result = {}
            for (data_set, size) in [
                ('devel', config['development_size']),
                ('eval', config['evaluate_size']),
                ('train', -1)
            ]:
                output_src_filename = _make_split_filename(input_src_filename, data_set)
                output_trg_filename = _make_split_filename(input_trg_filename, data_set)
                ofh1 = open(output_src_filename, "w")
                ofh2 = open(output_trg_filename, "w")

                _copy(size, inp, ofh1, ofh2)

                result[data_set + '_src_filename'] = output_src_filename
                result[data_set + '_trg_filename'] = output_trg_filename

            return result
        finally:
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return _splitter_main


if __name__ == '__main__':
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline

    def _test_main():
        configuration = {
            'evaluation_data_size': 7,
            'development_data_size': 13,
        }

        src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'devel_src_expected': src_filename[1] + ".devel.expected",
            'devel_trg_expected': trg_filename[1] + ".devel.expected",
            'eval_src_expected': src_filename[1] + ".eval.expected",
            'eval_trg_expected': trg_filename[1] + ".eval.expected",
            'train_src_expected': src_filename[1] + ".train.expected",
            'train_trg_expected': trg_filename[1] + ".train.expected",
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        for data_set in ['devel', 'eval', 'train']:
            for lang in ['src', 'trg']:
                filename = output[data_set + '_' + lang + '_filename']
                filename_expected = box_eval[data_set + '_' + lang + '_expected']
                thelp.diff(filename_expected, filename)

    def _line(line_lengths):
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line(range(50)))
        thelp.cat(box_eval['trg_filename'], _line(range(50)))
        #expected output:
        thelp.cat(box_eval['devel_src_expected'], _line(range(0, 13)))
        thelp.cat(box_eval['devel_trg_expected'], _line(range(0, 13)))
        thelp.cat(box_eval['eval_src_expected'], _line(range(13, 20)))
        thelp.cat(box_eval['eval_trg_expected'], _line(range(13, 20)))
        thelp.cat(box_eval['train_src_expected'], _line(range(20, 50)))
        thelp.cat(box_eval['train_trg_expected'], _line(range(20, 50)))

    def _cleanup_files(box_eval):
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass

    _test_main()

@@ -0,0 +1,106 @@
import os
import shutil
import subprocess
import tempfile

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    config = dict()
    config['irstlm_install_directory'] = args['irstlm_installation_dir']
    config['smoothing_method'] = args['irstlm_smoothing_method']
    config['lm_directory'] = args['language_model_directory']
    return config


def initialise(config):
    def process(a, s):
        # Create the LM directory if we need to
        if os.path.exists(s['lm_directory']) is False:
            os.makedirs(s['lm_directory'])

        # The filename of the file to chew through
        start_end_input_filename = a['input_filename']
        if os.path.exists(start_end_input_filename) is False:
            raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename)

        # Derive the output file name for the add start-end marker processor
        filename_bits = os.path.basename(start_end_input_filename).split(".")
        filename_bits[2] = "sb"
        start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))

        # Derive the output file name of the LM build
        filename_bits[2] = "lm"
        lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))

        # Derive the compiled LM file name
        filename_bits[2] = "arpa"
        compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
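        # An illustrative example of the three names derived above, assuming
        # an input named like the test at the bottom of this file:
        #   news-commentary-v7.fr-en.tok.en -> news-commentary-v7.fr-en.sb.en
        #                                      news-commentary-v7.fr-en.lm.en
        #                                      news-commentary-v7.fr-en.arpa.en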
        # First thing to do is add start and end markers
        start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")]
        infile = open(start_end_input_filename, 'r')
        outfile = open(start_end_output_filename, 'w')
        print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline)
        return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile)
        if return_code:
            raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \
                (start_end_input_filename, start_end_output_filename, return_code))

        # Next build the language model
        tmp_dir = tempfile.mkdtemp(dir = "/tmp")
        try:
            build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"),
                                "-i", start_end_output_filename,
                                "-t", tmp_dir,
                                "-p",
                                "-s", s['smoothing_method'],
                                "-o", lm_filename]
            print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline)
            return_code = subprocess.check_call(build_lm_cmdline)
            if return_code:
                raise Exception("IRST language model failed to build: return code = [%d]" % return_code)
        finally:
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)

        # Compile the LM: build-lm.sh gzips its output, so the LM filename
        # gains a ".gz" suffix
        lm_filename = lm_filename + ".gz"
        compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"),
                              "--text", "yes",
                              lm_filename,
                              compiled_lm_filename]
        print "IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline)
        return_code = subprocess.check_call(compile_lm_cmdline)
        if return_code:
            raise Exception("IRST language model compilation failed: return code = [%d]" % return_code)

        output = {'add_start_end_filename': start_end_output_filename,
                  'lm_filename': lm_filename,
                  'compiled_lm_filename': compiled_lm_filename}

        print "IRSTLM Build: Output = %s" % output

        return output

    return process


if __name__ == '__main__':
    from pypeline.helpers.helpers import eval_pipeline

    lm_dir = os.environ["PWD"]
    configuration = {'irstlm_installation_dir': os.environ["IRSTLM"],
                     'irstlm_smoothing_method': 'improved-kneser-ney',
                     'language_model_directory': lm_dir}
    component_config = configure(configuration)
    component = initialise(component_config)

    value = eval_pipeline(component,
                          {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
                          component_config)
    target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
              'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'),
              'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')}
    print "Target: %s" % target
    if value != target:
        raise Exception("Massive fail!")

@@ -0,0 +1,83 @@
#!/usr/bin/env python

import os, shutil, subprocess

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['trg_lang'] = args['trg_lang']
    result['moses_installation_dir'] = args['moses_installation_dir']
    result['mert_working_dir'] = args['mert_working_directory']
    return result


def initialise(config):
    def process(a, s):
        infilename = os.path.abspath(a['development_data_filename'])
        lm_file = os.path.abspath(a['trg_language_model_filename'])
        lm_order = int(a['trg_language_model_order'])
        lm_type = int(a['trg_language_model_type'])
        orig_moses_ini = os.path.abspath(a['moses_ini_file'])

        if not os.path.exists(orig_moses_ini):
            raise Exception, "Error: Input moses.ini does not exist"

        workdir = os.path.abspath(config['mert_working_dir'])
        #simply call the training perl script
        #remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        #local vars
        moses_install_dir = os.path.abspath(config['moses_installation_dir'])
        mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl')
        bin_dir = os.path.join(moses_install_dir, 'bin')
        moses_bin = os.path.join(moses_install_dir, 'bin', 'moses')
        src_file = infilename + '.' + config['src_lang']
        ref_file = infilename + '.' + config['trg_lang']
        logfile = os.path.join(workdir, 'log')

        #change the lm configuration in the moses ini: replace the
        #[lmodel-file] section with the supplied language model details
        moses_ini = os.path.join(workdir, 'trained-moses.ini')
        cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s"
        cmd = cmd % locals()
        os.system(cmd)

        #the command
        cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s'
        cmd = cmd % locals()
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()

        #check the moses ini
        new_mosesini = os.path.join(workdir, 'moses.ini')
        if not os.path.exists(new_mosesini):
            raise Exception, 'Failed MERT'

        return {'moses_ini_file': new_mosesini}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'src_lang': 'en',
                         'trg_lang': 'lt',
                         'moses_installation_dir': os.path.abspath('../../../../'),
                         'mert_working_directory': '../../../../../tuning'}
        values = {'development_data_filename': '../../../../../corpus/tune',
                  'moses_ini_file': '../../../../../model/model/moses.ini',
                  'trg_language_model_filename': '../../../../../corpus/train.lt.lm',
                  'trg_language_model_type': 9,
                  'trg_language_model_order': 4}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()

@@ -0,0 +1,72 @@
#!/usr/bin/env python

import os, shutil, subprocess

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['trg_lang'] = args['trg_lang']
    result['moses_installation_dir'] = args['moses_installation_dir']
    result['external_bin_dir'] = args['giza_installation_dir']
    result['model_directory'] = args['translation_model_directory']
    return result


def initialise(config):
    def process(a, s):
        infilename = os.path.abspath(a['training_data_filename'])
        workdir = os.path.abspath(config['model_directory'])
        #simply call the training perl script
        #remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        #local vars
        train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl'
        src_lang = config['src_lang'].lower()
        trg_lang = config['trg_lang'].lower()
        external_bin = os.path.abspath(config['external_bin_dir'])

        #create a dummy lm file: train-model.perl requires an LM entry here,
        #but the real language model is substituted later by the MERT component
        dummy_lmfile = workdir + os.sep + 'dummy.lm'
        f = open(dummy_lmfile, 'w')
        print >> f, "dummy lm file"
        f.close()
        logfile = workdir + os.sep + 'log'

        #the command
        cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s'
        cmd = cmd % locals()

        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()

        #check the moses ini
        mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini'
        if not os.path.exists(mosesini):
            raise Exception, 'Failed training model'

        return {'moses_ini_file': mosesini}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'src_lang': 'en',
                         'trg_lang': 'lt',
                         'moses_installation_dir': os.environ['MOSES_HOME'],
                         'giza_installation_dir': os.environ['GIZA_HOME'],
                         'translation_model_directory': 'model-dir'}
        values = {'training_data_filename': '/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()

@@ -0,0 +1,43 @@
#!/usr/bin/env python

import os

from tokenizer import Tokenizer
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['src_tokenisation_dir'] = args['src_tokenisation_dir']
    result['moses_installation_dir'] = args['moses_installation_dir']
    return result


def initialise(config):
    def process(a, s):
        infilename = a['src_filename']
        outfilename = Tokenizer.batch_tokenise(
            config['src_lang'],
            config['moses_installation_dir'],
            infilename,
            config['src_tokenisation_dir'])
        return {'tokenised_src_filename': outfilename}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'src_lang': 'de',
                         'src_tokenisation_dir': 'tmptok',
                         'moses_installation_dir': os.path.abspath('../../../../')}
        values = {'src_filename': 'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()

@@ -0,0 +1,3 @@
asdfweoih
awfwoeijf awefo
what's this

@@ -0,0 +1,36 @@
#!/usr/bin/env python

import sys, os, subprocess


class Tokenizer:
    @staticmethod
    def batch_tokenise(lang, mosesdir, infilename, workdir):
        print "Tokenizing [%s] in working directory [%s]..." % (infilename, workdir)
        if not os.path.exists(workdir):
            os.makedirs(workdir)
        tok = Tokenizer(lang, mosesdir)
        basefilename = os.path.basename(infilename)
        outfilename = workdir + os.sep + basefilename + '.tok'
        tok.file_tokenise(infilename, outfilename)
        return outfilename

    def __init__(self, lang, mosesdir):
        self.arrows = None
        self.lang = lang
        #check the perl tokenizer is here
        #path = os.path.dirname(os.path.abspath(__file__))
        path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer'
        self.perltok = path + os.sep + 'tokenizer.perl'
        if not os.path.exists(path):
            raise Exception, "Perl tokenizer does not exist"

    def file_tokenise(self, infilename, outfilename):
        cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename)
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()


if __name__ == '__main__':
    #do some test
    pass

@@ -0,0 +1,43 @@
#!/usr/bin/env python

import os

from tokenizer import Tokenizer
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['trg_lang'] = args['trg_lang']
    result['trg_tokenisation_dir'] = args['trg_tokenisation_dir']
    result['moses_installation_dir'] = args['moses_installation_dir']
    return result


def initialise(config):
    def process(a, s):
        infilename = a['trg_filename']
        outfilename = Tokenizer.batch_tokenise(
            config['trg_lang'],
            config['moses_installation_dir'],
            infilename,
            config['trg_tokenisation_dir'])
        return {'tokenised_trg_filename': outfilename}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'trg_lang': 'de',
                         'trg_tokenisation_dir': 'tmptoktrg',
                         'moses_installation_dir': os.path.abspath('../../../../')}
        values = {'trg_filename': 'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    #do some test
    __test()