Added PCL based arrow pipelining

Ian Johnson 2013-05-22 10:25:25 +01:00
parent 2f38746f6a
commit 166eb7d0e0
46 changed files with 70404 additions and 811 deletions

.gitmodules

@@ -1,3 +1,3 @@
-[submodule "contrib/arrow-pipelines/python/libs/pypeline"]
-	path = contrib/arrow-pipelines/python/libs/pypeline
-	url = git://github.com/ianj-als/pypeline.git
+[submodule "contrib/arrow-pipelines/python/pcl"]
+	path = contrib/arrow-pipelines/python/pcl
+	url = git://github.com/ianj-als/pcl.git


@@ -0,0 +1,10 @@
[Configuration]
tokeniser.src.language = en
tokeniser.src.tokenisation_dir = test_data/src_trg_tokenizer/tokenised
tokeniser.trg.language = lt
tokeniser.trg.tokenisation_dir = test_data/src_trg_tokenizer/tokenised
tokeniser.moses.installation = /opt/moses
[Inputs]
src_filename = test_data/src_trg_tokenizer/cleantrain.en
trg_filename = test_data/src_trg_tokenizer/cleantrain.lt


@@ -0,0 +1,40 @@
#
# Import all of the components to be composed
#
import wrappers.tokenizer.tokenizer as tokeniser
#
# Component definition
#
# +---------+ +---------+ +---------+ +---------+
# src_filename -->+ +--> filename -->+-- src --+--> tokenised_filename -->+---------+--> tokenised_filename -->+ +--> tokenised_src_filename
# | | | | | | | |
# trg_filename -->+ +--> filename -->+---------+-------> filename ------->+-- trg --+--> tokenised_filename -->+ +--> tokenised_trg_filename
# +---------+ +---------+ +---------+ +---------+
# Config: {language::String, Config: {language::String,
# tokenisation_dir::String, tokenisation_dir::String,
# moses_installation_dir::String} moses_installation_dir::String}
#
component src_trg_tokeniser
inputs (src_filename), (trg_filename)
outputs (tokenised_src_filename), (tokenised_trg_filename)
configuration tokeniser.src.language,
tokeniser.src.tokenisation_dir,
tokeniser.trg.language,
tokeniser.trg.tokenisation_dir,
tokeniser.moses.installation
declare
src_tokeniser := new tokeniser with
tokeniser.src.language -> language,
tokeniser.src.tokenisation_dir -> tokenisation_dir,
tokeniser.moses.installation -> moses_installation_dir
trg_tokeniser := new tokeniser with
tokeniser.trg.language -> language,
tokeniser.trg.tokenisation_dir -> tokenisation_dir,
tokeniser.moses.installation -> moses_installation_dir
as
wire (src_filename -> filename),
(trg_filename -> filename) >>>
(src_tokeniser *** trg_tokeniser) >>>
wire (tokenised_filename -> tokenised_src_filename),
(tokenised_filename -> tokenised_trg_filename)
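
As the diagram above shows, the composition first renames each input to the filename port a tokeniser instance expects, then runs the two tokeniser instances side by side on the (top, bottom) pair with ***, and finally relabels each branch's tokenised_filename as the source or target output. As a purely illustrative sketch, using the paths from the accompanying configuration file and the filename scheme of the tokenizer wrapper added later in this commit, the component maps

  ({'src_filename': 'test_data/src_trg_tokenizer/cleantrain.en'},
   {'trg_filename': 'test_data/src_trg_tokenizer/cleantrain.lt'})

to

  ({'tokenised_src_filename': 'test_data/src_trg_tokenizer/tokenised/cleantrain.tok.en'},
   {'tokenised_trg_filename': 'test_data/src_trg_tokenizer/tokenised/cleantrain.tok.lt'})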

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,15 @@
[Configuration]
model_training.max_segment_length = 20
model_training.corpus.development_size = 4500
model_training.corpus.evaluation_size = 5000
model_training.src.language = en
model_training.trg.language = lt
model_training.method.alignment = grow-diag-final-and
model_training.method.reordering = msd-bidirectional-fe
model_training.moses.installation = /opt/moses
model_training.giza.installation = /opt/moses/giza++-v1.0.7
model_training.translation_model.dir = test_data/translation_model_training/translation_model
[Inputs]
src_filename = test_data/translation_model_training/cleantrain.en
trg_filename = test_data/translation_model_training/cleantrain.lt


@@ -0,0 +1,68 @@
#
# Import all of the components to be composed
#
import wrappers.cleanup.cleanup as cleanup
import wrappers.data_split.data_split as data_split
import wrappers.model_training.model_training as model_training
#
# Component definition
#
# {cleaned_src_filename, {src_filename, {[devel|eval|train]_src_filename, {src_filename, {moses_ini_file,
# cleaned_trg_filename} trg_filename} [devel|eval|train]_trg_filename} trg_filename} evaluation_data_filename}
# | | | | +-------+ |
# +-------+ | | +-------+ | +-------+ V | Model | {moses_ini_file} +-------+ V
# | Clean | V V | Data | V | +---------------->+ Train +----------------->+ Merge +----->
# {src_filename, -->+ +----->+ +------------->+ Split | +-------+ +---+---+
# trg_filename} | Up | | Split | | +---\ Config: {[src|trg]_language::String, ^
# +-------+ +-------+ +-------+ | alignment_method::String, |
# Config: {segment_length::Int} Config: {development_size::Int, | reordering_method::String, |
# evaluation_size::Int} | giza_installation_dir::String, |
# | model_directory::String} |
# \--------------------------------------------/
#
component translation_model_training
inputs src_filename, trg_filename
outputs evaluation_data_filename, moses_ini_filename
configuration model_training.max_segment_length,
model_training.corpus.development_size,
model_training.corpus.evaluation_size,
model_training.src.language,
model_training.trg.language,
model_training.method.alignment,
model_training.method.reordering,
model_training.moses.installation,
model_training.giza.installation,
model_training.translation_model.dir
declare
cleanup := new cleanup with
model_training.max_segment_length -> segment_length_limit
data_split := new data_split with
model_training.corpus.development_size -> development_data_size,
model_training.corpus.evaluation_size -> evaluation_data_size
model_training := new model_training with
model_training.src.language -> source_language,
model_training.trg.language -> target_language,
model_training.method.alignment -> alignment_method,
model_training.method.reordering -> reordering_method,
model_training.moses.installation -> moses_installation_dir,
model_training.giza.installation -> giza_installation_dir,
model_training.translation_model.dir -> translation_model_directory
as
cleanup >>>
wire cleaned_src_filename -> src_filename,
cleaned_trg_filename -> trg_filename >>>
data_split >>>
wire devel_src_filename -> devel_src_filename,
eval_src_filename -> evaluation_data_filename,
train_trg_filename -> _,
train_src_filename -> _,
eval_trg_filename -> _,
devel_trg_filename -> devel_trg_filename >>>
((wire devel_src_filename -> src_filename,
devel_trg_filename -> trg_filename,
evaluation_data_filename -> _ >>>
model_training) &&&
wire evaluation_data_filename -> evaluation_data_filename) >>>
merge top[moses_ini_filename] -> moses_ini_filename,
bottom[evaluation_data_filename] -> evaluation_data_filename
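
In the composition above, &&& fans the record coming out of the data split into two branches: the top branch renames the devel files to the src/trg ports and runs model_training, while the bottom branch simply carries evaluation_data_filename through. The closing merge then assembles one record from named slots of the two branches; a minimal sketch of what that merge expresses, written as a plain Python function over the (top, bottom) pair (illustrative only, not part of the commit):

def merge_branches(top, bottom):
    # top is the model_training output, bottom is the pass-through wire
    return {'moses_ini_filename': top['moses_ini_filename'],
            'evaluation_data_filename': bottom['evaluation_data_filename']}

This mirrors the cons_unsplit_wire(lambda t, b: ...) idiom used by the hand-wired pipeline manager that this commit removes.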


@@ -0,0 +1,129 @@
def get_name():
return 'cleanup'
def get_inputs():
return ['src_filename', 'trg_filename']
def get_outputs():
return ['cleaned_src_filename', 'cleaned_trg_filename']
def get_configuration():
return ['segment_length_limit']
def configure(args):
return {'segment_length' : args['segment_length_limit']}
def initialise(config):
def _filter(limit, ifh1, ofh1, ifh2, ofh2):
def _short(line):
n = 0
for c in line:
if c == " ":
n += 1
return n < limit
for (l1, l2) in zip(ifh1, ifh2):
if _short(l1) and _short(l2):
print >>ofh1, l1,
print >>ofh2, l2,
def _make_cleaned_filename(filename):
bits = filename.split(".")
bits.insert(-1, "clean")
return ".".join(bits)
def _filter_main(a, s):
limit = config['segment_length']
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
input_src_filename = a['src_filename']
input_trg_filename = a['trg_filename']
print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)
ifh1 = open(input_src_filename, "r")
ifh2 = open(input_trg_filename, "r")
cleaned_src_filename = _make_cleaned_filename(input_src_filename)
cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
ofh1 = open(cleaned_src_filename, "w")
ofh2 = open(cleaned_trg_filename, "w")
_filter(limit, ifh1, ofh1, ifh2, ofh2)
return {'cleaned_src_filename': cleaned_src_filename,
'cleaned_trg_filename': cleaned_trg_filename}
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return _filter_main
if __name__ == '__main__':
import os
import tempfile
import test.test as thelp
from pypeline.helpers.helpers import eval_pipeline
def _test_main():
configuration = {'segment_length_limit': 20}
src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
box_eval = {
'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'cleaned_src_file_expected': src_filename[1] + ".expected",
'cleaned_trg_file_expected': trg_filename[1] + ".expected"}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
box_config = configure(configuration)
box = initialise(box_config)
output = eval_pipeline(box, box_eval, box_config)
try:
thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
finally:
os.unlink(output['cleaned_src_filename'])
os.unlink(output['cleaned_trg_filename'])
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()
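
This is the first of several wrappers in this commit that expose the same module-level interface to PCL: get_name/get_inputs/get_outputs/get_configuration describe the component, configure() maps the named configuration values to the component's own settings, and initialise() closes over those settings and returns the (value, state) -> dictionary function that does the work. A minimal skeleton of that contract, for orientation only (all names below are placeholders, not part of the commit):

def get_name():
    return 'example'

def get_inputs():
    return ['in_value']

def get_outputs():
    return ['out_value']

def get_configuration():
    return ['some_option']

def configure(args):
    return {'some_option': args['some_option']}

def initialise(config):
    def process(a, s):
        # a holds the declared inputs, s the pipeline state
        return {'out_value': a['in_value']}
    return process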


@@ -0,0 +1,7 @@
[Configuration]
evaluation_data_size = 7
development_data_size = 13
[Inputs]
src_filename = test_data/data.en
trg_filename = test_data/data.de


@@ -0,0 +1,144 @@
def get_name():
return 'data_split'
def get_inputs():
return ['src_filename', 'trg_filename']
def get_outputs():
return ['devel_src_filename', 'devel_trg_filename',
'eval_src_filename', 'eval_trg_filename',
'train_src_filename', 'train_trg_filename']
def get_configuration():
return ['evaluation_data_size', 'development_data_size']
def configure(args):
result = {}
result['evaluate_size'] = args['evaluation_data_size']
result['development_size'] = args['development_data_size']
return result
def initialise(config):
def _copy(size, inp, ofh1, ofh2):
try:
while size != 0:
(l1, l2) = inp.next()
print >>ofh1, l1,
print >>ofh2, l2,
size -= 1
except StopIteration:
pass
def _make_split_filename(filename, data_set):
bits = filename.split(".")
bits.insert(-1, data_set)
new_filename = ".".join(bits)
return new_filename
def _splitter_main(a, s):
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
input_src_filename = a['src_filename']
input_trg_filename = a['trg_filename']
ifh1 = open(input_src_filename, "r")
ifh2 = open(input_trg_filename, "r")
inp = iter(zip(ifh1, ifh2))
result = {}
for (data_set, size) in [('devel', config['development_size']),
('eval', config['evaluate_size']),
('train', -1)]:
output_src_filename = _make_split_filename(input_src_filename, data_set)
output_trg_filename = _make_split_filename(input_trg_filename, data_set)
ofh1 = open(output_src_filename, "w")
ofh2 = open(output_trg_filename, "w")
_copy(size, inp, ofh1, ofh2)
result[data_set + '_src_filename'] = output_src_filename
result[data_set + '_trg_filename'] = output_trg_filename
return result
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return _splitter_main
if __name__ == '__main__':
import os
import tempfile
import test.test as thelp
from pypeline.helpers.helpers import eval_pipeline
def _test_main():
configuration = {'evaluation_data_size': 7,
'development_data_size': 13}
src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
box_eval = {'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'devel_src_expected': src_filename[1] + ".devel.expected",
'devel_trg_expected': trg_filename[1] + ".devel.expected",
'eval_src_expected': src_filename[1] + ".eval.expected",
'eval_trg_expected': trg_filename[1] + ".eval.expected",
'train_src_expected': src_filename[1] + ".train.expected",
'train_trg_expected': trg_filename[1] + ".train.expected"}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
box_config = configure(configuration)
box = initialise(box_config)
output = eval_pipeline(box, box_eval, box_config)
for data_set in ['devel', 'eval', 'train']:
for lang in ['src', 'trg']:
filename = output[data_set + '_' + lang + '_filename']
filename_expected = box_eval[data_set + '_' + lang + '_expected']
thelp.diff(filename_expected, filename)
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line(range(50)))
thelp.cat(box_eval['trg_filename'], _line(range(50)))
#expected output:
thelp.cat(box_eval['devel_src_expected'], _line(range(0,13)))
thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13)))
thelp.cat(box_eval['eval_src_expected'], _line(range(13,20)))
thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20)))
thelp.cat(box_eval['train_src_expected'], _line(range(20,50)))
thelp.cat(box_eval['train_trg_expected'], _line(range(20,50)))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()
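
A worked example matching the self-test above: given 50 parallel segments with development_data_size = 13 and evaluation_data_size = 7, the component writes segments 0-12 to the devel pair, segments 13-19 to the eval pair, and the remaining 30 segments to the train pair (a size of -1 means "copy whatever is left"). The output names follow _make_split_filename, for example:

_make_split_filename('corpus.en', 'devel')   # -> 'corpus.devel.en'
_make_split_filename('corpus.en', 'train')   # -> 'corpus.train.en'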


@@ -0,0 +1,50 @@
tok0
tok0 tok1
tok0 tok1 tok2
tok0 tok1 tok2 tok3
tok0 tok1 tok2 tok3 tok4
tok0 tok1 tok2 tok3 tok4 tok5
tok0 tok1 tok2 tok3 tok4 tok5 tok6
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 tok48


@@ -0,0 +1,50 @@
tok0
tok0 tok1
tok0 tok1 tok2
tok0 tok1 tok2 tok3
tok0 tok1 tok2 tok3 tok4
tok0 tok1 tok2 tok3 tok4 tok5
tok0 tok1 tok2 tok3 tok4 tok5 tok6
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 tok48


@@ -3,20 +3,31 @@ import shutil
import subprocess
import tempfile
from pypeline.helpers.helpers import cons_function_component
def get_name():
return 'irstlm_build'
def get_inputs():
return ['input_filename']
def get_outputs():
return ['add_start_end_filename', 'lm_filename', 'compiled_lm_filename']
def get_configuration():
return ['irstlm_installation_dir', 'irstlm_smoothing_method', 'language_model_directory']
def configure(args):
config = dict()
config['irstlm_install_directory'] = args['irstlm_installation_dir']
config['smoothing_method'] = args['irstlm_smoothing_method']
config['lm_directory'] = args['language_model_directory']
return config
return config
def initialise(config):
def process(a, s):
# Create the LM directory if we need to
-        if os.path.exists(s['lm_directory']) is False:
-            os.makedirs(s['lm_directory'])
+        if os.path.exists(config['lm_directory']) is False:
+            os.makedirs(config['lm_directory'])
# The filename of the file to chew through
start_end_input_filename = a['input_filename']
@@ -26,18 +37,18 @@ def initialise(config):
# Derive the output file name for the add start-end marker processor
filename_bits = os.path.basename(start_end_input_filename).split(".")
filename_bits[2] = "sb";
-        start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+        start_end_output_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
# Derive the output file name of the LM build
filename_bits[2] = "lm"
-        lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+        lm_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
# Derive the compiled LM file name
filename_bits[2] = "arpa"
-        compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+        compiled_lm_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
# First thing to do is add start and end markers
-        start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")]
+        start_end_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "add-start-end.sh")]
infile = open(start_end_input_filename, 'r')
outfile = open(start_end_output_filename, 'w')
print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline)
@@ -49,11 +60,11 @@ def initialise(config):
# Next build the language model
tmp_dir = tempfile.mkdtemp(dir = "/tmp")
try:
-            build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"),
+            build_lm_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "build-lm.sh"),
"-i", start_end_output_filename,
"-t", tmp_dir,
"-p",
"-s", s['smoothing_method'],
"-s", config['smoothing_method'],
"-o", lm_filename]
print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline)
return_code = subprocess.check_call(build_lm_cmdline)
@@ -65,7 +76,7 @@ def initialise(config):
# Compile the LM
lm_filename = lm_filename + ".gz"
-        compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"),
+        compile_lm_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "compile-lm"),
"--text", "yes",
lm_filename,
compiled_lm_filename]
@@ -86,7 +97,7 @@ def initialise(config):
if __name__ == '__main__':
-    from pypeline.helpers.helpers import eval_pipeline
+    from pypeline.helpers.helpers import eval_pipeline, cons_function_component
lm_dir = os.environ["PWD"]
configuration = {'irstlm_root': os.environ["IRSTLM"],
@@ -95,7 +106,7 @@ if __name__ == '__main__':
component_config = configure(configuration)
component = initialise(component_config)
-    value = eval_pipeline(component,
+    value = eval_pipeline(cons_function_component(component),
{'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
component_config)
target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
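
The substance of these hunks is twofold: the component now reads its settings from the config dictionary closed over by initialise() instead of from the pipeline state s, and the self-test wraps the function returned by initialise() with cons_function_component before evaluating it, suggesting that initialise() now returns a plain (value, state) function rather than a ready-made pypeline component. A sketch of the new self-test pattern, using only the helpers imported in this hunk (the input filename is illustrative):

component_config = configure(configuration)    # plain dictionary of settings
process = initialise(component_config)         # (value, state) -> dictionary
value = eval_pipeline(cons_function_component(process),
                      {'input_filename': 'corpus.tok.en'},
                      component_config)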


@@ -1,25 +1,38 @@
#!/usr/bin/env python
import os
import shutil
import subprocess
import os, shutil, subprocess
def get_name():
return 'mert'
from pypeline.helpers.helpers import cons_function_component
def get_inputs():
return ['evaluation_data_filename', 'trg_language_model_filename',
'trg_language_model_order', 'trg_language_model_type',
'moses_ini_filename']
def get_outputs():
return ['moses_ini_filename']
def get_configuration():
return ['source_language', 'target_language',
'moses_installation_dir', 'mert_working_directory']
def configure(args):
result = {}
-    result['src_lang'] = args['src_lang']
-    result['trg_lang'] = args['trg_lang']
+    result['src_lang'] = args['source_language']
+    result['trg_lang'] = args['target_language']
result['moses_installation_dir'] = args['moses_installation_dir']
result['mert_working_dir'] = args['mert_working_directory']
return result
def initialise(config):
def process(a, s):
-        infilename = os.path.abspath(a['development_data_filename'])
+        infilename = os.path.abspath(a['evaluation_data_filename'])
infilename = ".".join(infilename.split(".")[:-1])
lm_file = os.path.abspath(a['trg_language_model_filename'])
lm_order = int(a['trg_language_model_order'])
lm_type = int(a['trg_language_model_type'])
-        orig_moses_ini = os.path.abspath(a['moses_ini_file'])
+        orig_moses_ini = os.path.abspath(a['moses_ini_filename'])
if not os.path.exists(orig_moses_ini):
raise Exception, "Error: Input moses.ini does not exist"
@@ -57,12 +70,12 @@ def initialise(config):
if not os.path.exists(new_mosesini):
raise Exception, 'Failed MERT'
-        return {'moses_ini_file':new_mosesini}
+        return {'moses_ini_filename' : new_mosesini}
return process
if __name__ == '__main__':
if __name__ == '__main__':
def __test():
configuration = {'src_lang':'en',
'trg_lang':'lt',
@@ -80,4 +93,3 @@ if __name__ == '__main__':
#do some test
__test()
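
Alongside the new get_* interface functions, this hunk renames the component's externally visible names: configuration is now keyed by source_language, target_language, moses_installation_dir and mert_working_directory, and the value dictionary uses evaluation_data_filename and moses_ini_filename in place of development_data_filename and moses_ini_file. A small sketch of the renamed configure() mapping, with values taken from the training pipeline's test configuration elsewhere in this commit (illustrative only):

mert_config = configure({'source_language': 'en',
                         'target_language': 'lt',
                         'moses_installation_dir': '/opt/moses',
                         'mert_working_directory': 'test_data/mert'})
# -> {'src_lang': 'en', 'trg_lang': 'lt',
#     'moses_installation_dir': '/opt/moses',
#     'mert_working_dir': 'test_data/mert'}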


@@ -0,0 +1,103 @@
import os
import shutil
import subprocess
def get_name():
return 'model_training'
def get_inputs():
return ['src_filename', 'trg_filename']
def get_outputs():
return ['moses_ini_filename']
def get_configuration():
return ['source_language', 'target_language',
'moses_installation_dir', 'giza_installation_dir',
'translation_model_directory', 'alignment_method',
'reordering_method']
# Alignment = grow-diag-final-and
# Reordering = msd-bidirectional-fe
def configure(args):
result = {}
result['src_lang'] = args['source_language']
result['trg_lang'] = args['target_language']
result['moses_installation_dir'] = args['moses_installation_dir']
result['external_bin_dir'] = args['giza_installation_dir']
result['model_directory'] = args['translation_model_directory']
result['alignment'] = args['alignment_method']
result['reordering'] = args['reordering_method']
return result
def initialise(config):
def process(a, s):
get_corpora_name_fn = lambda fn: ".".join(os.path.basename(fn).split('.')[:-1])
src_filename = os.path.abspath(a['src_filename'])
trg_filename = os.path.abspath(a['trg_filename'])
src_corpora_name = get_corpora_name_fn(src_filename)
trg_corpora_name = get_corpora_name_fn(trg_filename)
if src_corpora_name != trg_corpora_name:
raise Exception, "Mismatch of source [%s] and target [%s] filename" % (src_filename, trg_filename)
infilename = os.path.abspath(os.path.join(os.path.dirname(src_filename), src_corpora_name))
workdir = os.path.abspath(config['model_directory'])
#simply call the training perl script
#remove the workdir if it is already there
if os.path.exists(workdir):
shutil.rmtree(workdir)
os.makedirs(workdir)
#local vars
train_model_perl = os.path.abspath(os.path.join(config['moses_installation_dir'],
'scripts',
'training',
'train-model.perl'))
src_lang = config['src_lang'].lower()
trg_lang = config['trg_lang'].lower()
external_bin = os.path.abspath(config['external_bin_dir'])
#create a dummy lm file
dummy_lmfile = os.path.join(workdir, 'dummy.lm')
f = open(dummy_lmfile, 'w')
print >> f, "dummy lm file"
f.close()
logfile = os.path.join(workdir, 'log')
#the command
alignment_method = config['alignment']
reordering_method = config['reordering']
cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s ' \
'-f %(src_lang)s -e %(trg_lang)s -alignment %(alignment_method)s ' \
'-reordering %(reordering_method)s -lm 0:5:%(dummy_lmfile)s:0 ' \
'-external-bin-dir %(external_bin)s 2> %(logfile)s'
cmd = cmd % locals()
pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
pipe.wait()
# check the moses ini
mosesini = os.path.join(workdir, 'model', 'moses.ini')
if not os.path.exists(mosesini):
raise Exception, 'Failed training model'
return {'moses_ini_filename' : mosesini}
return process
if __name__ == '__main__':
def __test():
configuration = {'src_lang' : 'en',
'trg_lang' : 'lt',
'moses_installation_dir' : os.environ['MOSES_HOME'],
'giza_installation_dir' : os.environ['GIZA_HOME'],
'translation_model_directory' : 'model-dir'}
values = {'training_data_filename' : '/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
from pypeline.helpers.helpers import run_pipeline
box_config = configure(configuration)
box = initialise(box_config)
print run_pipeline(box, values, None)
#do some test
__test()
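
For reference, the command template above expands to a single train-model.perl invocation; with the settings used in the accompanying test configuration it would look roughly as follows (the corpus and working-directory paths are hypothetical):

# /opt/moses/scripts/training/train-model.perl -root-dir <workdir> \
#     -corpus <corpus-prefix> -f en -e lt \
#     -alignment grow-diag-final-and -reordering msd-bidirectional-fe \
#     -lm 0:5:<workdir>/dummy.lm:0 \
#     -external-bin-dir /opt/moses/giza++-v1.0.7 2> <workdir>/log

The dummy.lm written above exists only so that train-model.perl will run; the actual target language model is produced by the irstlm_build wrapper and wired in further down the pipeline.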

File diff suppressed because it is too large


@@ -0,0 +1,7 @@
[Configuration]
language = en
tokenisation_dir = tokenised
moses_installation_dir = /opt/moses
[Inputs]
filename = test_data/test.en


@@ -0,0 +1,58 @@
import sys, os, subprocess
class BatchTokenizer(object):
def __init__(self, language, working_dir, moses_installation_dir):
        # Ensure the Perl tokenizer exists
self.__tokeniser = os.path.join(moses_installation_dir,
'scripts',
'tokenizer',
'tokenizer.perl')
if not os.path.exists(self.__tokeniser):
raise Exception("Perl tokenizer does not exist at [%s]" % self.__tokeniser)
self.__working_dir = working_dir
if not os.path.exists(self.__working_dir):
os.makedirs(self.__working_dir)
self.__language = language
def tokenise(self, filename):
basefilename = os.path.basename(filename)
bits = basefilename.split(".")
bits.insert(-1, "tok")
basefilename = ".".join(bits)
outfilename = os.path.join(self.__working_dir, basefilename)
cmd = '%s -q -l %s < %s > %s' % (self.__tokeniser, self.__language, filename, outfilename)
pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
pipe.wait()
return outfilename
def get_inputs():
return ['filename']
def get_outputs():
return ['tokenised_filename']
def get_configuration():
return ['language',
'tokenisation_dir',
'moses_installation_dir']
def configure(args):
return {'language' : args['language'],
'tokenisation_dir' : args['tokenisation_dir'],
'moses_installation_dir' : args['moses_installation_dir']}
def initialise(config):
tokenizer = BatchTokenizer(config['language'],
config['tokenisation_dir'],
config['moses_installation_dir'])
def process(a, s):
tokenised_filename = tokenizer.tokenise(a['filename'])
return {'tokenised_filename' : tokenised_filename}
return process
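
A usage sketch for this wrapper, with the values from the neighbouring tokenizer configuration file (illustrative; calling process() actually invokes tokenizer.perl from the configured Moses installation):

config = configure({'language': 'en',
                    'tokenisation_dir': 'tokenised',
                    'moses_installation_dir': '/opt/moses'})
process = initialise(config)
process({'filename': 'test_data/test.en'}, None)
# -> {'tokenised_filename': 'tokenised/test.tok.en'}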

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,20 @@
[Configuration]
source_language = en
target_language = lt
max_segment_length = 20
corpus_development_size = 8000
corpus_evaluation_size = 1000
alignment_method = grow-diag-final-and
reordering_method = msd-bidirectional-fe
smoothing_method = improved-kneser-ney
tokenisation_directory = test_data/tokenisation
translation_model_directory = test_data/model
language_model_directory = test_data/lm
mert_directory = test_data/mert
moses_installation_directory = /opt/moses
giza_installation_directory = /opt/moses/giza++-v1.0.7
irstlm_installation_directory = /opt/moses/irstlm-5.70.04
[Inputs]
src_filename = test_data/cleantrain.en
trg_filename = test_data/cleantrain.lt


@@ -0,0 +1,115 @@
#
# Import all of the components to be composed
#
import components.src_trg_tokeniser as tokeniser
import components.translation_model_training as model_training
import components.wrappers.irstlm_build.irstlm_build as lang_model
import components.wrappers.mert.mert as mert
#
# Component definition
#
# Config: {model_training.max_segment_length,
# model_training.corpus.[development_size|evaluation_size],
# model_training.[src|trg].language,
# model_training.method.[alignment|reordering], {moses_ini_filename,
# model_training.giza.installation, evaluation_data_filename}
# {src_filename, {tokenised_src_filename, model_training.translation_model.dir} |
# trg_filename} tokenised_trg_filename} +-----------------------------------------+ +-------+ | {moses_ini_filename}
# | +-------+ +-------+ +-------+ | +-------+ | tokenised_src_filename -> src_filename, | | Model | V +-------+ |
# V | +--->+ Src/ +--->+ | V | +-->+ tokenised_trg_filename -> trg_filename +-->+ Train +------>+ | +------+ V
# --->+ Split | | Trg | | Merge +--->+ Split | +-----------------------------------------+ +-------+ | Merge +----->+ MERT +--->
# | +--->+ Token +--->+ | | +--\ +------------------------------------------+ +--------+ | | ^ +------+
# +-------+ +-------+ +-------+ +-------+ \->+ tokenised_trg_filename -> input_filename +-->+ IRSTLM +-->+ | |
# Config: {tokeniser.[src|trg].language, +------------------------------------------+ +--------+ ^ +-------+ |
# tokeniser.[src|trg].tokeniser_dir Config: {irstlm_installation_dir::String, | |
# tokeniser.moses.installation} irstlm_smoothing_method::String, | |
# language_model_directory} | |
# | |
# {lm_filename, compiled_lm_filename, add_start_end_filename} |
# |
# {moses_ini_file, evaluation_data_filename, trg_language_model_filename,
# trg_language_model_order, trg_language_model_type}
#
component training_pipeline
inputs src_filename, trg_filename
output moses_ini_filename
configuration source_language,
target_language,
max_segment_length,
corpus_development_size,
corpus_evaluation_size,
alignment_method,
reordering_method,
smoothing_method,
tokenisation_directory,
translation_model_directory,
language_model_directory,
mert_directory,
moses_installation_directory,
giza_installation_directory,
irstlm_installation_directory
declare
tokeniser := new tokeniser with
source_language -> tokeniser.src.language,
target_language -> tokeniser.trg.language,
tokenisation_directory -> tokeniser.src.tokenisation_dir,
tokenisation_directory -> tokeniser.trg.tokenisation_dir,
moses_installation_directory -> tokeniser.moses.installation
model_training := new model_training with
max_segment_length -> model_training.max_segment_length,
corpus_development_size -> model_training.corpus.development_size,
corpus_evaluation_size -> model_training.corpus.evaluation_size,
translation_model_directory -> model_training.translation_model.dir,
alignment_method -> model_training.method.alignment,
reordering_method -> model_training.method.reordering,
source_language -> model_training.src.language,
moses_installation_directory -> model_training.moses.installation,
giza_installation_directory -> model_training.giza.installation,
target_language -> model_training.trg.language
irstlm := new lang_model with
irstlm_installation_directory -> irstlm_installation_dir,
smoothing_method -> irstlm_smoothing_method,
language_model_directory -> language_model_directory
mert := new mert with
source_language -> source_language,
target_language -> target_language,
moses_installation_directory -> moses_installation_dir,
mert_directory -> mert_working_directory
as
# Split and transform the input to the tokeniser component
# Inputs: src_filename, trg_filename
# Outputs: (tokenised_src_filename), (tokenised_trg_filename)
(wire src_filename -> src_filename,
trg_filename -> _ &&&
wire trg_filename -> trg_filename,
src_filename -> _) >>>
tokeniser >>>
# Merge output from tokeniser
# Inputs: (tokenised_src_filename), (tokenised_trg_filename)
# Outputs: tokenised_src_filename, tokenised_trg_filename
merge top[tokenised_src_filename] -> tokenised_src_filename,
bottom[tokenised_trg_filename] -> tokenised_trg_filename >>>
# Train the translation table and target language model
# Inputs: tokenised_src_filename, tokenised_trg_filename
# Outputs: (moses_ini_filename), ('add_start_end_filename', 'lm_filename', 'compiled_lm_filename')
((wire tokenised_src_filename -> src_filename,
tokenised_trg_filename -> trg_filename >>> model_training) &&&
(wire tokenised_trg_filename -> input_filename,
tokenised_src_filename -> _ >>> irstlm)) >>>
# Merge the output from the TT and LM training component
# Inputs: (moses_ini_filename, evaluation_data_filename),
# (compiled_lm_filename, add_start_end_filename, lm_filename)
# Outputs: moses_ini_filename, evaluation_data_filename,
# trg_language_model_filename, trg_language_model_order, trg_language_model_type
merge top[moses_ini_filename] -> moses_ini_filename,
top[evaluation_data_filename] -> evaluation_data_filename,
bottom[compiled_lm_filename] -> trg_language_model_filename,
bottom[add_start_end_filename] -> _,
bottom[lm_filename] -> _,
3 -> trg_language_model_order,
9 -> trg_language_model_type >>>
mert
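
The final merge fixes trg_language_model_order and trg_language_model_type to the literals 3 and 9, the same values that the removed hand-wired pipeline manager hard-codes when wiring up its MERT stage. Roughly, the record handed to the mert component therefore looks like this (the filenames are illustrative placeholders):

{'moses_ini_filename': '<translation model directory>/model/moses.ini',
 'evaluation_data_filename': '<tokenised, cleaned, split evaluation source file>',
 'trg_language_model_filename': '<compiled IRSTLM language model>',
 'trg_language_model_order': 3,
 'trg_language_model_type': 9}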

@@ -1 +0,0 @@
Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb


@@ -1,192 +0,0 @@
import logging
import os
from concurrent.futures import Future, ThreadPoolExecutor
from functools import partial
from pypeline.helpers.parallel_helpers import eval_pipeline, \
cons_function_component, \
cons_wire, \
cons_split_wire, \
cons_unsplit_wire, \
cons_dictionary_wire
#
# Some logging please
#
FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s'
logging.basicConfig(format = FORMAT, level = logging.DEBUG)
logger = logging.getLogger("manager")
# Build the pipeline components
def build_components(components, configuration, executor):
pipeline_components = dict()
pipeline_configuration = dict()
for component_id, module_name in components.items():
logger.info("Loading [%s] component from [%s]..." % (component_id, module_name))
module = __import__(module_name, fromlist = ['configure', 'initialise'])
# Component builds its own configuration object
config_func = getattr(module, 'configure')
component_config = config_func(configuration)
pipeline_configuration.update(component_config)
# Now build the component
init_func = getattr(module, 'initialise')
component_function = init_func(component_config)
# A wrapper for the component's function that submits to the executor
def get_component_function_wrapper(inner_function, comp_id, mod_name):
def component_function_wrapper(a, s):
logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \
(comp_id, mod_name, a, s))
return inner_function(a, s)
return component_function_wrapper
# Arrowize the component
component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name))
# And store
pipeline_components[component_id] = component
return pipeline_components, pipeline_configuration
# Go!
def main(src_lang, trg_lang, src_filename, trg_filename):
# Global configuration
# One day, this configuration shall be constructed from
# command line options, or a properties file.
configuration = {
'moses_installation_dir': os.environ['MOSES_HOME'],
'irstlm_installation_dir': os.environ['IRSTLM'],
'giza_installation_dir': os.environ['GIZA_HOME'],
'src_lang': src_lang,
'src_tokenisation_dir': './tokenisation',
'trg_lang': trg_lang,
'trg_tokenisation_dir': './tokenisation',
'segment_length_limit': 60,
'irstlm_smoothing_method': 'improved-kneser-ney',
'language_model_directory': './language-model',
'translation_model_directory': './translation-model',
'mert_working_directory': './mert',
'evaluation_data_size': 100,
'development_data_size': 100
}
# The modules to load
    # In the future, the components shall be specified in some kind of
# pipeline description file.
component_modules = {
'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
'cleanup': 'training.components.cleanup.cleanup',
'data_split': 'training.components.data_split.data_split',
'irstlm_build': 'training.components.irstlm_build.irstlm_build',
'model_training': 'training.components.model_training.model_training',
'mert': 'training.components.mert.mert'
}
# The thread pool
executor = ThreadPoolExecutor(max_workers = 3)
# Phew, build the required components
components, component_config = build_components(component_modules, configuration, executor)
#
# Wire up components
# Description of wiring should be, in the future, alongside the component
    # specification in some kind of configuration file. Components shall be
# declared then used, i.e., bind a component instance to a unique component
# identifier, then wire component instances together by identifier.
#
#
# Tokenisation of source and target...
#
# IRSTLM Build components
irstlm_build_component = cons_split_wire() >> \
(cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \
components['irstlm_build']).second() >> \
cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'],
'trg_language_model_filename': b['compiled_lm_filename']})
# The complete tokenisation component
tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \
irstlm_build_component.second() >> \
cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'],
'trg_filename': b['tokenised_trg_filename'],
'trg_language_model_filename': b['trg_language_model_filename']})
#
# Cleanup and Data Spliting...
#
#
# A function that clips off the last '.' delimited string
#
def clip_last_bit(filename):
bn = os.path.basename(filename)
directory = os.path.dirname(filename)
bits = bn.split(".")
bits.pop()
return os.path.join(directory, ".".join(bits))
cleanup_datasplit_component = components['cleanup'] >> \
cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'],
'trg_filename': a['cleaned_trg_filename']}) >> \
components['data_split'] >> \
cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']),
'eval_src_filename': a['eval_src_filename'],
'eval_trg_filename': a['eval_trg_filename']})
#
# Translation model training
#
translation_model_component = cons_split_wire() >> \
components['model_training'].first() >> \
cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
'development_data_filename': b['eval_src_filename']})
#
# The whole pipeline
#
pipeline = tokenisation_component >> \
cons_split_wire() >> \
(cleanup_datasplit_component >> translation_model_component).first() >> \
cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
'development_data_filename': clip_last_bit(t['development_data_filename']),
'trg_language_model_filename': b['trg_language_model_filename'],
'trg_language_model_order': 3,
'trg_language_model_type': 9}) >> \
components['mert']
#
# The input to the pipeline
#
value = {'src_filename': src_filename,
'trg_filename': trg_filename}
#
# Evaluate the pipeline
#
logger.info("Evaluating pipeline with input [%s]..." % value)
new_value = eval_pipeline(executor, pipeline, value, component_config)
#
# Wait for all components to finish
#
executor.shutdown(True)
logger.info("Pipeline evaluated to %s" % new_value)
if __name__ == '__main__':
import sys
main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])

@@ -0,0 +1 @@
Subproject commit c5351ff941aeb9bb37a51f64228fad6f71ce4cab


@@ -1,11 +0,0 @@
import subprocess
def cat(filename, content):
fh = open(filename, "w")
for line in content:
#print(line, file=fh)
print >> fh, line
fh.close()
def diff(filename1, filename2):
subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT)


@@ -1,125 +0,0 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
result = {}
result['segment_length'] = args['segment_length_limit']
return result
def initialise(config):
def _filter(limit, ifh1, ofh1, ifh2, ofh2):
def _short(line):
n = 0
for c in line:
if c == " ":
n += 1
#print(line, ":", n)
return n < limit
for (l1, l2) in zip(ifh1, ifh2):
if _short(l1) and _short(l2):
print >>ofh1, l1,
print >>ofh2, l2,
def _make_cleaned_filename(filename):
bits = filename.split(".")
bits[-1] = "clean"
return ".".join(bits)
def _filter_main(value, config):
limit = config['segment_length']
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
input_src_filename = value['src_filename']
input_trg_filename = value['trg_filename']
print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)
ifh1 = open(input_src_filename, "r")
ifh2 = open(input_trg_filename, "r")
cleaned_src_filename = _make_cleaned_filename(input_src_filename)
cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
ofh1 = open(cleaned_src_filename, "w")
ofh2 = open(cleaned_trg_filename, "w")
_filter(limit, ifh1, ofh1, ifh2, ofh2)
return {'cleaned_src_filename': cleaned_src_filename,
'cleaned_trg_filename': cleaned_trg_filename}
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return _filter_main
if __name__ == '__main__':
import os
import tempfile
import test.test as thelp
from pypeline.helpers.helpers import eval_pipeline
def _test_main():
configuration = {'segment_length_limit': 20}
src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
box_eval = {
'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'cleaned_src_file_expected': src_filename[1] + ".expected",
'cleaned_trg_file_expected': trg_filename[1] + ".expected"
}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
box_config = configure(configuration)
box = initialise(box_config)
output = eval_pipeline(box, box_eval, box_config)
try:
thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
finally:
os.unlink(output['cleaned_src_filename'])
os.unlink(output['cleaned_trg_filename'])
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
#expected output:
thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()


@@ -1,109 +0,0 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
result = {}
result['segment_length'] = args['segment_length_limit']
return result
def initialise(config):
def _filter(limit, ifh1, ofh1, ifh2, ofh2):
def _short(line):
n = 0
for c in line:
if c == " ":
n += 1
#print(line, ":", n)
return n < limit
for (l1, l2) in zip(ifh1, ifh2):
if _short(l1) and _short(l2):
print(l1, end='', file=ofh1)
print(l2, end='', file=ofh2)
def _filter_main(config, value):
limit = config['segment_length']
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
ifh1 = open(value['src_filename'], "r")
ifh2 = open(value['trg_filename'], "r")
ofh1 = open(value['cleaned_src_filename'], "w")
ofh2 = open(value['cleaned_trg_filename'], "w")
_filter(limit, ifh1, ofh1, ifh2, ofh2)
return {'cleaned_src_filename': value['cleaned_src_filename'],
'cleaned_trg_filename': value['cleaned_trg_filename']}
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return cons_function_component(_filter_main)
if __name__ == '__main__':
import os
import tempfile
import training.components.shared.test as thelp
def _test_main():
configuration = {'segment_length_limit': 20}
src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp")
box_eval = {
'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'cleaned_src_filename': src_filename[1] + ".clean",
'cleaned_trg_filename': trg_filename[1] + ".clean",
'cleaned_src_file_expected': src_filename[1] + ".expected",
'cleaned_trg_file_expected': trg_filename[1] + ".expected"
}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
from pypeline.helpers.helpers import run_pipeline
box_config = configure(configuration)
box = initialise(box_config)
run_pipeline(box, box_config, box_eval)
thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename'])
thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename'])
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
#expected output:
thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()


@@ -1,146 +0,0 @@
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['evaluate_size'] = args['evaluation_data_size']
    result['development_size'] = args['development_data_size']
    return result


def initialise(config):
    def _copy(size, inp, ofh1, ofh2):
        try:
            while size != 0:
                (l1, l2) = inp.next()
                print >>ofh1, l1,
                print >>ofh2, l2,
                size -= 1
        except StopIteration:
            pass

    def _make_split_filename(filename, data_set):
        bits = filename.split(".")
        last = bits.pop()
        lang_code = bits.pop()
        bits.append(last)
        bits.append(data_set)
        bits.append(lang_code)
        new_filename = ".".join(bits)
        return new_filename

    def _splitter_main(value, config):
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            input_src_filename = value['src_filename']
            input_trg_filename = value['trg_filename']

            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")
            inp = iter(zip(ifh1, ifh2))

            result = {}
            for (data_set, size) in [
                ('devel', config['development_size']),
                ('eval', config['evaluate_size']),
                ('train', -1)
            ]:
                output_src_filename = _make_split_filename(input_src_filename, data_set)
                output_trg_filename = _make_split_filename(input_trg_filename, data_set)
                ofh1 = open(output_src_filename, "w")
                ofh2 = open(output_trg_filename, "w")

                _copy(size, inp, ofh1, ofh2)

                result[data_set + '_src_filename'] = output_src_filename
                result[data_set + '_trg_filename'] = output_trg_filename

            return result
        finally:
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return _splitter_main


if __name__ == '__main__':
    import os
    import tempfile
    import test.test as thelp
    from pypeline.helpers.helpers import eval_pipeline

    def _test_main():
        configuration = {
            'evaluation_data_size': 7,
            'development_data_size': 13,
        }

        src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'devel_src_expected': src_filename[1] + ".devel.expected",
            'devel_trg_expected': trg_filename[1] + ".devel.expected",
            'eval_src_expected': src_filename[1] + ".eval.expected",
            'eval_trg_expected': trg_filename[1] + ".eval.expected",
            'train_src_expected': src_filename[1] + ".train.expected",
            'train_trg_expected': trg_filename[1] + ".train.expected",
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        for data_set in ['devel', 'eval', 'train']:
            for lang in ['src', 'trg']:
                filename = output[data_set + '_' + lang + '_filename']
                filename_expected = box_eval[data_set + '_' + lang + '_expected']
                thelp.diff(filename_expected, filename)

    def _line(line_lengths):
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line(range(50)))
        thelp.cat(box_eval['trg_filename'], _line(range(50)))
        # expected output:
        thelp.cat(box_eval['devel_src_expected'], _line(range(0, 13)))
        thelp.cat(box_eval['devel_trg_expected'], _line(range(0, 13)))
        thelp.cat(box_eval['eval_src_expected'], _line(range(13, 20)))
        thelp.cat(box_eval['eval_trg_expected'], _line(range(13, 20)))
        thelp.cat(box_eval['train_src_expected'], _line(range(20, 50)))
        thelp.cat(box_eval['train_trg_expected'], _line(range(20, 50)))

    def _cleanup_files(box_eval):
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass

    _test_main()

View File

@ -1,72 +0,0 @@
#!/usr/bin/env python

import os, shutil, subprocess

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['trg_lang'] = args['trg_lang']
    result['moses_installation_dir'] = args['moses_installation_dir']
    result['external_bin_dir'] = args['giza_installation_dir']
    result['model_directory'] = args['translation_model_directory']
    return result


def initialise(config):
    def process(a, s):
        infilename = os.path.abspath(a['training_data_filename'])
        workdir = os.path.abspath(config['model_directory'])
        # simply call the training perl script
        # remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        # local vars
        train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl'
        src_lang = config['src_lang'].lower()
        trg_lang = config['trg_lang'].lower()
        external_bin = os.path.abspath(config['external_bin_dir'])

        # create a dummy lm file
        dummy_lmfile = workdir + os.sep + 'dummy.lm'
        f = open(dummy_lmfile, 'w')
        print >> f, "dummy lm file"
        f.close()
        logfile = workdir + os.sep + 'log'

        # the command
        cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s'
        cmd = cmd % locals()

        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()

        # check the moses ini
        mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini'
        if not os.path.exists(mosesini):
            raise Exception, 'Failed training model'

        return {'moses_ini_file': mosesini}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'src_lang': 'en',
                         'trg_lang': 'lt',
                         'moses_installation_dir': os.environ['MOSES_HOME'],
                         'giza_installation_dir': os.environ['GIZA_HOME'],
                         'translation_model_directory': 'model-dir'}
        values = {'training_data_filename': '/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    # do some test
    __test()
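
For clarity, the cmd % locals() interpolation above assembles a single train-model.perl invocation. A minimal standalone sketch, with placeholder paths that are not taken from the component or its test, is:

    # Illustrative only: reproduces the string interpolation performed by process()
    # with made-up paths; it prints the command, it does not run Moses.
    template = ('%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s '
                '-f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and '
                '-reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 '
                '-external-bin-dir %(external_bin)s 2> %(logfile)s')
    print(template % {'train_model_perl': '/opt/moses/scripts/training/train-model.perl',
                      'workdir': '/tmp/model-dir',
                      'infilename': '/tmp/cleantrain',
                      'src_lang': 'en',
                      'trg_lang': 'lt',
                      'dummy_lmfile': '/tmp/model-dir/dummy.lm',
                      'external_bin': '/opt/moses/giza++-v1.0.7',
                      'logfile': '/tmp/model-dir/log'})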

View File

@ -1,43 +0,0 @@
#!/usr/bin/env python

import os

from tokenizer import Tokenizer

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['src_tokenisation_dir'] = args['src_tokenisation_dir']
    result['moses_installation_dir'] = args['moses_installation_dir']
    return result


def initialise(config):
    def process(a, s):
        infilename = a['src_filename']
        outfilename = Tokenizer.batch_tokenise(
            config['src_lang'],
            config['moses_installation_dir'],
            infilename,
            config['src_tokenisation_dir'])
        return {'tokenised_src_filename': outfilename}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'src_lang': 'de',
                         'src_tokenisation_dir': 'tmptok',
                         'moses_installation_dir': os.path.abspath('../../../../')}
        values = {'src_filename': 'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    # do some test
    __test()

View File

@ -1,3 +0,0 @@
asdfweoih
awfwoeijf awefo
what's this

View File

@ -1,36 +0,0 @@
#!/usr/bin/env python

import sys, os, subprocess


class Tokenizer:
    @staticmethod
    def batch_tokenise(lang, mosesdir, infilename, workdir):
        print "Tokenizing [%s] in working directory [%s]..." % (infilename, workdir)
        if not os.path.exists(workdir):
            os.makedirs(workdir)
        tok = Tokenizer(lang, mosesdir)
        basefilename = os.path.basename(infilename)
        outfilename = workdir + os.sep + basefilename + '.tok'
        tok.file_tokenise(infilename, outfilename)
        return outfilename

    def __init__(self, lang, mosesdir):
        self.arrows = None
        self.lang = lang
        # check the perl tokenizer is here
        #path = os.path.dirname(os.path.abspath(__file__))
        path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer'
        self.perltok = path + os.sep + 'tokenizer.perl'
        if not os.path.exists(path):
            raise Exception, "Perl tokenizer does not exist"

    def file_tokenise(self, infilename, outfilename):
        cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename)
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()


if __name__ == '__main__':
    # do some test
    pass
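
A minimal usage sketch of the wrapper above; the Moses installation path and file names are placeholders, not values from the original tests:

    # Hypothetical call: tokenise corpus.en with the tokenizer.perl script found
    # under /opt/moses, writing corpus.en.tok into the tmptok working directory.
    tokenised_path = Tokenizer.batch_tokenise('en', '/opt/moses', 'corpus.en', 'tmptok')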

View File

@ -1,43 +0,0 @@
#!/usr/bin/env python

import os

from tokenizer import Tokenizer

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['trg_lang'] = args['trg_lang']
    result['trg_tokenisation_dir'] = args['trg_tokenisation_dir']
    result['moses_installation_dir'] = args['moses_installation_dir']
    return result


def initialise(config):
    def process(a, s):
        infilename = a['trg_filename']
        outfilename = Tokenizer.batch_tokenise(
            config['trg_lang'],
            config['moses_installation_dir'],
            infilename,
            config['trg_tokenisation_dir'])
        return {'tokenised_trg_filename': outfilename}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'trg_lang': 'de',
                         'trg_tokenisation_dir': 'tmptoktrg',
                         'moses_installation_dir': os.path.abspath('../../../../')}
        values = {'trg_filename': 'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    # do some test
    __test()

View File

@ -43,13 +43,16 @@ int main(int argc, char** argv)
     }
   }
+  bool success = false;
   if(inFilePath.empty()) {
     std::cerr << "processing stdin to " << outFilePath << ".*\n";
-    return LexicalReorderingTableTree::Create(std::cin, outFilePath);
+    success = LexicalReorderingTableTree::Create(std::cin, outFilePath);
   } else {
     std::cerr << "processing " << inFilePath<< " to " << outFilePath << ".*\n";
     InputFileStream file(inFilePath);
-    bool success = LexicalReorderingTableTree::Create(file, outFilePath);
-    return (success ? 0 : 1);
+    success = LexicalReorderingTableTree::Create(file, outFilePath);
   }
+  return (success ? 0 : 1);
 }