From f2536cddffe5dbb141387fea0a27da19e1da21e2 Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Wed, 6 Mar 2013 13:37:41 +0000 Subject: [PATCH] Added arrow based Moses training pipeline demonstration program to contrib. --- .gitmodules | 3 + .../training-pipeline/moses-pypeline.dia | Bin 0 -> 3532 bytes contrib/arrow-pipelines/python/README | 32 +++ contrib/arrow-pipelines/python/libs/pypeline | 1 + contrib/arrow-pipelines/python/manager.py | 192 ++++++++++++++++++ .../arrow-pipelines/python/test/__init__.py | 0 contrib/arrow-pipelines/python/test/test.py | 11 + .../python/training/__init__.py | 0 .../python/training/components/__init__.py | 0 .../training/components/cleanup/__init__.py | 0 .../training/components/cleanup/cleanup.py | 125 ++++++++++++ .../training/components/cleanup/cleanup3.py | 109 ++++++++++ .../components/data_split/__init__.py | 0 .../components/data_split/data_split.py | 146 +++++++++++++ .../components/irstlm_build/__init__.py | 0 .../components/irstlm_build/irstlm_build.py | 106 ++++++++++ .../training/components/mert/__init__.py | 0 .../python/training/components/mert/mert.py | 83 ++++++++ .../components/model_training/__init__.py | 0 .../model_training/model_training.py | 72 +++++++ .../training/components/tokenizer/__init__.py | 0 .../components/tokenizer/src_tokenizer.py | 43 ++++ .../training/components/tokenizer/tmp.de | 3 + .../components/tokenizer/tokenizer.py | 36 ++++ .../components/tokenizer/trg_tokenizer.py | 43 ++++ 25 files changed, 1005 insertions(+) create mode 100644 contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia create mode 100644 contrib/arrow-pipelines/python/README create mode 160000 contrib/arrow-pipelines/python/libs/pypeline create mode 100644 contrib/arrow-pipelines/python/manager.py create mode 100644 contrib/arrow-pipelines/python/test/__init__.py create mode 100644 contrib/arrow-pipelines/python/test/test.py create mode 100644 contrib/arrow-pipelines/python/training/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py create mode 100644 contrib/arrow-pipelines/python/training/components/data_split/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/data_split/data_split.py create mode 100644 contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py create mode 100644 contrib/arrow-pipelines/python/training/components/mert/__init__.py create mode 100755 contrib/arrow-pipelines/python/training/components/mert/mert.py create mode 100644 contrib/arrow-pipelines/python/training/components/model_training/__init__.py create mode 100755 contrib/arrow-pipelines/python/training/components/model_training/model_training.py create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py create mode 100755 contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py create mode 100755 
contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py diff --git a/.gitmodules b/.gitmodules index e69de29bb..d3a8cb4da 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "contrib/arrow-pipelines/python/libs/pypeline"] + path = contrib/arrow-pipelines/python/libs/pypeline + url = git://github.com/ianj-als/pypeline.git diff --git a/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia new file mode 100644 index 0000000000000000000000000000000000000000..1d35a1dea7ad04f671561738f682da9aac844ec5 GIT binary patch literal 3532 zcmV;-4Kwl|iwFP!000021MOW~Z{kQ6e(zs_=!bqW71zy|&ZI}`-5u>pJz8m!d6^gF zKn1*Q?8r8q&aQTU`&QY81hBzSl+$KTq`p9kIxbWG&Udc$)6d@*(cqg%voMKoh8Wqy zfrux`G>m6A!{6_I8F|B>KfU{D8U`QbpII6#2J#bere56)=Xw74@#^a7=?O*8S&%0w ziozwz#MS?TC<_o)cJHcxHKAKyP^>PL}oPrGc_zzr#5lv>b@K=?!H@bQMcmKZpDS!_%TWIGzjxuk@rawi6Abe=IK(LE+?A=k=$bY zmW5BB{UOZrZfy%GsX{&HgRS z!uv?nU40nmXPf@{JkxJ~I|YKjd|Y!kl(RLeh^Z&I;K!?o*)p7p?7%kL9Mdq8rpbRA08v+F_=y_b>Y@>Y;@umpNsE1cWE0Jw#GEw ziGmSsjN2lRx5+Y{h{0X*Rm7oW?DA=AV3T_wxJjQUab71|y+3#e7Gd-(FENO-;ULSO z6;{Z{y!`0jMf6SN;UstsX~$o|&W@}7nmoI4+npkG6M*WTS%O^R6xe_n*Aup>n)cW* zwyh^f62Doz5cS`vh?k2RoxM+M*D0PR<$3j@DNa`%_zbE8WAnPhE|mxIuvf%dp4?QtiqT#Tv`1&!gXn4x>RTq22lb??unOJ5B)#w~ z)2ie2^|a9(6uowbfv&g!UGW!nE0&L-A57~9CS^L@0`<_LdZ4=MfpeqkfwQ%GaBM_; z-?&AX#CmefvbaBx6fmY(>>Z?|_}ejR*XhS3z%gmKFPBi9AT(;4-|twh{ukWQtV zxKoeBtJJ$nPjFp5fnAKKqe1vvX^XWK%K3Hmf+B2_9)Jh{hiTxb-Gi5a8@2)5Xv*Cw zE@}Ijc)65ACh(DP^d*c%4Q+LF;YfL>X^|mg5RIe|$CE<1=aFm95yn_~#Hsw}G^vv$ zj>jUN77ICcdG+3rlkb51#wU;4mr$29cND{or{f?^lc(klMu_Ir<4DA_eFXhv%K0MV zfCc0JXGD4MSgb;K+~BNajO5Mg~|XOUdUHZxQ2=dV&4$I~zchSn)cl1>QP z!vkQXDT0R2QL2TRS%QXX0bD*JMB0^@^#;0>@mL34>V1H&Hzs(P0e1BpyR>VCT%%Q{ zu7wNS!XBb-k?GkN*ean(OveqLF#ssz0JX6-!S~fic+xIL8Tl49$*fdUG3<3|ac!4`o3cwD)&hCx_)afJYFnx2(XsW!c(&SeL8CRYp z+(VNL5)$6dB*PeNvVLQfcAhq)Mk_$&^d&}did>aNg?r=do~U}4W}i{v_+E_x5PP%Y zkd6HdAl63^)6VXuu4OW0C#NqXcFaM(Z?WF6tW3<*eycK|6)ycAXYv~tQ$Qx;$h5gB zAtKjDWYW%4XWXm}$oR?#lQ|je(lDGz7Xwc@@t`NXBGm3`Trt1?}u<5(O6Tv zGrA~tH;#Ag(g%RS2VhzeNYyKciL zNI7os$u8Vh`a?-kLa@{x4zdZha1ru8QDmCW#|H!5wJU4hy{!jgjf+mwaC6V?qdv`h z@ff+mQgZTSnb!dNnbZYD=Fic58*7B-C4t=dFzG5Z@8}a(&oFWIDmxxyANj0Lg=1}_ z?u#T7**J{DVgy*<-Etof>c;x+i`1^p@Tu95doh2JVT5f-oVoUk-+S+n+!3fs9aT(x56PzOkz;&!uCK=+kx)MXM+I>0SYyv&^bW} z({52nyRe3G$xSAZDU~`LG9%YS(FCYS@EEUwQDDjXe3nxCf6K_23cYG5*@qLBNA|StWjd z`|S?gcgEd!l7%or!F|_dVIQUX3d6B{qYA^ZyfUqZal{;I`MpW2DdW-eWFfYW7c7%$ zM4FGElJslQ6a$VQ>>6%1_AOMi^e~-<>Q}Q6vqwIw`O7n(u{3I_D=lC5&yo|kQ3+Vn zFcsjgOSmK5oD>cDIp!J;QmzkoxClf#hNG-fjEb^4Wz#`ve*NGBrb2U2 zf7^MA9n~%!DZe_ST7s#N@(3d)dET|>R#D~C3w~RLly}zEG~{62t>*djki$K%^*9a*$H33d zjw7~)-;m+Ata@Pfc}zoMG~c*14oOw>Q1xc=JFyMP{&eGVkl&^wQ>#J5Xh}p%3$knU zdI+Kn;OoaRFOSPt-mdvNQ^myL$OqG|mB6mq436Rq64g-tn%Ju00M`f`w#-*Pc}380 z03spKaG*24L1m9P5Dn=%8p4Ra*-a=nUg+x-Ewr49^j3zq*+Spig*9*9)&m_V^abK- zfVko_+TGpJR_(e8%8b%hO4qcNIx1nuG-*A3JwIZ?4m8b&gq;gHX}&7B2a0-rirSE; z&9uu>l&1l^Wtp@=B_YKSrw~)*X;W4v>^NQIY1c2YUavr!_T{x#(4u|xZL(pQb^!*# zMp>VnzN|K*1kv)@>dE2o$yX7FnV62%k#dcPzOP5AvNAaM{Vn*!<_?BDnIE~1Y`+W5 zIZu}=ZoEeouINeCrHUJl8SgBsu|ZXf{#4MO3iwkO?WvB-CVrbLH>O>}F!GJU%}AxA zluomegKXMCtoQU4wR4;Km~ef+7Q}kNiS}nEy(1TKR>A@J(Yo`aov--BaK8g7#`TpL zhjQjzV@_5L54dkTraKvh8^;IQHa}TLm{&**Vi~ybX|Wz zp$_`4*ROETxPy6N5{sbwI)G6(V02tHgXv=V2785T*D{O@pPwk_mxRZrD&p^B=X&HR z2qHzq|I`Gl)^^I);_7{25bD2}XBjs<%f~;C|mIjMY@BR-M%FsNC G2>}2H;Sd7= literal 0 HcmV?d00001 diff --git a/contrib/arrow-pipelines/python/README b/contrib/arrow-pipelines/python/README new file mode 100644 index 
000000000..e1e12975c --- /dev/null +++ b/contrib/arrow-pipelines/python/README @@ -0,0 +1,32 @@ +Arrow Based Moses Training Pipeline +=================================== + +To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command: + +$ git submodule init + +This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline: + +$ cd libs/pypeline +$ python setup.py install + +Alternatively, you can set an appropriate PYTHONPATH enviornment variable to the Pypeline library. + +This demonstration implements a training pipeline that is shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia. + +Three environment variables need to be set before the manager.py script can be run, they are: + + - MOSES_HOME : The directory where Moses has been cloned, or installed, + - IRSTLM : The installation directory of your IRSTLM, and + - GIZA_HOME : The installation directory of GIZA++. + +The manager.py script takes four positional command-line arguments: + + - The source language code, + - The target language code, + - The source corpus file. This file *must* be cleaned prior to use, and + - The target corpus file. This file *must* be cleaned prior to use. + +For example, run the manager.py script with: + +$ python manager.py en lt cleantrain.en cleantrain.lt diff --git a/contrib/arrow-pipelines/python/libs/pypeline b/contrib/arrow-pipelines/python/libs/pypeline new file mode 160000 index 000000000..a7084b686 --- /dev/null +++ b/contrib/arrow-pipelines/python/libs/pypeline @@ -0,0 +1 @@ +Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb diff --git a/contrib/arrow-pipelines/python/manager.py b/contrib/arrow-pipelines/python/manager.py new file mode 100644 index 000000000..1c3ece111 --- /dev/null +++ b/contrib/arrow-pipelines/python/manager.py @@ -0,0 +1,192 @@ +import logging +import os + +from concurrent.futures import Future, ThreadPoolExecutor +from functools import partial +from pypeline.helpers.parallel_helpers import eval_pipeline, \ + cons_function_component, \ + cons_wire, \ + cons_split_wire, \ + cons_unsplit_wire, \ + cons_dictionary_wire + + +# +# Some logging please +# +FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s' +logging.basicConfig(format = FORMAT, level = logging.DEBUG) +logger = logging.getLogger("manager") + + +# Build the pipeline components +def build_components(components, configuration, executor): + pipeline_components = dict() + pipeline_configuration = dict() + + for component_id, module_name in components.items(): + logger.info("Loading [%s] component from [%s]..." % (component_id, module_name)) + + module = __import__(module_name, fromlist = ['configure', 'initialise']) + + # Component builds its own configuration object + config_func = getattr(module, 'configure') + component_config = config_func(configuration) + pipeline_configuration.update(component_config) + + # Now build the component + init_func = getattr(module, 'initialise') + component_function = init_func(component_config) + + # A wrapper for the component's function that submits to the executor + def get_component_function_wrapper(inner_function, comp_id, mod_name): + def component_function_wrapper(a, s): + logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." 
% \ + (comp_id, mod_name, a, s)) + return inner_function(a, s) + + return component_function_wrapper + + # Arrowize the component + component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name)) + + # And store + pipeline_components[component_id] = component + + return pipeline_components, pipeline_configuration + + +# Go! +def main(src_lang, trg_lang, src_filename, trg_filename): + # Global configuration + # One day, this configuration shall be constructed from + # command line options, or a properties file. + configuration = { + 'moses_installation_dir': os.environ['MOSES_HOME'], + 'irstlm_installation_dir': os.environ['IRSTLM'], + 'giza_installation_dir': os.environ['GIZA_HOME'], + 'src_lang': src_lang, + 'src_tokenisation_dir': './tokenisation', + 'trg_lang': trg_lang, + 'trg_tokenisation_dir': './tokenisation', + 'segment_length_limit': 60, + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': './language-model', + 'translation_model_directory': './translation-model', + 'mert_working_directory': './mert', + 'evaluation_data_size': 100, + 'development_data_size': 100 + } + + # The modules to load + # In the future, the components shall be specified in some kind + # pipeline description file. + component_modules = { + 'src_tokenizer': 'training.components.tokenizer.src_tokenizer', + 'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer', + 'cleanup': 'training.components.cleanup.cleanup', + 'data_split': 'training.components.data_split.data_split', + 'irstlm_build': 'training.components.irstlm_build.irstlm_build', + 'model_training': 'training.components.model_training.model_training', + 'mert': 'training.components.mert.mert' + } + + # The thread pool + executor = ThreadPoolExecutor(max_workers = 3) + + # Phew, build the required components + components, component_config = build_components(component_modules, configuration, executor) + + # + # Wire up components + # Description of wiring should be, in the future, alongside the component + # specification in some kind of confuguration file. Components shall be + # declared then used, i.e., bind a component instance to a unique component + # identifier, then wire component instances together by identifier. + # + + # + # Tokenisation of source and target... + # + # IRSTLM Build components + irstlm_build_component = cons_split_wire() >> \ + (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \ + components['irstlm_build']).second() >> \ + cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'], + 'trg_language_model_filename': b['compiled_lm_filename']}) + + # The complete tokenisation component + tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \ + irstlm_build_component.second() >> \ + cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'], + 'trg_filename': b['tokenised_trg_filename'], + 'trg_language_model_filename': b['trg_language_model_filename']}) + + # + # Cleanup and Data Spliting... + # + + # + # A function that clips off the last '.' 
delimited string + # + def clip_last_bit(filename): + bn = os.path.basename(filename) + directory = os.path.dirname(filename) + bits = bn.split(".") + bits.pop() + return os.path.join(directory, ".".join(bits)) + + cleanup_datasplit_component = components['cleanup'] >> \ + cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'], + 'trg_filename': a['cleaned_trg_filename']}) >> \ + components['data_split'] >> \ + cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']), + 'eval_src_filename': a['eval_src_filename'], + 'eval_trg_filename': a['eval_trg_filename']}) + + # + # Translation model training + # + translation_model_component = cons_split_wire() >> \ + components['model_training'].first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': b['eval_src_filename']}) + + # + # The whole pipeline + # + pipeline = tokenisation_component >> \ + cons_split_wire() >> \ + (cleanup_datasplit_component >> translation_model_component).first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': clip_last_bit(t['development_data_filename']), + 'trg_language_model_filename': b['trg_language_model_filename'], + 'trg_language_model_order': 3, + 'trg_language_model_type': 9}) >> \ + components['mert'] + + + # + # The input to the pipeline + # + value = {'src_filename': src_filename, + 'trg_filename': trg_filename} + + # + # Evaluate the pipeline + # + logger.info("Evaluating pipeline with input [%s]..." % value) + new_value = eval_pipeline(executor, pipeline, value, component_config) + + # + # Wait for all components to finish + # + executor.shutdown(True) + + logger.info("Pipeline evaluated to %s" % new_value) + + +if __name__ == '__main__': + import sys + + main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) diff --git a/contrib/arrow-pipelines/python/test/__init__.py b/contrib/arrow-pipelines/python/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/test/test.py b/contrib/arrow-pipelines/python/test/test.py new file mode 100644 index 000000000..628796f7d --- /dev/null +++ b/contrib/arrow-pipelines/python/test/test.py @@ -0,0 +1,11 @@ +import subprocess + +def cat(filename, content): + fh = open(filename, "w") + for line in content: + #print(line, file=fh) + print >> fh, line + fh.close() + +def diff(filename1, filename2): + subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT) diff --git a/contrib/arrow-pipelines/python/training/__init__.py b/contrib/arrow-pipelines/python/training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/__init__.py b/contrib/arrow-pipelines/python/training/components/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py b/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py new file mode 100644 index 000000000..cb2e057ce --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py @@ -0,0 +1,125 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['segment_length'] 
= args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print >>ofh1, l1, + print >>ofh2, l2, + + def _make_cleaned_filename(filename): + bits = filename.split(".") + bits[-1] = "clean" + return ".".join(bits) + + def _filter_main(value, config): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + input_trg_filename = value['trg_filename'] + + print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename) + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + + cleaned_src_filename = _make_cleaned_filename(input_src_filename) + cleaned_trg_filename = _make_cleaned_filename(input_trg_filename) + ofh1 = open(cleaned_src_filename, "w") + ofh2 = open(cleaned_trg_filename, "w") + + _filter(limit, ifh1, ofh1, ifh2, ofh2) + + return {'cleaned_src_filename': cleaned_src_filename, + 'cleaned_trg_filename': cleaned_trg_filename} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _filter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + try: + thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename']) + finally: + os.unlink(output['cleaned_src_filename']) + os.unlink(output['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py new file mode 100644 index 000000000..27625c612 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py @@ -0,0 +1,109 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + 
result['segment_length'] = args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print(l1, end='', file=ofh1) + print(l2, end='', file=ofh2) + + def _filter_main(config, value): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + ifh1 = open(value['src_filename'], "r") + ifh2 = open(value['trg_filename'], "r") + ofh1 = open(value['cleaned_src_filename'], "w") + ofh2 = open(value['cleaned_trg_filename'], "w") + + _filter(limit, ifh1, ofh1, ifh2, ofh2) + + return {'cleaned_src_filename': value['cleaned_src_filename'], + 'cleaned_trg_filename': value['cleaned_trg_filename']} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return cons_function_component(_filter_main) + + +if __name__ == '__main__': + import os + import tempfile + import training.components.shared.test as thelp + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_filename': src_filename[1] + ".clean", + 'cleaned_trg_filename': trg_filename[1] + ".clean", + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + + run_pipeline(box, box_config, box_eval) + thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/data_split/__init__.py b/contrib/arrow-pipelines/python/training/components/data_split/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/data_split/data_split.py b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py new file mode 100644 index 000000000..b8469cbf6 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py @@ -0,0 +1,146 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['evaluate_size'] = args['evaluation_data_size'] + result['development_size'] = 
args['development_data_size'] + return result + +def initialise(config): + + def _copy(size, inp, ofh1, ofh2): + try: + while size != 0: + (l1, l2) = inp.next() + print >>ofh1, l1, + print >>ofh2, l2, + size -= 1 + except StopIteration: + pass + + def _make_split_filename(filename, data_set): + bits = filename.split(".") + last = bits.pop() + lang_code = bits.pop() + + bits.append(last) + bits.append(data_set) + bits.append(lang_code) + + new_filename = ".".join(bits) + return new_filename + + def _splitter_main(value, config): + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + input_trg_filename = value['trg_filename'] + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + inp = iter(zip(ifh1, ifh2)) + + result = {} + for (data_set, size) in [ + ('devel', config['development_size']), + ('eval', config['evaluate_size']), + ('train', -1) + ]: + output_src_filename = _make_split_filename(input_src_filename, data_set) + output_trg_filename = _make_split_filename(input_trg_filename, data_set) + ofh1 = open(output_src_filename, "w") + ofh2 = open(output_trg_filename, "w") + + _copy(size, inp, ofh1, ofh2) + result[data_set + '_src_filename'] = output_src_filename + result[data_set + '_trg_filename'] = output_trg_filename + + return result + + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _splitter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = { + 'evaluation_data_size': 7, + 'development_data_size': 13, + } + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'devel_src_expected': src_filename[1] + ".devel.expected", + 'devel_trg_expected': trg_filename[1] + ".devel.expected", + 'eval_src_expected': src_filename[1] + ".eval.expected", + 'eval_trg_expected': trg_filename[1] + ".eval.expected", + 'train_src_expected': src_filename[1] + ".train.expected", + 'train_trg_expected': trg_filename[1] + ".train.expected", + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + for data_set in ['devel', 'eval', 'train']: + for lang in ['src', 'trg']: + filename = output[data_set + '_' + lang + '_filename'] + filename_expected = box_eval[data_set + '_' + lang + '_expected'] + thelp.diff(filename_expected, filename) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line(range(50))) + thelp.cat(box_eval['trg_filename'], _line(range(50))) + #expected output: + thelp.cat(box_eval['devel_src_expected'], _line(range(0,13))) + thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13))) + thelp.cat(box_eval['eval_src_expected'], _line(range(13,20))) + thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20))) + thelp.cat(box_eval['train_src_expected'], _line(range(20,50))) + 
thelp.cat(box_eval['train_trg_expected'], _line(range(20,50))) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py new file mode 100644 index 000000000..f65d61973 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py @@ -0,0 +1,106 @@ +import os +import shutil +import subprocess +import tempfile + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + config = dict() + config['irstlm_install_directory'] = args['irstlm_installation_dir'] + config['smoothing_method'] = args['irstlm_smoothing_method'] + config['lm_directory'] = args['language_model_directory'] + return config + +def initialise(config): + def process(a, s): + # Create the LM directory if we need to + if os.path.exists(s['lm_directory']) is False: + os.makedirs(s['lm_directory']) + + # The filename of the file to chew through + start_end_input_filename = a['input_filename'] + if os.path.exists(start_end_input_filename) is False: + raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename) + + # Derive the output file name for the add start-end marker processor + filename_bits = os.path.basename(start_end_input_filename).split(".") + filename_bits[2] = "sb"; + start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the output file name of the LM build + filename_bits[2] = "lm" + lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the compiled LM file name + filename_bits[2] = "arpa" + compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # First thing to do is add start and end markers + start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")] + infile = open(start_end_input_filename, 'r') + outfile = open(start_end_output_filename, 'w') + print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline) + return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile) + if return_code: + raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \ + start_end_input_filename, start_end_output_filename, return_code) + + # Next build the language model + tmp_dir = tempfile.mkdtemp(dir = "/tmp") + try: + build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"), + "-i", start_end_output_filename, + "-t", tmp_dir, + "-p", + "-s", s['smoothing_method'], + "-o", lm_filename] + print "IRSTLM Build: Invoking [%s]..." 
% " ".join(build_lm_cmdline) + return_code = subprocess.check_call(build_lm_cmdline) + if return_code: + raise Exception("IRST language model failed to build: return code = [%d]" % return_code) + finally: + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + # Compile the LM + lm_filename = lm_filename + ".gz" + compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"), + "--text", "yes", + lm_filename, + compiled_lm_filename] + print "IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline) + return_code = subprocess.check_call(compile_lm_cmdline) + if return_code: + raise Exception("IRST language model compilation failed: return code = [%d]" % return_code) + + output = {'add_start_end_filename': start_end_output_filename, + 'lm_filename': lm_filename, + 'compiled_lm_filename': compiled_lm_filename} + + print "IRSTLM Build: Output = %s" % output + + return output + + return process + + +if __name__ == '__main__': + from pypeline.helpers.helpers import eval_pipeline + + lm_dir = os.environ["PWD"] + configuration = {'irstlm_root': os.environ["IRSTLM"], + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': lm_dir} + component_config = configure(configuration) + component = initialise(component_config) + + value = eval_pipeline(component, + {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'}, + component_config) + target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'), + 'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'), + 'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')} + print "Target: %s" % target + if value != target: + raise Exception("Massive fail!") diff --git a/contrib/arrow-pipelines/python/training/components/mert/__init__.py b/contrib/arrow-pipelines/python/training/components/mert/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/mert/mert.py b/contrib/arrow-pipelines/python/training/components/mert/mert.py new file mode 100755 index 000000000..2b60b1720 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/mert/mert.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['mert_working_dir'] = args['mert_working_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['development_data_filename']) + lm_file = os.path.abspath(a['trg_language_model_filename']) + lm_order = int(a['trg_language_model_order']) + lm_type = int(a['trg_language_model_type']) + orig_moses_ini = os.path.abspath(a['moses_ini_file']) + + if not os.path.exists(orig_moses_ini): + raise Exception, "Error: Input moses.ini does not exist" + + workdir = os.path.abspath(config['mert_working_dir']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + moses_install_dir = os.path.abspath(config['moses_installation_dir']) + mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl') + bin_dir = os.path.join(moses_install_dir, 'bin') + 
moses_bin = os.path.join(moses_install_dir, 'bin', 'moses') + src_file = infilename + '.' + config['src_lang'] + ref_file = infilename + '.' + config['trg_lang'] + logfile = os.path.join(workdir, 'log') + #change lm configuration in moses ini + moses_ini = os.path.join(workdir, 'trained-moses.ini') + cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s" + cmd = cmd % locals() + os.system(cmd) + + #the command + cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s' + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + new_mosesini = os.path.join(workdir, 'moses.ini') + if not os.path.exists(new_mosesini): + raise Exception, 'Failed MERT' + + return {'moses_ini_file':new_mosesini} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.path.abspath('../../../../'), + 'mert_working_dir':'../../../../../tuning'} + values = {'development_data_filename':'../../../../../corpus/tune', + 'moses_ini_file':'../../../../../model/model/moses.ini', + 'trg_language_model_filename':'../../../../../corpus/train.lt.lm', + 'trg_language_model_type':9, + 'trg_language_model_order':4} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/model_training/__init__.py b/contrib/arrow-pipelines/python/training/components/model_training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/model_training/model_training.py b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py new file mode 100755 index 000000000..e990307d2 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['external_bin_dir'] = args['giza_installation_dir'] + result['model_directory'] = args['translation_model_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['training_data_filename']) + workdir = os.path.abspath(config['model_directory']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl' + src_lang = config['src_lang'].lower() + trg_lang = config['trg_lang'].lower() + external_bin = os.path.abspath(config['external_bin_dir']) + #create a dummy lm file + dummy_lmfile = workdir + os.sep + 'dummy.lm' + f = open(dummy_lmfile, 'w') + print >> f, "dummy lm file" + f.close() + logfile = workdir + os.sep + 'log' + + #the command + cmd = '%(train_model_perl)s -root-dir 
%(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s' + + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini' + if not os.path.exists(mosesini): + raise Exception, 'Failed training model' + + return {'moses_ini_file':mosesini} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.environ['MOSES_HOME'], + 'giza_installation_dir':os.environ['GIZA_HOME'], + 'translation_model_directory':'model-dir'} + values = {'training_data_filename':'/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py b/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py new file mode 100755 index 000000000..57f8771df --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['src_tokenisation_dir'] = args['src_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['src_filename'] + outfilename = Tokenizer.batch_tokenise( + config['src_lang'], + config['moses_installation_dir'], + infilename, + config['src_tokenisation_dir']) + return {'tokenised_src_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'de', + 'src_tokenisation_dir':'tmptok', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'src_filename':'tmp.de'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de new file mode 100644 index 000000000..c6b41edbe --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de @@ -0,0 +1,3 @@ +asdfweoih +awfwoeijf awefo +what's this diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py new file mode 100644 index 000000000..354ec1abc --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +import sys, os, subprocess + +class Tokenizer: + + @staticmethod + def batch_tokenise(lang, mosesdir, infilename, workdir): + print 
"Tokenizing [%s] in working directory [%s]..." % (infilename, workdir) + if not os.path.exists(workdir): + os.makedirs(workdir) + tok = Tokenizer(lang, mosesdir) + basefilename = os.path.basename(infilename) + outfilename = workdir + os.sep + basefilename + '.tok' + tok.file_tokenise(infilename, outfilename) + return outfilename + + def __init__(self, lang, mosesdir): + self.arrows = None + self.lang = lang + #check the perl tokenizer is here + #path = os.path.dirname(os.path.abspath(__file__)) + path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer' + self.perltok = path + os.sep + 'tokenizer.perl' + if not os.path.exists(path): + raise Exception, "Perl tokenizer does not exists" + + def file_tokenise(self, infilename, outfilename): + cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename) + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + +if __name__ == '__main__': + #do some test + pass + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py new file mode 100755 index 000000000..3852e296f --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['trg_lang'] = args['trg_lang'] + result['trg_tokenisation_dir'] = args['trg_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['trg_filename'] + outfilename = Tokenizer.batch_tokenise( + config['trg_lang'], + config['moses_installation_dir'], + infilename, + config['trg_tokenisation_dir']) + return {'tokenised_trg_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'trg_lang':'de', + 'trg_tokenisation_dir':'tmptoktrg', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'trg_filename':'tmp.de'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() +