From f2536cddffe5dbb141387fea0a27da19e1da21e2 Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Wed, 6 Mar 2013 13:37:41 +0000 Subject: [PATCH] Added arrow based Moses training pipeline demonstration program to contrib. --- .gitmodules | 3 + .../training-pipeline/moses-pypeline.dia | Bin 0 -> 3532 bytes contrib/arrow-pipelines/python/README | 32 +++ contrib/arrow-pipelines/python/libs/pypeline | 1 + contrib/arrow-pipelines/python/manager.py | 192 ++++++++++++++++++ .../arrow-pipelines/python/test/__init__.py | 0 contrib/arrow-pipelines/python/test/test.py | 11 + .../python/training/__init__.py | 0 .../python/training/components/__init__.py | 0 .../training/components/cleanup/__init__.py | 0 .../training/components/cleanup/cleanup.py | 125 ++++++++++++ .../training/components/cleanup/cleanup3.py | 109 ++++++++++ .../components/data_split/__init__.py | 0 .../components/data_split/data_split.py | 146 +++++++++++++ .../components/irstlm_build/__init__.py | 0 .../components/irstlm_build/irstlm_build.py | 106 ++++++++++ .../training/components/mert/__init__.py | 0 .../python/training/components/mert/mert.py | 83 ++++++++ .../components/model_training/__init__.py | 0 .../model_training/model_training.py | 72 +++++++ .../training/components/tokenizer/__init__.py | 0 .../components/tokenizer/src_tokenizer.py | 43 ++++ .../training/components/tokenizer/tmp.de | 3 + .../components/tokenizer/tokenizer.py | 36 ++++ .../components/tokenizer/trg_tokenizer.py | 43 ++++ 25 files changed, 1005 insertions(+) create mode 100644 contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia create mode 100644 contrib/arrow-pipelines/python/README create mode 160000 contrib/arrow-pipelines/python/libs/pypeline create mode 100644 contrib/arrow-pipelines/python/manager.py create mode 100644 contrib/arrow-pipelines/python/test/__init__.py create mode 100644 contrib/arrow-pipelines/python/test/test.py create mode 100644 contrib/arrow-pipelines/python/training/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py create mode 100644 contrib/arrow-pipelines/python/training/components/data_split/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/data_split/data_split.py create mode 100644 contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py create mode 100644 contrib/arrow-pipelines/python/training/components/mert/__init__.py create mode 100755 contrib/arrow-pipelines/python/training/components/mert/mert.py create mode 100644 contrib/arrow-pipelines/python/training/components/model_training/__init__.py create mode 100755 contrib/arrow-pipelines/python/training/components/model_training/model_training.py create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py create mode 100755 contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py create mode 100755 
contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py diff --git a/.gitmodules b/.gitmodules index e69de29bb..d3a8cb4da 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "contrib/arrow-pipelines/python/libs/pypeline"] + path = contrib/arrow-pipelines/python/libs/pypeline + url = git://github.com/ianj-als/pypeline.git diff --git a/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia new file mode 100644 index 0000000000000000000000000000000000000000..1d35a1dea7ad04f671561738f682da9aac844ec5 GIT binary patch literal 3532 zcmV;-4Kwl|iwFP!000021MOW~Z{kQ6e(zs_=!bqW71zy|&ZI}`-5u>pJz8m!d6^gF zKn1*Q?8r8q&aQTU`&QY81hBzSl+$KTq`p9kIxbWG&Udc$)6d@*(cqg%voMKoh8Wqy zfrux`G>m6A!{6_I8F|B>KfU{D8U`QbpII6#2J#bere56)=Xw74@#^a7=?O*8S&%0w ziozwz#MS?TC<_o)cJHcxHKAKyP^>PL}oPrGc_zzr#5lv>b@K=?!H@bQMcmKZpDS!_%TWIGzjxuk@rawi6Abe=IK(LE+?A=k=$bY zmW5BB{UOZrZfy%GsX{&HgRS z!uv?nU40nmXPf@{JkxJ~I|YKjd|Y!kl(RLeh^Z&I;K!?o*)p7p?7%kL9Mdq8rpbRA08v+F_=y_b>Y@>Y;@umpNsE1cWE0Jw#GEw ziGmSsjN2lRx5+Y{h{0X*Rm7oW?DA=AV3T_wxJjQUab71|y+3#e7Gd-(FENO-;ULSO z6;{Z{y!`0jMf6SN;UstsX~$o|&W@}7nmoI4+npkG6M*WTS%O^R6xe_n*Aup>n)cW* zwyh^f62Doz5cS`vh?k2RoxM+M*D0PR<$3j@DNa`%_zbE8WAnPhE|mxIuvf%dp4?QtiqT#Tv`1&!gXn4x>RTq22lb??unOJ5B)#w~ z)2ie2^|a9(6uowbfv&g!UGW!nE0&L-A57~9CS^L@0`<_LdZ4=MfpeqkfwQ%GaBM_; z-?&AX#CmefvbaBx6fmY(>>Z?|_}ejR*XhS3z%gmKFPBi9AT(;4-|twh{ukWQtV zxKoeBtJJ$nPjFp5fnAKKqe1vvX^XWK%K3Hmf+B2_9)Jh{hiTxb-Gi5a8@2)5Xv*Cw zE@}Ijc)65ACh(DP^d*c%4Q+LF;YfL>X^|mg5RIe|$CE<1=aFm95yn_~#Hsw}G^vv$ zj>jUN77ICcdG+3rlkb51#wU;4mr$29cND{or{f?^lc(klMu_Ir<4DA_eFXhv%K0MV zfCc0JXGD4MSgb;K+~BNajO5Mg~|XOUdUHZxQ2=dV&4$I~zchSn)cl1>QP z!vkQXDT0R2QL2TRS%QXX0bD*JMB0^@^#;0>@mL34>V1H&Hzs(P0e1BpyR>VCT%%Q{ zu7wNS!XBb-k?GkN*ean(OveqLF#ssz0JX6-!S~fic+xIL8Tl49$*fdUG3<3|ac!4`o3cwD)&hCx_)afJYFnx2(XsW!c(&SeL8CRYp z+(VNL5)$6dB*PeNvVLQfcAhq)Mk_$&^d&}did>aNg?r=do~U}4W}i{v_+E_x5PP%Y zkd6HdAl63^)6VXuu4OW0C#NqXcFaM(Z?WF6tW3<*eycK|6)ycAXYv~tQ$Qx;$h5gB zAtKjDWYW%4XWXm}$oR?#lQ|je(lDGz7Xwc@@t`NXBGm3`Trt1?}u<5(O6Tv zGrA~tH;#Ag(g%RS2VhzeNYyKciL zNI7os$u8Vh`a?-kLa@{x4zdZha1ru8QDmCW#|H!5wJU4hy{!jgjf+mwaC6V?qdv`h z@ff+mQgZTSnb!dNnbZYD=Fic58*7B-C4t=dFzG5Z@8}a(&oFWIDmxxyANj0Lg=1}_ z?u#T7**J{DVgy*<-Etof>c;x+i`1^p@Tu95doh2JVT5f-oVoUk-+S+n+!3fs9aT(x56PzOkz;&!uCK=+kx)MXM+I>0SYyv&^bW} z({52nyRe3G$xSAZDU~`LG9%YS(FCYS@EEUwQDDjXe3nxCf6K_23cYG5*@qLBNA|StWjd z`|S?gcgEd!l7%or!F|_dVIQUX3d6B{qYA^ZyfUqZal{;I`MpW2DdW-eWFfYW7c7%$ zM4FGElJslQ6a$VQ>>6%1_AOMi^e~-<>Q}Q6vqwIw`O7n(u{3I_D=lC5&yo|kQ3+Vn zFcsjgOSmK5oD>cDIp!J;QmzkoxClf#hNG-fjEb^4Wz#`ve*NGBrb2U2 zf7^MA9n~%!DZe_ST7s#N@(3d)dET|>R#D~C3w~RLly}zEG~{62t>*djki$K%^*9a*$H33d zjw7~)-;m+Ata@Pfc}zoMG~c*14oOw>Q1xc=JFyMP{&eGVkl&^wQ>#J5Xh}p%3$knU zdI+Kn;OoaRFOSPt-mdvNQ^myL$OqG|mB6mq436Rq64g-tn%Ju00M`f`w#-*Pc}380 z03spKaG*24L1m9P5Dn=%8p4Ra*-a=nUg+x-Ewr49^j3zq*+Spig*9*9)&m_V^abK- zfVko_+TGpJR_(e8%8b%hO4qcNIx1nuG-*A3JwIZ?4m8b&gq;gHX}&7B2a0-rirSE; z&9uu>l&1l^Wtp@=B_YKSrw~)*X;W4v>^NQIY1c2YUavr!_T{x#(4u|xZL(pQb^!*# zMp>VnzN|K*1kv)@>dE2o$yX7FnV62%k#dcPzOP5AvNAaM{Vn*!<_?BDnIE~1Y`+W5 zIZu}=ZoEeouINeCrHUJl8SgBsu|ZXf{#4MO3iwkO?WvB-CVrbLH>O>}F!GJU%}AxA zluomegKXMCtoQU4wR4;Km~ef+7Q}kNiS}nEy(1TKR>A@J(Yo`aov--BaK8g7#`TpL zhjQjzV@_5L54dkTraKvh8^;IQHa}TLm{&**Vi~ybX|Wz zp$_`4*ROETxPy6N5{sbwI)G6(V02tHgXv=V2785T*D{O@pPwk_mxRZrD&p^B=X&HR z2qHzq|I`Gl)^^I);_7{25bD2}XBjs<%f~;C|mIjMY@BR-M%FsNC G2>}2H;Sd7= literal 0 HcmV?d00001 diff --git a/contrib/arrow-pipelines/python/README b/contrib/arrow-pipelines/python/README new file mode 100644 index 
000000000..e1e12975c --- /dev/null +++ b/contrib/arrow-pipelines/python/README @@ -0,0 +1,32 @@ +Arrow Based Moses Training Pipeline +=================================== + +To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command: + +$ git submodule init + +This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline: + +$ cd libs/pypeline +$ python setup.py install + +Alternatively, you can set an appropriate PYTHONPATH enviornment variable to the Pypeline library. + +This demonstration implements a training pipeline that is shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia. + +Three environment variables need to be set before the manager.py script can be run, they are: + + - MOSES_HOME : The directory where Moses has been cloned, or installed, + - IRSTLM : The installation directory of your IRSTLM, and + - GIZA_HOME : The installation directory of GIZA++. + +The manager.py script takes four positional command-line arguments: + + - The source language code, + - The target language code, + - The source corpus file. This file *must* be cleaned prior to use, and + - The target corpus file. This file *must* be cleaned prior to use. + +For example, run the manager.py script with: + +$ python manager.py en lt cleantrain.en cleantrain.lt diff --git a/contrib/arrow-pipelines/python/libs/pypeline b/contrib/arrow-pipelines/python/libs/pypeline new file mode 160000 index 000000000..a7084b686 --- /dev/null +++ b/contrib/arrow-pipelines/python/libs/pypeline @@ -0,0 +1 @@ +Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb diff --git a/contrib/arrow-pipelines/python/manager.py b/contrib/arrow-pipelines/python/manager.py new file mode 100644 index 000000000..1c3ece111 --- /dev/null +++ b/contrib/arrow-pipelines/python/manager.py @@ -0,0 +1,192 @@ +import logging +import os + +from concurrent.futures import Future, ThreadPoolExecutor +from functools import partial +from pypeline.helpers.parallel_helpers import eval_pipeline, \ + cons_function_component, \ + cons_wire, \ + cons_split_wire, \ + cons_unsplit_wire, \ + cons_dictionary_wire + + +# +# Some logging please +# +FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s' +logging.basicConfig(format = FORMAT, level = logging.DEBUG) +logger = logging.getLogger("manager") + + +# Build the pipeline components +def build_components(components, configuration, executor): + pipeline_components = dict() + pipeline_configuration = dict() + + for component_id, module_name in components.items(): + logger.info("Loading [%s] component from [%s]..." % (component_id, module_name)) + + module = __import__(module_name, fromlist = ['configure', 'initialise']) + + # Component builds its own configuration object + config_func = getattr(module, 'configure') + component_config = config_func(configuration) + pipeline_configuration.update(component_config) + + # Now build the component + init_func = getattr(module, 'initialise') + component_function = init_func(component_config) + + # A wrapper for the component's function that submits to the executor + def get_component_function_wrapper(inner_function, comp_id, mod_name): + def component_function_wrapper(a, s): + logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." 
% \ + (comp_id, mod_name, a, s)) + return inner_function(a, s) + + return component_function_wrapper + + # Arrowize the component + component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name)) + + # And store + pipeline_components[component_id] = component + + return pipeline_components, pipeline_configuration + + +# Go! +def main(src_lang, trg_lang, src_filename, trg_filename): + # Global configuration + # One day, this configuration shall be constructed from + # command line options, or a properties file. + configuration = { + 'moses_installation_dir': os.environ['MOSES_HOME'], + 'irstlm_installation_dir': os.environ['IRSTLM'], + 'giza_installation_dir': os.environ['GIZA_HOME'], + 'src_lang': src_lang, + 'src_tokenisation_dir': './tokenisation', + 'trg_lang': trg_lang, + 'trg_tokenisation_dir': './tokenisation', + 'segment_length_limit': 60, + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': './language-model', + 'translation_model_directory': './translation-model', + 'mert_working_directory': './mert', + 'evaluation_data_size': 100, + 'development_data_size': 100 + } + + # The modules to load + # In the future, the components shall be specified in some kind + # pipeline description file. + component_modules = { + 'src_tokenizer': 'training.components.tokenizer.src_tokenizer', + 'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer', + 'cleanup': 'training.components.cleanup.cleanup', + 'data_split': 'training.components.data_split.data_split', + 'irstlm_build': 'training.components.irstlm_build.irstlm_build', + 'model_training': 'training.components.model_training.model_training', + 'mert': 'training.components.mert.mert' + } + + # The thread pool + executor = ThreadPoolExecutor(max_workers = 3) + + # Phew, build the required components + components, component_config = build_components(component_modules, configuration, executor) + + # + # Wire up components + # Description of wiring should be, in the future, alongside the component + # specification in some kind of confuguration file. Components shall be + # declared then used, i.e., bind a component instance to a unique component + # identifier, then wire component instances together by identifier. + # + + # + # Tokenisation of source and target... + # + # IRSTLM Build components + irstlm_build_component = cons_split_wire() >> \ + (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \ + components['irstlm_build']).second() >> \ + cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'], + 'trg_language_model_filename': b['compiled_lm_filename']}) + + # The complete tokenisation component + tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \ + irstlm_build_component.second() >> \ + cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'], + 'trg_filename': b['tokenised_trg_filename'], + 'trg_language_model_filename': b['trg_language_model_filename']}) + + # + # Cleanup and Data Spliting... + # + + # + # A function that clips off the last '.' 
delimited string + # + def clip_last_bit(filename): + bn = os.path.basename(filename) + directory = os.path.dirname(filename) + bits = bn.split(".") + bits.pop() + return os.path.join(directory, ".".join(bits)) + + cleanup_datasplit_component = components['cleanup'] >> \ + cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'], + 'trg_filename': a['cleaned_trg_filename']}) >> \ + components['data_split'] >> \ + cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']), + 'eval_src_filename': a['eval_src_filename'], + 'eval_trg_filename': a['eval_trg_filename']}) + + # + # Translation model training + # + translation_model_component = cons_split_wire() >> \ + components['model_training'].first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': b['eval_src_filename']}) + + # + # The whole pipeline + # + pipeline = tokenisation_component >> \ + cons_split_wire() >> \ + (cleanup_datasplit_component >> translation_model_component).first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': clip_last_bit(t['development_data_filename']), + 'trg_language_model_filename': b['trg_language_model_filename'], + 'trg_language_model_order': 3, + 'trg_language_model_type': 9}) >> \ + components['mert'] + + + # + # The input to the pipeline + # + value = {'src_filename': src_filename, + 'trg_filename': trg_filename} + + # + # Evaluate the pipeline + # + logger.info("Evaluating pipeline with input [%s]..." % value) + new_value = eval_pipeline(executor, pipeline, value, component_config) + + # + # Wait for all components to finish + # + executor.shutdown(True) + + logger.info("Pipeline evaluated to %s" % new_value) + + +if __name__ == '__main__': + import sys + + main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) diff --git a/contrib/arrow-pipelines/python/test/__init__.py b/contrib/arrow-pipelines/python/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/test/test.py b/contrib/arrow-pipelines/python/test/test.py new file mode 100644 index 000000000..628796f7d --- /dev/null +++ b/contrib/arrow-pipelines/python/test/test.py @@ -0,0 +1,11 @@ +import subprocess + +def cat(filename, content): + fh = open(filename, "w") + for line in content: + #print(line, file=fh) + print >> fh, line + fh.close() + +def diff(filename1, filename2): + subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT) diff --git a/contrib/arrow-pipelines/python/training/__init__.py b/contrib/arrow-pipelines/python/training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/__init__.py b/contrib/arrow-pipelines/python/training/components/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py b/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py new file mode 100644 index 000000000..cb2e057ce --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py @@ -0,0 +1,125 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['segment_length'] 
= args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print >>ofh1, l1, + print >>ofh2, l2, + + def _make_cleaned_filename(filename): + bits = filename.split(".") + bits[-1] = "clean" + return ".".join(bits) + + def _filter_main(value, config): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + input_trg_filename = value['trg_filename'] + + print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename) + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + + cleaned_src_filename = _make_cleaned_filename(input_src_filename) + cleaned_trg_filename = _make_cleaned_filename(input_trg_filename) + ofh1 = open(cleaned_src_filename, "w") + ofh2 = open(cleaned_trg_filename, "w") + + _filter(limit, ifh1, ofh1, ifh2, ofh2) + + return {'cleaned_src_filename': cleaned_src_filename, + 'cleaned_trg_filename': cleaned_trg_filename} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _filter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + try: + thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename']) + finally: + os.unlink(output['cleaned_src_filename']) + os.unlink(output['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py new file mode 100644 index 000000000..27625c612 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py @@ -0,0 +1,109 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + 
result['segment_length'] = args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print(l1, end='', file=ofh1) + print(l2, end='', file=ofh2) + + def _filter_main(config, value): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + ifh1 = open(value['src_filename'], "r") + ifh2 = open(value['trg_filename'], "r") + ofh1 = open(value['cleaned_src_filename'], "w") + ofh2 = open(value['cleaned_trg_filename'], "w") + + _filter(limit, ifh1, ofh1, ifh2, ofh2) + + return {'cleaned_src_filename': value['cleaned_src_filename'], + 'cleaned_trg_filename': value['cleaned_trg_filename']} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return cons_function_component(_filter_main) + + +if __name__ == '__main__': + import os + import tempfile + import training.components.shared.test as thelp + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_filename': src_filename[1] + ".clean", + 'cleaned_trg_filename': trg_filename[1] + ".clean", + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + + run_pipeline(box, box_config, box_eval) + thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/data_split/__init__.py b/contrib/arrow-pipelines/python/training/components/data_split/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/data_split/data_split.py b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py new file mode 100644 index 000000000..b8469cbf6 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py @@ -0,0 +1,146 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['evaluate_size'] = args['evaluation_data_size'] + result['development_size'] = 
args['development_data_size'] + return result + +def initialise(config): + + def _copy(size, inp, ofh1, ofh2): + try: + while size != 0: + (l1, l2) = inp.next() + print >>ofh1, l1, + print >>ofh2, l2, + size -= 1 + except StopIteration: + pass + + def _make_split_filename(filename, data_set): + bits = filename.split(".") + last = bits.pop() + lang_code = bits.pop() + + bits.append(last) + bits.append(data_set) + bits.append(lang_code) + + new_filename = ".".join(bits) + return new_filename + + def _splitter_main(value, config): + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + input_trg_filename = value['trg_filename'] + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + inp = iter(zip(ifh1, ifh2)) + + result = {} + for (data_set, size) in [ + ('devel', config['development_size']), + ('eval', config['evaluate_size']), + ('train', -1) + ]: + output_src_filename = _make_split_filename(input_src_filename, data_set) + output_trg_filename = _make_split_filename(input_trg_filename, data_set) + ofh1 = open(output_src_filename, "w") + ofh2 = open(output_trg_filename, "w") + + _copy(size, inp, ofh1, ofh2) + result[data_set + '_src_filename'] = output_src_filename + result[data_set + '_trg_filename'] = output_trg_filename + + return result + + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _splitter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = { + 'evaluation_data_size': 7, + 'development_data_size': 13, + } + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'devel_src_expected': src_filename[1] + ".devel.expected", + 'devel_trg_expected': trg_filename[1] + ".devel.expected", + 'eval_src_expected': src_filename[1] + ".eval.expected", + 'eval_trg_expected': trg_filename[1] + ".eval.expected", + 'train_src_expected': src_filename[1] + ".train.expected", + 'train_trg_expected': trg_filename[1] + ".train.expected", + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + for data_set in ['devel', 'eval', 'train']: + for lang in ['src', 'trg']: + filename = output[data_set + '_' + lang + '_filename'] + filename_expected = box_eval[data_set + '_' + lang + '_expected'] + thelp.diff(filename_expected, filename) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line(range(50))) + thelp.cat(box_eval['trg_filename'], _line(range(50))) + #expected output: + thelp.cat(box_eval['devel_src_expected'], _line(range(0,13))) + thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13))) + thelp.cat(box_eval['eval_src_expected'], _line(range(13,20))) + thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20))) + thelp.cat(box_eval['train_src_expected'], _line(range(20,50))) + 
thelp.cat(box_eval['train_trg_expected'], _line(range(20,50))) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py new file mode 100644 index 000000000..f65d61973 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py @@ -0,0 +1,106 @@ +import os +import shutil +import subprocess +import tempfile + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + config = dict() + config['irstlm_install_directory'] = args['irstlm_installation_dir'] + config['smoothing_method'] = args['irstlm_smoothing_method'] + config['lm_directory'] = args['language_model_directory'] + return config + +def initialise(config): + def process(a, s): + # Create the LM directory if we need to + if os.path.exists(s['lm_directory']) is False: + os.makedirs(s['lm_directory']) + + # The filename of the file to chew through + start_end_input_filename = a['input_filename'] + if os.path.exists(start_end_input_filename) is False: + raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename) + + # Derive the output file name for the add start-end marker processor + filename_bits = os.path.basename(start_end_input_filename).split(".") + filename_bits[2] = "sb"; + start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the output file name of the LM build + filename_bits[2] = "lm" + lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the compiled LM file name + filename_bits[2] = "arpa" + compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # First thing to do is add start and end markers + start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")] + infile = open(start_end_input_filename, 'r') + outfile = open(start_end_output_filename, 'w') + print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline) + return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile) + if return_code: + raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \ + start_end_input_filename, start_end_output_filename, return_code) + + # Next build the language model + tmp_dir = tempfile.mkdtemp(dir = "/tmp") + try: + build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"), + "-i", start_end_output_filename, + "-t", tmp_dir, + "-p", + "-s", s['smoothing_method'], + "-o", lm_filename] + print "IRSTLM Build: Invoking [%s]..." 
% " ".join(build_lm_cmdline) + return_code = subprocess.check_call(build_lm_cmdline) + if return_code: + raise Exception("IRST language model failed to build: return code = [%d]" % return_code) + finally: + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + # Compile the LM + lm_filename = lm_filename + ".gz" + compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"), + "--text", "yes", + lm_filename, + compiled_lm_filename] + print "IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline) + return_code = subprocess.check_call(compile_lm_cmdline) + if return_code: + raise Exception("IRST language model compilation failed: return code = [%d]" % return_code) + + output = {'add_start_end_filename': start_end_output_filename, + 'lm_filename': lm_filename, + 'compiled_lm_filename': compiled_lm_filename} + + print "IRSTLM Build: Output = %s" % output + + return output + + return process + + +if __name__ == '__main__': + from pypeline.helpers.helpers import eval_pipeline + + lm_dir = os.environ["PWD"] + configuration = {'irstlm_root': os.environ["IRSTLM"], + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': lm_dir} + component_config = configure(configuration) + component = initialise(component_config) + + value = eval_pipeline(component, + {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'}, + component_config) + target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'), + 'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'), + 'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')} + print "Target: %s" % target + if value != target: + raise Exception("Massive fail!") diff --git a/contrib/arrow-pipelines/python/training/components/mert/__init__.py b/contrib/arrow-pipelines/python/training/components/mert/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/mert/mert.py b/contrib/arrow-pipelines/python/training/components/mert/mert.py new file mode 100755 index 000000000..2b60b1720 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/mert/mert.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['mert_working_dir'] = args['mert_working_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['development_data_filename']) + lm_file = os.path.abspath(a['trg_language_model_filename']) + lm_order = int(a['trg_language_model_order']) + lm_type = int(a['trg_language_model_type']) + orig_moses_ini = os.path.abspath(a['moses_ini_file']) + + if not os.path.exists(orig_moses_ini): + raise Exception, "Error: Input moses.ini does not exist" + + workdir = os.path.abspath(config['mert_working_dir']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + moses_install_dir = os.path.abspath(config['moses_installation_dir']) + mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl') + bin_dir = os.path.join(moses_install_dir, 'bin') + 
moses_bin = os.path.join(moses_install_dir, 'bin', 'moses') + src_file = infilename + '.' + config['src_lang'] + ref_file = infilename + '.' + config['trg_lang'] + logfile = os.path.join(workdir, 'log') + #change lm configuration in moses ini + moses_ini = os.path.join(workdir, 'trained-moses.ini') + cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s" + cmd = cmd % locals() + os.system(cmd) + + #the command + cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s' + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + new_mosesini = os.path.join(workdir, 'moses.ini') + if not os.path.exists(new_mosesini): + raise Exception, 'Failed MERT' + + return {'moses_ini_file':new_mosesini} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.path.abspath('../../../../'), + 'mert_working_dir':'../../../../../tuning'} + values = {'development_data_filename':'../../../../../corpus/tune', + 'moses_ini_file':'../../../../../model/model/moses.ini', + 'trg_language_model_filename':'../../../../../corpus/train.lt.lm', + 'trg_language_model_type':9, + 'trg_language_model_order':4} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/model_training/__init__.py b/contrib/arrow-pipelines/python/training/components/model_training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/model_training/model_training.py b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py new file mode 100755 index 000000000..e990307d2 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['external_bin_dir'] = args['giza_installation_dir'] + result['model_directory'] = args['translation_model_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['training_data_filename']) + workdir = os.path.abspath(config['model_directory']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl' + src_lang = config['src_lang'].lower() + trg_lang = config['trg_lang'].lower() + external_bin = os.path.abspath(config['external_bin_dir']) + #create a dummy lm file + dummy_lmfile = workdir + os.sep + 'dummy.lm' + f = open(dummy_lmfile, 'w') + print >> f, "dummy lm file" + f.close() + logfile = workdir + os.sep + 'log' + + #the command + cmd = '%(train_model_perl)s -root-dir 
%(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s' + + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini' + if not os.path.exists(mosesini): + raise Exception, 'Failed training model' + + return {'moses_ini_file':mosesini} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.environ['MOSES_HOME'], + 'giza_installation_dir':os.environ['GIZA_HOME'], + 'translation_model_directory':'model-dir'} + values = {'training_data_filename':'/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py b/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py new file mode 100755 index 000000000..57f8771df --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['src_tokenisation_dir'] = args['src_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['src_filename'] + outfilename = Tokenizer.batch_tokenise( + config['src_lang'], + config['moses_installation_dir'], + infilename, + config['src_tokenisation_dir']) + return {'tokenised_src_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'de', + 'src_tokenisation_dir':'tmptok', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'src_filename':'tmp.de'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de new file mode 100644 index 000000000..c6b41edbe --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de @@ -0,0 +1,3 @@ +asdfweoih +awfwoeijf awefo +what's this diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py new file mode 100644 index 000000000..354ec1abc --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +import sys, os, subprocess + +class Tokenizer: + + @staticmethod + def batch_tokenise(lang, mosesdir, infilename, workdir): + print 
"Tokenizing [%s] in working directory [%s]..." % (infilename, workdir) + if not os.path.exists(workdir): + os.makedirs(workdir) + tok = Tokenizer(lang, mosesdir) + basefilename = os.path.basename(infilename) + outfilename = workdir + os.sep + basefilename + '.tok' + tok.file_tokenise(infilename, outfilename) + return outfilename + + def __init__(self, lang, mosesdir): + self.arrows = None + self.lang = lang + #check the perl tokenizer is here + #path = os.path.dirname(os.path.abspath(__file__)) + path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer' + self.perltok = path + os.sep + 'tokenizer.perl' + if not os.path.exists(path): + raise Exception, "Perl tokenizer does not exists" + + def file_tokenise(self, infilename, outfilename): + cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename) + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + +if __name__ == '__main__': + #do some test + pass + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py new file mode 100755 index 000000000..3852e296f --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['trg_lang'] = args['trg_lang'] + result['trg_tokenisation_dir'] = args['trg_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['trg_filename'] + outfilename = Tokenizer.batch_tokenise( + config['trg_lang'], + config['moses_installation_dir'], + infilename, + config['trg_tokenisation_dir']) + return {'tokenised_trg_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'trg_lang':'de', + 'trg_tokenisation_dir':'tmptoktrg', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'trg_filename':'tmp.de'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() +