Added PCL based arrow pipelining

Ian Johnson 2013-05-22 10:25:25 +01:00
parent 2f38746f6a
commit 166eb7d0e0
46 changed files with 70404 additions and 811 deletions

.gitmodules

@@ -1,3 +1,3 @@
-[submodule "contrib/arrow-pipelines/python/libs/pypeline"]
-	path = contrib/arrow-pipelines/python/libs/pypeline
-	url = git://github.com/ianj-als/pypeline.git
+[submodule "contrib/arrow-pipelines/python/pcl"]
+	path = contrib/arrow-pipelines/python/pcl
+	url = git://github.com/ianj-als/pcl.git


@@ -0,0 +1,10 @@
[Configuration]
tokeniser.src.language = en
tokeniser.src.tokenisation_dir = test_data/src_trg_tokenizer/tokenised
tokeniser.trg.language = lt
tokeniser.trg.tokenisation_dir = test_data/src_trg_tokenizer/tokenised
tokeniser.moses.installation = /opt/moses
[Inputs]
src_filename = test_data/src_trg_tokenizer/cleantrain.en
trg_filename = test_data/src_trg_tokenizer/cleantrain.lt


@@ -0,0 +1,40 @@
#
# Import all of the components to be composed
#
import wrappers.tokenizer.tokenizer as tokeniser
#
# Component definition
#
# +---------+ +---------+ +---------+ +---------+
# src_filename -->+ +--> filename -->+-- src --+--> tokenised_filename -->+---------+--> tokenised_filename -->+ +--> tokenised_src_filename
# | | | | | | | |
# trg_filename -->+ +--> filename -->+---------+-------> filename ------->+-- trg --+--> tokenised_filename -->+ +--> tokenised_trg_filename
# +---------+ +---------+ +---------+ +---------+
# Config: {language::String, Config: {language::String,
# tokenisation_dir::String, tokenisation_dir::String,
# moses_installation_dir::String} moses_installation_dir::String}
#
component src_trg_tokeniser
inputs (src_filename), (trg_filename)
outputs (tokenised_src_filename), (tokenised_trg_filename)
configuration tokeniser.src.language,
tokeniser.src.tokenisation_dir,
tokeniser.trg.language,
tokeniser.trg.tokenisation_dir,
tokeniser.moses.installation
declare
src_tokeniser := new tokeniser with
tokeniser.src.language -> language,
tokeniser.src.tokenisation_dir -> tokenisation_dir,
tokeniser.moses.installation -> moses_installation_dir
trg_tokeniser := new tokeniser with
tokeniser.trg.language -> language,
tokeniser.trg.tokenisation_dir -> tokenisation_dir,
tokeniser.moses.installation -> moses_installation_dir
as
wire (src_filename -> filename),
(trg_filename -> filename) >>>
(src_tokeniser *** trg_tokeniser) >>>
wire (tokenised_filename -> tokenised_src_filename),
(tokenised_filename -> tokenised_trg_filename)
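
As the diagram above shows, the composition first renames each input to the filename port a tokeniser instance expects, then runs the two tokeniser instances side by side on the (top, bottom) pair with ***, and finally relabels each branch's tokenised_filename as the source or target output. As a purely illustrative sketch, using the paths from the accompanying configuration file and the filename scheme of the tokenizer wrapper added later in this commit, the component maps

  ({'src_filename': 'test_data/src_trg_tokenizer/cleantrain.en'},
   {'trg_filename': 'test_data/src_trg_tokenizer/cleantrain.lt'})

to

  ({'tokenised_src_filename': 'test_data/src_trg_tokenizer/tokenised/cleantrain.tok.en'},
   {'tokenised_trg_filename': 'test_data/src_trg_tokenizer/tokenised/cleantrain.tok.lt'})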

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,15 @@
[Configuration]
model_training.max_segment_length = 20
model_training.corpus.development_size = 4500
model_training.corpus.evaluation_size = 5000
model_training.src.language = en
model_training.trg.language = lt
model_training.method.alignment = grow-diag-final-and
model_training.method.reordering = msd-bidirectional-fe
model_training.moses.installation = /opt/moses
model_training.giza.installation = /opt/moses/giza++-v1.0.7
model_training.translation_model.dir = test_data/translation_model_training/translation_model
[Inputs]
src_filename = test_data/translation_model_training/cleantrain.en
trg_filename = test_data/translation_model_training/cleantrain.lt


@@ -0,0 +1,68 @@
#
# Import all of the components to be composed
#
import wrappers.cleanup.cleanup as cleanup
import wrappers.data_split.data_split as data_split
import wrappers.model_training.model_training as model_training
#
# Component definition
#
# {cleaned_src_filename, {src_filename, {[devel|eval|train]_src_filename, {src_filename, {moses_ini_file,
# cleaned_trg_filename} trg_filename} [devel|eval|train]_trg_filename} trg_filename} evaluation_data_filename}
# | | | | +-------+ |
# +-------+ | | +-------+ | +-------+ V | Model | {moses_ini_file} +-------+ V
# | Clean | V V | Data | V | +---------------->+ Train +----------------->+ Merge +----->
# {src_filename, -->+ +----->+ +------------->+ Split | +-------+ +---+---+
# trg_filename} | Up | | Split | | +---\ Config: {[src|trg]_language::String, ^
# +-------+ +-------+ +-------+ | alignment_method::String, |
# Config: {segment_length::Int} Config: {development_size::Int, | reordering_method::String, |
# evaluation_size::Int} | giza_installation_dir::String, |
# | model_directory::String} |
# \--------------------------------------------/
#
component translation_model_training
inputs src_filename, trg_filename
outputs evaluation_data_filename, moses_ini_filename
configuration model_training.max_segment_length,
model_training.corpus.development_size,
model_training.corpus.evaluation_size,
model_training.src.language,
model_training.trg.language,
model_training.method.alignment,
model_training.method.reordering,
model_training.moses.installation,
model_training.giza.installation,
model_training.translation_model.dir
declare
cleanup := new cleanup with
model_training.max_segment_length -> segment_length_limit
data_split := new data_split with
model_training.corpus.development_size -> development_data_size,
model_training.corpus.evaluation_size -> evaluation_data_size
model_training := new model_training with
model_training.src.language -> source_language,
model_training.trg.language -> target_language,
model_training.method.alignment -> alignment_method,
model_training.method.reordering -> reordering_method,
model_training.moses.installation -> moses_installation_dir,
model_training.giza.installation -> giza_installation_dir,
model_training.translation_model.dir -> translation_model_directory
as
cleanup >>>
wire cleaned_src_filename -> src_filename,
cleaned_trg_filename -> trg_filename >>>
data_split >>>
wire devel_src_filename -> devel_src_filename,
eval_src_filename -> evaluation_data_filename,
train_trg_filename -> _,
train_src_filename -> _,
eval_trg_filename -> _,
devel_trg_filename -> devel_trg_filename >>>
((wire devel_src_filename -> src_filename,
devel_trg_filename -> trg_filename,
evaluation_data_filename -> _ >>>
model_training) &&&
wire evaluation_data_filename -> evaluation_data_filename) >>>
merge top[moses_ini_filename] -> moses_ini_filename,
bottom[evaluation_data_filename] -> evaluation_data_filename
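
In the composition above, &&& fans the record coming out of the data split into two branches: the top branch renames the devel files to the src/trg ports and runs model_training, while the bottom branch simply carries evaluation_data_filename through. The closing merge then assembles one record from named slots of the two branches; a minimal sketch of what that merge expresses, written as a plain Python function over the (top, bottom) pair (illustrative only, not part of the commit):

def merge_branches(top, bottom):
    # top is the model_training output, bottom is the pass-through wire
    return {'moses_ini_filename': top['moses_ini_filename'],
            'evaluation_data_filename': bottom['evaluation_data_filename']}

This mirrors the cons_unsplit_wire(lambda t, b: ...) idiom used by the hand-wired pipeline manager that this commit removes.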


@@ -0,0 +1,129 @@
def get_name():
return 'cleanup'
def get_inputs():
return ['src_filename', 'trg_filename']
def get_outputs():
return ['cleaned_src_filename', 'cleaned_trg_filename']
def get_configuration():
return ['segment_length_limit']
def configure(args):
return {'segment_length' : args['segment_length_limit']}
def initialise(config):
def _filter(limit, ifh1, ofh1, ifh2, ofh2):
def _short(line):
n = 0
for c in line:
if c == " ":
n += 1
return n < limit
for (l1, l2) in zip(ifh1, ifh2):
if _short(l1) and _short(l2):
print >>ofh1, l1,
print >>ofh2, l2,
def _make_cleaned_filename(filename):
bits = filename.split(".")
bits.insert(-1, "clean")
return ".".join(bits)
def _filter_main(a, s):
limit = config['segment_length']
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
input_src_filename = a['src_filename']
input_trg_filename = a['trg_filename']
print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)
ifh1 = open(input_src_filename, "r")
ifh2 = open(input_trg_filename, "r")
cleaned_src_filename = _make_cleaned_filename(input_src_filename)
cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
ofh1 = open(cleaned_src_filename, "w")
ofh2 = open(cleaned_trg_filename, "w")
_filter(limit, ifh1, ofh1, ifh2, ofh2)
return {'cleaned_src_filename': cleaned_src_filename,
'cleaned_trg_filename': cleaned_trg_filename}
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return _filter_main
if __name__ == '__main__':
import os
import tempfile
import test.test as thelp
from pypeline.helpers.helpers import eval_pipeline
def _test_main():
configuration = {'segment_length_limit': 20}
src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
box_eval = {
'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'cleaned_src_file_expected': src_filename[1] + ".expected",
'cleaned_trg_file_expected': trg_filename[1] + ".expected"}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
box_config = configure(configuration)
box = initialise(box_config)
output = eval_pipeline(box, box_eval, box_config)
try:
thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
finally:
os.unlink(output['cleaned_src_filename'])
os.unlink(output['cleaned_trg_filename'])
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()
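
This is the first of several wrappers in this commit that expose the same module-level interface to PCL: get_name/get_inputs/get_outputs/get_configuration describe the component, configure() maps the named configuration values to the component's own settings, and initialise() closes over those settings and returns the (value, state) -> dictionary function that does the work. A minimal skeleton of that contract, for orientation only (all names below are placeholders, not part of the commit):

def get_name():
    return 'example'

def get_inputs():
    return ['in_value']

def get_outputs():
    return ['out_value']

def get_configuration():
    return ['some_option']

def configure(args):
    return {'some_option': args['some_option']}

def initialise(config):
    def process(a, s):
        # a holds the declared inputs, s the pipeline state
        return {'out_value': a['in_value']}
    return process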


@@ -0,0 +1,7 @@
[Configuration]
evaluation_data_size = 7
development_data_size = 13
[Inputs]
src_filename = test_data/data.en
trg_filename = test_data/data.de


@@ -0,0 +1,144 @@
def get_name():
return 'data_split'
def get_inputs():
return ['src_filename', 'trg_filename']
def get_outputs():
return ['devel_src_filename', 'devel_trg_filename',
'eval_src_filename', 'eval_trg_filename',
'train_src_filename', 'train_trg_filename']
def get_configuration():
return ['evaluation_data_size', 'development_data_size']
def configure(args):
result = {}
result['evaluate_size'] = args['evaluation_data_size']
result['development_size'] = args['development_data_size']
return result
def initialise(config):
def _copy(size, inp, ofh1, ofh2):
try:
while size != 0:
(l1, l2) = inp.next()
print >>ofh1, l1,
print >>ofh2, l2,
size -= 1
except StopIteration:
pass
def _make_split_filename(filename, data_set):
bits = filename.split(".")
bits.insert(-1, data_set)
new_filename = ".".join(bits)
return new_filename
def _splitter_main(a, s):
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
input_src_filename = a['src_filename']
input_trg_filename = a['trg_filename']
ifh1 = open(input_src_filename, "r")
ifh2 = open(input_trg_filename, "r")
inp = iter(zip(ifh1, ifh2))
result = {}
for (data_set, size) in [('devel', config['development_size']),
('eval', config['evaluate_size']),
('train', -1)]:
output_src_filename = _make_split_filename(input_src_filename, data_set)
output_trg_filename = _make_split_filename(input_trg_filename, data_set)
ofh1 = open(output_src_filename, "w")
ofh2 = open(output_trg_filename, "w")
_copy(size, inp, ofh1, ofh2)
result[data_set + '_src_filename'] = output_src_filename
result[data_set + '_trg_filename'] = output_trg_filename
return result
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return _splitter_main
if __name__ == '__main__':
import os
import tempfile
import test.test as thelp
from pypeline.helpers.helpers import eval_pipeline
def _test_main():
configuration = {'evaluation_data_size': 7,
'development_data_size': 13}
src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
box_eval = {'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'devel_src_expected': src_filename[1] + ".devel.expected",
'devel_trg_expected': trg_filename[1] + ".devel.expected",
'eval_src_expected': src_filename[1] + ".eval.expected",
'eval_trg_expected': trg_filename[1] + ".eval.expected",
'train_src_expected': src_filename[1] + ".train.expected",
'train_trg_expected': trg_filename[1] + ".train.expected"}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
box_config = configure(configuration)
box = initialise(box_config)
output = eval_pipeline(box, box_eval, box_config)
for data_set in ['devel', 'eval', 'train']:
for lang in ['src', 'trg']:
filename = output[data_set + '_' + lang + '_filename']
filename_expected = box_eval[data_set + '_' + lang + '_expected']
thelp.diff(filename_expected, filename)
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line(range(50)))
thelp.cat(box_eval['trg_filename'], _line(range(50)))
#expected output:
thelp.cat(box_eval['devel_src_expected'], _line(range(0,13)))
thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13)))
thelp.cat(box_eval['eval_src_expected'], _line(range(13,20)))
thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20)))
thelp.cat(box_eval['train_src_expected'], _line(range(20,50)))
thelp.cat(box_eval['train_trg_expected'], _line(range(20,50)))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()
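
A worked example matching the self-test above: given 50 parallel segments with development_data_size = 13 and evaluation_data_size = 7, the component writes segments 0-12 to the devel pair, segments 13-19 to the eval pair, and the remaining 30 segments to the train pair (a size of -1 means "copy whatever is left"). The output names follow _make_split_filename, for example:

_make_split_filename('corpus.en', 'devel')   # -> 'corpus.devel.en'
_make_split_filename('corpus.en', 'train')   # -> 'corpus.train.en'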


@@ -0,0 +1,50 @@
tok0
tok0 tok1
tok0 tok1 tok2
tok0 tok1 tok2 tok3
tok0 tok1 tok2 tok3 tok4
tok0 tok1 tok2 tok3 tok4 tok5
tok0 tok1 tok2 tok3 tok4 tok5 tok6
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 tok48


@@ -0,0 +1,50 @@
tok0
tok0 tok1
tok0 tok1 tok2
tok0 tok1 tok2 tok3
tok0 tok1 tok2 tok3 tok4
tok0 tok1 tok2 tok3 tok4 tok5
tok0 tok1 tok2 tok3 tok4 tok5 tok6
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 tok48


@@ -3,20 +3,31 @@ import shutil
import subprocess
import tempfile
from pypeline.helpers.helpers import cons_function_component
def get_name():
return 'irstlm_build'
def get_inputs():
return ['input_filename']
def get_outputs():
return ['add_start_end_filename', 'lm_filename', 'compiled_lm_filename']
def get_configuration():
return ['irstlm_installation_dir', 'irstlm_smoothing_method', 'language_model_directory']
def configure(args):
config = dict()
config['irstlm_install_directory'] = args['irstlm_installation_dir']
config['smoothing_method'] = args['irstlm_smoothing_method']
config['lm_directory'] = args['language_model_directory']
return config
return config
def initialise(config):
def process(a, s):
# Create the LM directory if we need to
-        if os.path.exists(s['lm_directory']) is False:
-            os.makedirs(s['lm_directory'])
+        if os.path.exists(config['lm_directory']) is False:
+            os.makedirs(config['lm_directory'])
# The filename of the file to chew through
start_end_input_filename = a['input_filename']
@@ -26,18 +37,18 @@ def initialise(config):
# Derive the output file name for the add start-end marker processor
filename_bits = os.path.basename(start_end_input_filename).split(".")
filename_bits[2] = "sb";
-        start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+        start_end_output_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
# Derive the output file name of the LM build
filename_bits[2] = "lm"
-        lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+        lm_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
# Derive the compiled LM file name
filename_bits[2] = "arpa"
-        compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+        compiled_lm_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
# First thing to do is add start and end markers
-        start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")]
+        start_end_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "add-start-end.sh")]
infile = open(start_end_input_filename, 'r')
outfile = open(start_end_output_filename, 'w')
print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline)
@@ -49,11 +60,11 @@ def initialise(config):
# Next build the language model
tmp_dir = tempfile.mkdtemp(dir = "/tmp")
try:
-            build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"),
+            build_lm_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "build-lm.sh"),
"-i", start_end_output_filename,
"-t", tmp_dir,
"-p",
"-s", s['smoothing_method'],
"-s", config['smoothing_method'],
"-o", lm_filename]
print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline)
return_code = subprocess.check_call(build_lm_cmdline)
@@ -65,7 +76,7 @@ def initialise(config):
# Compile the LM
lm_filename = lm_filename + ".gz"
-        compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"),
+        compile_lm_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "compile-lm"),
"--text", "yes",
lm_filename,
compiled_lm_filename]
@@ -86,7 +97,7 @@ def initialise(config):
if __name__ == '__main__':
-    from pypeline.helpers.helpers import eval_pipeline
+    from pypeline.helpers.helpers import eval_pipeline, cons_function_component
lm_dir = os.environ["PWD"]
configuration = {'irstlm_root': os.environ["IRSTLM"],
@@ -95,7 +106,7 @@ if __name__ == '__main__':
component_config = configure(configuration)
component = initialise(component_config)
-    value = eval_pipeline(component,
+    value = eval_pipeline(cons_function_component(component),
{'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
component_config)
target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
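
The substance of these hunks is twofold: the component now reads its settings from the config dictionary closed over by initialise() instead of from the pipeline state s, and the self-test wraps the function returned by initialise() with cons_function_component before evaluating it, suggesting that initialise() now returns a plain (value, state) function rather than a ready-made pypeline component. A sketch of the new self-test pattern, using only the helpers imported in this hunk (the input filename is illustrative):

component_config = configure(configuration)    # plain dictionary of settings
process = initialise(component_config)         # (value, state) -> dictionary
value = eval_pipeline(cons_function_component(process),
                      {'input_filename': 'corpus.tok.en'},
                      component_config)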


@@ -1,25 +1,38 @@
#!/usr/bin/env python
import os
import shutil
import subprocess
import os, shutil, subprocess
def get_name():
return 'mert'
from pypeline.helpers.helpers import cons_function_component
def get_inputs():
return ['evaluation_data_filename', 'trg_language_model_filename',
'trg_language_model_order', 'trg_language_model_type',
'moses_ini_filename']
def get_outputs():
return ['moses_ini_filename']
def get_configuration():
return ['source_language', 'target_language',
'moses_installation_dir', 'mert_working_directory']
def configure(args):
result = {}
-    result['src_lang'] = args['src_lang']
-    result['trg_lang'] = args['trg_lang']
+    result['src_lang'] = args['source_language']
+    result['trg_lang'] = args['target_language']
result['moses_installation_dir'] = args['moses_installation_dir']
result['mert_working_dir'] = args['mert_working_directory']
return result
def initialise(config):
def process(a, s):
-        infilename = os.path.abspath(a['development_data_filename'])
+        infilename = os.path.abspath(a['evaluation_data_filename'])
infilename = ".".join(infilename.split(".")[:-1])
lm_file = os.path.abspath(a['trg_language_model_filename'])
lm_order = int(a['trg_language_model_order'])
lm_type = int(a['trg_language_model_type'])
-        orig_moses_ini = os.path.abspath(a['moses_ini_file'])
+        orig_moses_ini = os.path.abspath(a['moses_ini_filename'])
if not os.path.exists(orig_moses_ini):
raise Exception, "Error: Input moses.ini does not exist"
@@ -57,12 +70,12 @@ def initialise(config):
if not os.path.exists(new_mosesini):
raise Exception, 'Failed MERT'
-        return {'moses_ini_file':new_mosesini}
+        return {'moses_ini_filename' : new_mosesini}
return process
if __name__ == '__main__':
if __name__ == '__main__':
def __test():
configuration = {'src_lang':'en',
'trg_lang':'lt',
@@ -80,4 +93,3 @@ if __name__ == '__main__':
#do some test
__test()
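
Alongside the new get_* interface functions, this hunk renames the component's externally visible names: configuration is now keyed by source_language, target_language, moses_installation_dir and mert_working_directory, and the value dictionary uses evaluation_data_filename and moses_ini_filename in place of development_data_filename and moses_ini_file. A small sketch of the renamed configure() mapping, with values taken from the training pipeline's test configuration elsewhere in this commit (illustrative only):

mert_config = configure({'source_language': 'en',
                         'target_language': 'lt',
                         'moses_installation_dir': '/opt/moses',
                         'mert_working_directory': 'test_data/mert'})
# -> {'src_lang': 'en', 'trg_lang': 'lt',
#     'moses_installation_dir': '/opt/moses',
#     'mert_working_dir': 'test_data/mert'}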


@@ -0,0 +1,103 @@
import os
import shutil
import subprocess
def get_name():
return 'model_training'
def get_inputs():
return ['src_filename', 'trg_filename']
def get_outputs():
return ['moses_ini_filename']
def get_configuration():
return ['source_language', 'target_language',
'moses_installation_dir', 'giza_installation_dir',
'translation_model_directory', 'alignment_method',
'reordering_method']
# Alignment = grow-diag-final-and
# Reordering = msd-bidirectional-fe
def configure(args):
result = {}
result['src_lang'] = args['source_language']
result['trg_lang'] = args['target_language']
result['moses_installation_dir'] = args['moses_installation_dir']
result['external_bin_dir'] = args['giza_installation_dir']
result['model_directory'] = args['translation_model_directory']
result['alignment'] = args['alignment_method']
result['reordering'] = args['reordering_method']
return result
def initialise(config):
def process(a, s):
get_corpora_name_fn = lambda fn: ".".join(os.path.basename(fn).split('.')[:-1])
src_filename = os.path.abspath(a['src_filename'])
trg_filename = os.path.abspath(a['trg_filename'])
src_corpora_name = get_corpora_name_fn(src_filename)
trg_corpora_name = get_corpora_name_fn(trg_filename)
if src_corpora_name != trg_corpora_name:
raise Exception, "Mismatch of source [%s] and target [%s] filename" % (src_filename, trg_filename)
infilename = os.path.abspath(os.path.join(os.path.dirname(src_filename), src_corpora_name))
workdir = os.path.abspath(config['model_directory'])
#simply call the training perl script
#remove the workdir if it is already there
if os.path.exists(workdir):
shutil.rmtree(workdir)
os.makedirs(workdir)
#local vars
train_model_perl = os.path.abspath(os.path.join(config['moses_installation_dir'],
'scripts',
'training',
'train-model.perl'))
src_lang = config['src_lang'].lower()
trg_lang = config['trg_lang'].lower()
external_bin = os.path.abspath(config['external_bin_dir'])
#create a dummy lm file
dummy_lmfile = os.path.join(workdir, 'dummy.lm')
f = open(dummy_lmfile, 'w')
print >> f, "dummy lm file"
f.close()
logfile = os.path.join(workdir, 'log')
#the command
alignment_method = config['alignment']
reordering_method = config['reordering']
cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s ' \
'-f %(src_lang)s -e %(trg_lang)s -alignment %(alignment_method)s ' \
'-reordering %(reordering_method)s -lm 0:5:%(dummy_lmfile)s:0 ' \
'-external-bin-dir %(external_bin)s 2> %(logfile)s'
cmd = cmd % locals()
pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
pipe.wait()
# check the moses ini
mosesini = os.path.join(workdir, 'model', 'moses.ini')
if not os.path.exists(mosesini):
raise Exception, 'Failed training model'
return {'moses_ini_filename' : mosesini}
return process
if __name__ == '__main__':
def __test():
configuration = {'src_lang' : 'en',
'trg_lang' : 'lt',
'moses_installation_dir' : os.environ['MOSES_HOME'],
'giza_installation_dir' : os.environ['GIZA_HOME'],
'translation_model_directory' : 'model-dir'}
values = {'training_data_filename' : '/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
from pypeline.helpers.helpers import run_pipeline
box_config = configure(configuration)
box = initialise(box_config)
print run_pipeline(box, values, None)
#do some test
__test()
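
For reference, the command template above expands to a single train-model.perl invocation; with the settings used in the accompanying test configuration it would look roughly as follows (the corpus and working-directory paths are hypothetical):

# /opt/moses/scripts/training/train-model.perl -root-dir <workdir> \
#     -corpus <corpus-prefix> -f en -e lt \
#     -alignment grow-diag-final-and -reordering msd-bidirectional-fe \
#     -lm 0:5:<workdir>/dummy.lm:0 \
#     -external-bin-dir /opt/moses/giza++-v1.0.7 2> <workdir>/log

The dummy.lm written above exists only so that train-model.perl will run; the actual target language model is produced by the irstlm_build wrapper and wired in further down the pipeline.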

File diff suppressed because it is too large


@@ -0,0 +1,7 @@
[Configuration]
language = en
tokenisation_dir = tokenised
moses_installation_dir = /opt/moses
[Inputs]
filename = test_data/test.en


@@ -0,0 +1,58 @@
import sys, os, subprocess
class BatchTokenizer(object):
def __init__(self, language, working_dir, moses_installation_dir):
        # Ensure the Perl tokenizer exists
self.__tokeniser = os.path.join(moses_installation_dir,
'scripts',
'tokenizer',
'tokenizer.perl')
if not os.path.exists(self.__tokeniser):
raise Exception("Perl tokenizer does not exist at [%s]" % self.__tokeniser)
self.__working_dir = working_dir
if not os.path.exists(self.__working_dir):
os.makedirs(self.__working_dir)
self.__language = language
def tokenise(self, filename):
basefilename = os.path.basename(filename)
bits = basefilename.split(".")
bits.insert(-1, "tok")
basefilename = ".".join(bits)
outfilename = os.path.join(self.__working_dir, basefilename)
cmd = '%s -q -l %s < %s > %s' % (self.__tokeniser, self.__language, filename, outfilename)
pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
pipe.wait()
return outfilename
def get_inputs():
return ['filename']
def get_outputs():
return ['tokenised_filename']
def get_configuration():
return ['language',
'tokenisation_dir',
'moses_installation_dir']
def configure(args):
return {'language' : args['language'],
'tokenisation_dir' : args['tokenisation_dir'],
'moses_installation_dir' : args['moses_installation_dir']}
def initialise(config):
tokenizer = BatchTokenizer(config['language'],
config['tokenisation_dir'],
config['moses_installation_dir'])
def process(a, s):
tokenised_filename = tokenizer.tokenise(a['filename'])
return {'tokenised_filename' : tokenised_filename}
return process
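
A usage sketch for this wrapper, with the values from the neighbouring tokenizer configuration file (illustrative; calling process() actually invokes tokenizer.perl from the configured Moses installation):

config = configure({'language': 'en',
                    'tokenisation_dir': 'tokenised',
                    'moses_installation_dir': '/opt/moses'})
process = initialise(config)
process({'filename': 'test_data/test.en'}, None)
# -> {'tokenised_filename': 'tokenised/test.tok.en'}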

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,20 @@
[Configuration]
source_language = en
target_language = lt
max_segment_length = 20
corpus_development_size = 8000
corpus_evaluation_size = 1000
alignment_method = grow-diag-final-and
reordering_method = msd-bidirectional-fe
smoothing_method = improved-kneser-ney
tokenisation_directory = test_data/tokenisation
translation_model_directory = test_data/model
language_model_directory = test_data/lm
mert_directory = test_data/mert
moses_installation_directory = /opt/moses
giza_installation_directory = /opt/moses/giza++-v1.0.7
irstlm_installation_directory = /opt/moses/irstlm-5.70.04
[Inputs]
src_filename = test_data/cleantrain.en
trg_filename = test_data/cleantrain.lt


@@ -0,0 +1,115 @@
#
# Import all of the components to be composed
#
import components.src_trg_tokeniser as tokeniser
import components.translation_model_training as model_training
import components.wrappers.irstlm_build.irstlm_build as lang_model
import components.wrappers.mert.mert as mert
#
# Component definition
#
# Config: {model_training.max_segment_length,
# model_training.corpus.[development_size|evaluation_size],
# model_training.[src|trg].language,
# model_training.method.[alignment|reordering], {moses_ini_filename,
# model_training.giza.installation, evaluation_data_filename}
# {src_filename, {tokenised_src_filename, model_training.translation_model.dir} |
# trg_filename} tokenised_trg_filename} +-----------------------------------------+ +-------+ | {moses_ini_filename}
# | +-------+ +-------+ +-------+ | +-------+ | tokenised_src_filename -> src_filename, | | Model | V +-------+ |
# V | +--->+ Src/ +--->+ | V | +-->+ tokenised_trg_filename -> trg_filename +-->+ Train +------>+ | +------+ V
# --->+ Split | | Trg | | Merge +--->+ Split | +-----------------------------------------+ +-------+ | Merge +----->+ MERT +--->
# | +--->+ Token +--->+ | | +--\ +------------------------------------------+ +--------+ | | ^ +------+
# +-------+ +-------+ +-------+ +-------+ \->+ tokenised_trg_filename -> input_filename +-->+ IRSTLM +-->+ | |
# Config: {tokeniser.[src|trg].language, +------------------------------------------+ +--------+ ^ +-------+ |
# tokeniser.[src|trg].tokeniser_dir Config: {irstlm_installation_dir::String, | |
# tokeniser.moses.installation} irstlm_smoothing_method::String, | |
# language_model_directory} | |
# | |
# {lm_filename, compiled_lm_filename, add_start_end_filename} |
# |
# {moses_ini_file, evaluation_data_filename, trg_language_model_filename,
# trg_language_model_order, trg_language_model_type}
#
component training_pipeline
inputs src_filename, trg_filename
output moses_ini_filename
configuration source_language,
target_language,
max_segment_length,
corpus_development_size,
corpus_evaluation_size,
alignment_method,
reordering_method,
smoothing_method,
tokenisation_directory,
translation_model_directory,
language_model_directory,
mert_directory,
moses_installation_directory,
giza_installation_directory,
irstlm_installation_directory
declare
tokeniser := new tokeniser with
source_language -> tokeniser.src.language,
target_language -> tokeniser.trg.language,
tokenisation_directory -> tokeniser.src.tokenisation_dir,
tokenisation_directory -> tokeniser.trg.tokenisation_dir,
moses_installation_directory -> tokeniser.moses.installation
model_training := new model_training with
max_segment_length -> model_training.max_segment_length,
corpus_development_size -> model_training.corpus.development_size,
corpus_evaluation_size -> model_training.corpus.evaluation_size,
translation_model_directory -> model_training.translation_model.dir,
alignment_method -> model_training.method.alignment,
reordering_method -> model_training.method.reordering,
source_language -> model_training.src.language,
moses_installation_directory -> model_training.moses.installation,
giza_installation_directory -> model_training.giza.installation,
target_language -> model_training.trg.language
irstlm := new lang_model with
irstlm_installation_directory -> irstlm_installation_dir,
smoothing_method -> irstlm_smoothing_method,
language_model_directory -> language_model_directory
mert := new mert with
source_language -> source_language,
target_language -> target_language,
moses_installation_directory -> moses_installation_dir,
mert_directory -> mert_working_directory
as
# Split and transform the input to the tokeniser component
# Inputs: src_filename, trg_filename
# Outputs: (tokenised_src_filename), (tokenised_trg_filename)
(wire src_filename -> src_filename,
trg_filename -> _ &&&
wire trg_filename -> trg_filename,
src_filename -> _) >>>
tokeniser >>>
# Merge output from tokeniser
# Inputs: (tokenised_src_filename), (tokenised_trg_filename)
# Outputs: tokenised_src_filename, tokenised_trg_filename
merge top[tokenised_src_filename] -> tokenised_src_filename,
bottom[tokenised_trg_filename] -> tokenised_trg_filename >>>
# Train the translation table and target language model
# Inputs: tokenised_src_filename, tokenised_trg_filename
# Outputs: (moses_ini_filename), ('add_start_end_filename', 'lm_filename', 'compiled_lm_filename')
((wire tokenised_src_filename -> src_filename,
tokenised_trg_filename -> trg_filename >>> model_training) &&&
(wire tokenised_trg_filename -> input_filename,
tokenised_src_filename -> _ >>> irstlm)) >>>
# Merge the output from the TT and LM training component
# Inputs: (moses_ini_filename, evaluation_data_filename),
# (compiled_lm_filename, add_start_end_filename, lm_filename)
# Outputs: moses_ini_filename, evaluation_data_filename,
# trg_language_model_filename, trg_language_model_order, trg_language_model_type
merge top[moses_ini_filename] -> moses_ini_filename,
top[evaluation_data_filename] -> evaluation_data_filename,
bottom[compiled_lm_filename] -> trg_language_model_filename,
bottom[add_start_end_filename] -> _,
bottom[lm_filename] -> _,
3 -> trg_language_model_order,
9 -> trg_language_model_type >>>
mert
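
The final merge fixes trg_language_model_order and trg_language_model_type to the literals 3 and 9, the same values that the removed hand-wired pipeline manager hard-codes when wiring up its MERT stage. Roughly, the record handed to the mert component therefore looks like this (the filenames are illustrative placeholders):

{'moses_ini_filename': '<translation model directory>/model/moses.ini',
 'evaluation_data_filename': '<tokenised, cleaned, split evaluation source file>',
 'trg_language_model_filename': '<compiled IRSTLM language model>',
 'trg_language_model_order': 3,
 'trg_language_model_type': 9}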

@@ -1 +0,0 @@
Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb


@@ -1,192 +0,0 @@
import logging
import os
from concurrent.futures import Future, ThreadPoolExecutor
from functools import partial
from pypeline.helpers.parallel_helpers import eval_pipeline, \
cons_function_component, \
cons_wire, \
cons_split_wire, \
cons_unsplit_wire, \
cons_dictionary_wire
#
# Some logging please
#
FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s'
logging.basicConfig(format = FORMAT, level = logging.DEBUG)
logger = logging.getLogger("manager")
# Build the pipeline components
def build_components(components, configuration, executor):
pipeline_components = dict()
pipeline_configuration = dict()
for component_id, module_name in components.items():
logger.info("Loading [%s] component from [%s]..." % (component_id, module_name))
module = __import__(module_name, fromlist = ['configure', 'initialise'])
# Component builds its own configuration object
config_func = getattr(module, 'configure')
component_config = config_func(configuration)
pipeline_configuration.update(component_config)
# Now build the component
init_func = getattr(module, 'initialise')
component_function = init_func(component_config)
# A wrapper for the component's function that submits to the executor
def get_component_function_wrapper(inner_function, comp_id, mod_name):
def component_function_wrapper(a, s):
logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \
(comp_id, mod_name, a, s))
return inner_function(a, s)
return component_function_wrapper
# Arrowize the component
component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name))
# And store
pipeline_components[component_id] = component
return pipeline_components, pipeline_configuration
# Go!
def main(src_lang, trg_lang, src_filename, trg_filename):
# Global configuration
# One day, this configuration shall be constructed from
# command line options, or a properties file.
configuration = {
'moses_installation_dir': os.environ['MOSES_HOME'],
'irstlm_installation_dir': os.environ['IRSTLM'],
'giza_installation_dir': os.environ['GIZA_HOME'],
'src_lang': src_lang,
'src_tokenisation_dir': './tokenisation',
'trg_lang': trg_lang,
'trg_tokenisation_dir': './tokenisation',
'segment_length_limit': 60,
'irstlm_smoothing_method': 'improved-kneser-ney',
'language_model_directory': './language-model',
'translation_model_directory': './translation-model',
'mert_working_directory': './mert',
'evaluation_data_size': 100,
'development_data_size': 100
}
# The modules to load
    # In the future, the components shall be specified in some kind of
# pipeline description file.
component_modules = {
'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
'cleanup': 'training.components.cleanup.cleanup',
'data_split': 'training.components.data_split.data_split',
'irstlm_build': 'training.components.irstlm_build.irstlm_build',
'model_training': 'training.components.model_training.model_training',
'mert': 'training.components.mert.mert'
}
# The thread pool
executor = ThreadPoolExecutor(max_workers = 3)
# Phew, build the required components
components, component_config = build_components(component_modules, configuration, executor)
#
# Wire up components
# Description of wiring should be, in the future, alongside the component
    # specification in some kind of configuration file. Components shall be
# declared then used, i.e., bind a component instance to a unique component
# identifier, then wire component instances together by identifier.
#
#
# Tokenisation of source and target...
#
# IRSTLM Build components
irstlm_build_component = cons_split_wire() >> \
(cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \
components['irstlm_build']).second() >> \
cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'],
'trg_language_model_filename': b['compiled_lm_filename']})
# The complete tokenisation component
tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \
irstlm_build_component.second() >> \
cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'],
'trg_filename': b['tokenised_trg_filename'],
'trg_language_model_filename': b['trg_language_model_filename']})
#
# Cleanup and Data Spliting...
#
#
# A function that clips off the last '.' delimited string
#
def clip_last_bit(filename):
bn = os.path.basename(filename)
directory = os.path.dirname(filename)
bits = bn.split(".")
bits.pop()
return os.path.join(directory, ".".join(bits))
cleanup_datasplit_component = components['cleanup'] >> \
cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'],
'trg_filename': a['cleaned_trg_filename']}) >> \
components['data_split'] >> \
cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']),
'eval_src_filename': a['eval_src_filename'],
'eval_trg_filename': a['eval_trg_filename']})
#
# Translation model training
#
translation_model_component = cons_split_wire() >> \
components['model_training'].first() >> \
cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
'development_data_filename': b['eval_src_filename']})
#
# The whole pipeline
#
pipeline = tokenisation_component >> \
cons_split_wire() >> \
(cleanup_datasplit_component >> translation_model_component).first() >> \
cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
'development_data_filename': clip_last_bit(t['development_data_filename']),
'trg_language_model_filename': b['trg_language_model_filename'],
'trg_language_model_order': 3,
'trg_language_model_type': 9}) >> \
components['mert']
#
# The input to the pipeline
#
value = {'src_filename': src_filename,
'trg_filename': trg_filename}
#
# Evaluate the pipeline
#
logger.info("Evaluating pipeline with input [%s]..." % value)
new_value = eval_pipeline(executor, pipeline, value, component_config)
#
# Wait for all components to finish
#
executor.shutdown(True)
logger.info("Pipeline evaluated to %s" % new_value)
if __name__ == '__main__':
import sys
main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])

@@ -0,0 +1 @@
Subproject commit c5351ff941aeb9bb37a51f64228fad6f71ce4cab


@@ -1,11 +0,0 @@
import subprocess
def cat(filename, content):
fh = open(filename, "w")
for line in content:
#print(line, file=fh)
print >> fh, line
fh.close()
def diff(filename1, filename2):
subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT)


@@ -1,125 +0,0 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
result = {}
result['segment_length'] = args['segment_length_limit']
return result
def initialise(config):
def _filter(limit, ifh1, ofh1, ifh2, ofh2):
def _short(line):
n = 0
for c in line:
if c == " ":
n += 1
#print(line, ":", n)
return n < limit
for (l1, l2) in zip(ifh1, ifh2):
if _short(l1) and _short(l2):
print >>ofh1, l1,
print >>ofh2, l2,
def _make_cleaned_filename(filename):
bits = filename.split(".")
bits[-1] = "clean"
return ".".join(bits)
def _filter_main(value, config):
limit = config['segment_length']
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
input_src_filename = value['src_filename']
input_trg_filename = value['trg_filename']
print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)
ifh1 = open(input_src_filename, "r")
ifh2 = open(input_trg_filename, "r")
cleaned_src_filename = _make_cleaned_filename(input_src_filename)
cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
ofh1 = open(cleaned_src_filename, "w")
ofh2 = open(cleaned_trg_filename, "w")
_filter(limit, ifh1, ofh1, ifh2, ofh2)
return {'cleaned_src_filename': cleaned_src_filename,
'cleaned_trg_filename': cleaned_trg_filename}
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return _filter_main
if __name__ == '__main__':
import os
import tempfile
import test.test as thelp
from pypeline.helpers.helpers import eval_pipeline
def _test_main():
configuration = {'segment_length_limit': 20}
src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
box_eval = {
'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'cleaned_src_file_expected': src_filename[1] + ".expected",
'cleaned_trg_file_expected': trg_filename[1] + ".expected"
}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
box_config = configure(configuration)
box = initialise(box_config)
output = eval_pipeline(box, box_eval, box_config)
try:
thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
finally:
os.unlink(output['cleaned_src_filename'])
os.unlink(output['cleaned_trg_filename'])
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
#expected output:
thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()


@@ -1,109 +0,0 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
result = {}
result['segment_length'] = args['segment_length_limit']
return result
def initialise(config):
def _filter(limit, ifh1, ofh1, ifh2, ofh2):
def _short(line):
n = 0
for c in line:
if c == " ":
n += 1
#print(line, ":", n)
return n < limit
for (l1, l2) in zip(ifh1, ifh2):
if _short(l1) and _short(l2):
print(l1, end='', file=ofh1)
print(l2, end='', file=ofh2)
def _filter_main(config, value):
limit = config['segment_length']
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
ifh1 = open(value['src_filename'], "r")
ifh2 = open(value['trg_filename'], "r")
ofh1 = open(value['cleaned_src_filename'], "w")
ofh2 = open(value['cleaned_trg_filename'], "w")
_filter(limit, ifh1, ofh1, ifh2, ofh2)
return {'cleaned_src_filename': value['cleaned_src_filename'],
'cleaned_trg_filename': value['cleaned_trg_filename']}
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return cons_function_component(_filter_main)
if __name__ == '__main__':
import os
import tempfile
import training.components.shared.test as thelp
def _test_main():
configuration = {'segment_length_limit': 20}
src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp")
box_eval = {
'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'cleaned_src_filename': src_filename[1] + ".clean",
'cleaned_trg_filename': trg_filename[1] + ".clean",
'cleaned_src_file_expected': src_filename[1] + ".expected",
'cleaned_trg_file_expected': trg_filename[1] + ".expected"
}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
from pypeline.helpers.helpers import run_pipeline
box_config = configure(configuration)
box = initialise(box_config)
run_pipeline(box, box_config, box_eval)
thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename'])
thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename'])
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
#expected output:
thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()


@@ -1,146 +0,0 @@
from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['evaluate_size'] = args['evaluation_data_size']
    result['development_size'] = args['development_data_size']
    return result


def initialise(config):
    def _copy(size, inp, ofh1, ofh2):
        try:
            while size != 0:
                (l1, l2) = inp.next()
                print >>ofh1, l1,
                print >>ofh2, l2,
                size -= 1
        except StopIteration:
            pass

    def _make_split_filename(filename, data_set):
        bits = filename.split(".")
        last = bits.pop()
        lang_code = bits.pop()
        bits.append(last)
        bits.append(data_set)
        bits.append(lang_code)
        new_filename = ".".join(bits)
        return new_filename

    def _splitter_main(value, config):
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            input_src_filename = value['src_filename']
            input_trg_filename = value['trg_filename']

            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")
            inp = iter(zip(ifh1, ifh2))

            result = {}
            for (data_set, size) in [
                ('devel', config['development_size']),
                ('eval', config['evaluate_size']),
                ('train', -1)
            ]:
                output_src_filename = _make_split_filename(input_src_filename, data_set)
                output_trg_filename = _make_split_filename(input_trg_filename, data_set)
                ofh1 = open(output_src_filename, "w")
                ofh2 = open(output_trg_filename, "w")

                _copy(size, inp, ofh1, ofh2)

                result[data_set + '_src_filename'] = output_src_filename
                result[data_set + '_trg_filename'] = output_trg_filename

            return result
        finally:
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return _splitter_main


if __name__ == '__main__':
    import os
    import tempfile
    import test.test as thelp
    from pypeline.helpers.helpers import eval_pipeline

    def _test_main():
        configuration = {
            'evaluation_data_size': 7,
            'development_data_size': 13,
        }

        src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")

        box_eval = {
            'src_filename': src_filename[1],
            'trg_filename': trg_filename[1],
            'devel_src_expected': src_filename[1] + ".devel.expected",
            'devel_trg_expected': trg_filename[1] + ".devel.expected",
            'eval_src_expected': src_filename[1] + ".eval.expected",
            'eval_trg_expected': trg_filename[1] + ".eval.expected",
            'train_src_expected': src_filename[1] + ".train.expected",
            'train_trg_expected': trg_filename[1] + ".train.expected",
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        for data_set in ['devel', 'eval', 'train']:
            for lang in ['src', 'trg']:
                filename = output[data_set + '_' + lang + '_filename']
                filename_expected = box_eval[data_set + '_' + lang + '_expected']
                thelp.diff(filename_expected, filename)

    def _line(line_lengths):
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line(range(50)))
        thelp.cat(box_eval['trg_filename'], _line(range(50)))
        # expected output:
        thelp.cat(box_eval['devel_src_expected'], _line(range(0, 13)))
        thelp.cat(box_eval['devel_trg_expected'], _line(range(0, 13)))
        thelp.cat(box_eval['eval_src_expected'], _line(range(13, 20)))
        thelp.cat(box_eval['eval_trg_expected'], _line(range(13, 20)))
        thelp.cat(box_eval['train_src_expected'], _line(range(20, 50)))
        thelp.cat(box_eval['train_trg_expected'], _line(range(20, 50)))

    def _cleanup_files(box_eval):
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass

    _test_main()

View File

@ -1,72 +0,0 @@
#!/usr/bin/env python

import os, shutil, subprocess

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['trg_lang'] = args['trg_lang']
    result['moses_installation_dir'] = args['moses_installation_dir']
    result['external_bin_dir'] = args['giza_installation_dir']
    result['model_directory'] = args['translation_model_directory']
    return result


def initialise(config):
    def process(a, s):
        infilename = os.path.abspath(a['training_data_filename'])
        workdir = os.path.abspath(config['model_directory'])
        # simply call the training perl script
        # remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        # local vars
        train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl'
        src_lang = config['src_lang'].lower()
        trg_lang = config['trg_lang'].lower()
        external_bin = os.path.abspath(config['external_bin_dir'])

        # create a dummy lm file
        dummy_lmfile = workdir + os.sep + 'dummy.lm'
        f = open(dummy_lmfile, 'w')
        print >> f, "dummy lm file"
        f.close()
        logfile = workdir + os.sep + 'log'

        # the command
        cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s'
        cmd = cmd % locals()

        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()

        # check the moses ini
        mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini'
        if not os.path.exists(mosesini):
            raise Exception, 'Failed training model'

        return {'moses_ini_file': mosesini}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'src_lang': 'en',
                         'trg_lang': 'lt',
                         'moses_installation_dir': os.environ['MOSES_HOME'],
                         'giza_installation_dir': os.environ['GIZA_HOME'],
                         'translation_model_directory': 'model-dir'}
        values = {'training_data_filename': '/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    # do some test
    __test()
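
For clarity, the cmd % locals() interpolation above assembles a single train-model.perl invocation. A minimal standalone sketch, with placeholder paths that are not taken from the component or its test, is:

    # Illustrative only: reproduces the string interpolation performed by process()
    # with made-up paths; it prints the command, it does not run Moses.
    template = ('%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s '
                '-f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and '
                '-reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 '
                '-external-bin-dir %(external_bin)s 2> %(logfile)s')
    print(template % {'train_model_perl': '/opt/moses/scripts/training/train-model.perl',
                      'workdir': '/tmp/model-dir',
                      'infilename': '/tmp/cleantrain',
                      'src_lang': 'en',
                      'trg_lang': 'lt',
                      'dummy_lmfile': '/tmp/model-dir/dummy.lm',
                      'external_bin': '/opt/moses/giza++-v1.0.7',
                      'logfile': '/tmp/model-dir/log'})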

View File

@ -1,43 +0,0 @@
#!/usr/bin/env python

import os

from tokenizer import Tokenizer

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['src_lang'] = args['src_lang']
    result['src_tokenisation_dir'] = args['src_tokenisation_dir']
    result['moses_installation_dir'] = args['moses_installation_dir']
    return result


def initialise(config):
    def process(a, s):
        infilename = a['src_filename']
        outfilename = Tokenizer.batch_tokenise(
            config['src_lang'],
            config['moses_installation_dir'],
            infilename,
            config['src_tokenisation_dir'])
        return {'tokenised_src_filename': outfilename}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'src_lang': 'de',
                         'src_tokenisation_dir': 'tmptok',
                         'moses_installation_dir': os.path.abspath('../../../../')}
        values = {'src_filename': 'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    # do some test
    __test()

View File

@ -1,3 +0,0 @@
asdfweoih
awfwoeijf awefo
what's this

View File

@ -1,36 +0,0 @@
#!/usr/bin/env python

import sys, os, subprocess


class Tokenizer:
    @staticmethod
    def batch_tokenise(lang, mosesdir, infilename, workdir):
        print "Tokenizing [%s] in working directory [%s]..." % (infilename, workdir)
        if not os.path.exists(workdir):
            os.makedirs(workdir)
        tok = Tokenizer(lang, mosesdir)
        basefilename = os.path.basename(infilename)
        outfilename = workdir + os.sep + basefilename + '.tok'
        tok.file_tokenise(infilename, outfilename)
        return outfilename

    def __init__(self, lang, mosesdir):
        self.arrows = None
        self.lang = lang
        # check the perl tokenizer is here
        #path = os.path.dirname(os.path.abspath(__file__))
        path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer'
        self.perltok = path + os.sep + 'tokenizer.perl'
        if not os.path.exists(path):
            raise Exception, "Perl tokenizer does not exist"

    def file_tokenise(self, infilename, outfilename):
        cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename)
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()


if __name__ == '__main__':
    # do some test
    pass
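
A minimal usage sketch of the wrapper above; the Moses installation path and file names are placeholders, not values from the original tests:

    # Hypothetical call: tokenise corpus.en with the tokenizer.perl script found
    # under /opt/moses, writing corpus.en.tok into the tmptok working directory.
    tokenised_path = Tokenizer.batch_tokenise('en', '/opt/moses', 'corpus.en', 'tmptok')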

View File

@ -1,43 +0,0 @@
#!/usr/bin/env python

import os

from tokenizer import Tokenizer

from pypeline.helpers.helpers import cons_function_component


def configure(args):
    result = {}
    result['trg_lang'] = args['trg_lang']
    result['trg_tokenisation_dir'] = args['trg_tokenisation_dir']
    result['moses_installation_dir'] = args['moses_installation_dir']
    return result


def initialise(config):
    def process(a, s):
        infilename = a['trg_filename']
        outfilename = Tokenizer.batch_tokenise(
            config['trg_lang'],
            config['moses_installation_dir'],
            infilename,
            config['trg_tokenisation_dir'])
        return {'tokenised_trg_filename': outfilename}

    return process


if __name__ == '__main__':
    def __test():
        configuration = {'trg_lang': 'de',
                         'trg_tokenisation_dir': 'tmptoktrg',
                         'moses_installation_dir': os.path.abspath('../../../../')}
        values = {'trg_filename': 'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)
        print run_pipeline(box, values, None)

    # do some test
    __test()

View File

@ -43,13 +43,16 @@ int main(int argc, char** argv)
     }
   }
+  bool success = false;
   if(inFilePath.empty()) {
     std::cerr << "processing stdin to " << outFilePath << ".*\n";
-    return LexicalReorderingTableTree::Create(std::cin, outFilePath);
+    success = LexicalReorderingTableTree::Create(std::cin, outFilePath);
   } else {
     std::cerr << "processing " << inFilePath<< " to " << outFilePath << ".*\n";
     InputFileStream file(inFilePath);
-    bool success = LexicalReorderingTableTree::Create(file, outFilePath);
-    return (success ? 0 : 1);
+    success = LexicalReorderingTableTree::Create(file, outFilePath);
   }
+  return (success ? 0 : 1);
 }