diff --git a/.gitmodules b/.gitmodules
index e69de29bb..d3a8cb4da 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "contrib/arrow-pipelines/python/libs/pypeline"]
+	path = contrib/arrow-pipelines/python/libs/pypeline
+	url = git://github.com/ianj-als/pypeline.git
diff --git a/BUILD-INSTRUCTIONS.txt b/BUILD-INSTRUCTIONS.txt
index 318956ccd..3dac64f60 100644
--- a/BUILD-INSTRUCTIONS.txt
+++ b/BUILD-INSTRUCTIONS.txt
@@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES
 Generally, for trouble installing external libraries, you should get support
 directly from the library maker:
 
-Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html
+Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html
 
 IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm
 SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user
diff --git a/NOTICE b/NOTICE
index 7d631cd88..23d8b2ad1 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,3 +1,5 @@
 This code includes data from Daniel Naber's Language Tools (czech abbreviations).
 This code includes data from czech wiktionary (also czech abbreviations).
+
+
diff --git a/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia
new file mode 100644
index 000000000..1d35a1dea
Binary files /dev/null and b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia differ
diff --git a/contrib/arrow-pipelines/python/README b/contrib/arrow-pipelines/python/README
new file mode 100644
index 000000000..e1e12975c
--- /dev/null
+++ b/contrib/arrow-pipelines/python/README
@@ -0,0 +1,32 @@
+Arrow Based Moses Training Pipeline
+===================================
+
+To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following commands:
+
+$ git submodule init
+$ git submodule update
+
+This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline:
+
+$ cd libs/pypeline
+$ python setup.py install
+
+Alternatively, you can add the Pypeline library to your PYTHONPATH environment variable.
+
+This demonstration implements a training pipeline that is shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia.
+
+Three environment variables need to be set before the manager.py script can be run:
+
+ - MOSES_HOME : The directory where Moses has been cloned, or installed,
+ - IRSTLM : The installation directory of your IRSTLM, and
+ - GIZA_HOME : The installation directory of GIZA++.
+
+The manager.py script takes four positional command-line arguments:
+
+ - The source language code,
+ - The target language code,
+ - The source corpus file. This file *must* be cleaned prior to use, and
+ - The target corpus file. This file *must* be cleaned prior to use.
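+
+A typical session first points these three variables at real installations, for example (the paths below are placeholders for your own machine):
+
+$ export MOSES_HOME=/path/to/mosesdecoder
+$ export IRSTLM=/path/to/irstlm
+$ export GIZA_HOME=/path/to/giza-pp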
+ +For example, run the manager.py script with: + +$ python manager.py en lt cleantrain.en cleantrain.lt diff --git a/contrib/arrow-pipelines/python/libs/pypeline b/contrib/arrow-pipelines/python/libs/pypeline new file mode 160000 index 000000000..a7084b686 --- /dev/null +++ b/contrib/arrow-pipelines/python/libs/pypeline @@ -0,0 +1 @@ +Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb diff --git a/contrib/arrow-pipelines/python/manager.py b/contrib/arrow-pipelines/python/manager.py new file mode 100644 index 000000000..1c3ece111 --- /dev/null +++ b/contrib/arrow-pipelines/python/manager.py @@ -0,0 +1,192 @@ +import logging +import os + +from concurrent.futures import Future, ThreadPoolExecutor +from functools import partial +from pypeline.helpers.parallel_helpers import eval_pipeline, \ + cons_function_component, \ + cons_wire, \ + cons_split_wire, \ + cons_unsplit_wire, \ + cons_dictionary_wire + + +# +# Some logging please +# +FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s' +logging.basicConfig(format = FORMAT, level = logging.DEBUG) +logger = logging.getLogger("manager") + + +# Build the pipeline components +def build_components(components, configuration, executor): + pipeline_components = dict() + pipeline_configuration = dict() + + for component_id, module_name in components.items(): + logger.info("Loading [%s] component from [%s]..." % (component_id, module_name)) + + module = __import__(module_name, fromlist = ['configure', 'initialise']) + + # Component builds its own configuration object + config_func = getattr(module, 'configure') + component_config = config_func(configuration) + pipeline_configuration.update(component_config) + + # Now build the component + init_func = getattr(module, 'initialise') + component_function = init_func(component_config) + + # A wrapper for the component's function that submits to the executor + def get_component_function_wrapper(inner_function, comp_id, mod_name): + def component_function_wrapper(a, s): + logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \ + (comp_id, mod_name, a, s)) + return inner_function(a, s) + + return component_function_wrapper + + # Arrowize the component + component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name)) + + # And store + pipeline_components[component_id] = component + + return pipeline_components, pipeline_configuration + + +# Go! +def main(src_lang, trg_lang, src_filename, trg_filename): + # Global configuration + # One day, this configuration shall be constructed from + # command line options, or a properties file. + configuration = { + 'moses_installation_dir': os.environ['MOSES_HOME'], + 'irstlm_installation_dir': os.environ['IRSTLM'], + 'giza_installation_dir': os.environ['GIZA_HOME'], + 'src_lang': src_lang, + 'src_tokenisation_dir': './tokenisation', + 'trg_lang': trg_lang, + 'trg_tokenisation_dir': './tokenisation', + 'segment_length_limit': 60, + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': './language-model', + 'translation_model_directory': './translation-model', + 'mert_working_directory': './mert', + 'evaluation_data_size': 100, + 'development_data_size': 100 + } + + # The modules to load + # In the future, the components shall be specified in some kind + # pipeline description file. 
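+    # Each module named below is expected to expose the two functions used by
+    # build_components() above: configure(args), which returns the component's
+    # configuration dict, and initialise(config), which returns a callable
+    # taking (value, state); that callable is wrapped into an arrow with
+    # cons_function_component.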
+ component_modules = { + 'src_tokenizer': 'training.components.tokenizer.src_tokenizer', + 'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer', + 'cleanup': 'training.components.cleanup.cleanup', + 'data_split': 'training.components.data_split.data_split', + 'irstlm_build': 'training.components.irstlm_build.irstlm_build', + 'model_training': 'training.components.model_training.model_training', + 'mert': 'training.components.mert.mert' + } + + # The thread pool + executor = ThreadPoolExecutor(max_workers = 3) + + # Phew, build the required components + components, component_config = build_components(component_modules, configuration, executor) + + # + # Wire up components + # Description of wiring should be, in the future, alongside the component + # specification in some kind of confuguration file. Components shall be + # declared then used, i.e., bind a component instance to a unique component + # identifier, then wire component instances together by identifier. + # + + # + # Tokenisation of source and target... + # + # IRSTLM Build components + irstlm_build_component = cons_split_wire() >> \ + (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \ + components['irstlm_build']).second() >> \ + cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'], + 'trg_language_model_filename': b['compiled_lm_filename']}) + + # The complete tokenisation component + tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \ + irstlm_build_component.second() >> \ + cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'], + 'trg_filename': b['tokenised_trg_filename'], + 'trg_language_model_filename': b['trg_language_model_filename']}) + + # + # Cleanup and Data Spliting... + # + + # + # A function that clips off the last '.' delimited string + # + def clip_last_bit(filename): + bn = os.path.basename(filename) + directory = os.path.dirname(filename) + bits = bn.split(".") + bits.pop() + return os.path.join(directory, ".".join(bits)) + + cleanup_datasplit_component = components['cleanup'] >> \ + cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'], + 'trg_filename': a['cleaned_trg_filename']}) >> \ + components['data_split'] >> \ + cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']), + 'eval_src_filename': a['eval_src_filename'], + 'eval_trg_filename': a['eval_trg_filename']}) + + # + # Translation model training + # + translation_model_component = cons_split_wire() >> \ + components['model_training'].first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': b['eval_src_filename']}) + + # + # The whole pipeline + # + pipeline = tokenisation_component >> \ + cons_split_wire() >> \ + (cleanup_datasplit_component >> translation_model_component).first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': clip_last_bit(t['development_data_filename']), + 'trg_language_model_filename': b['trg_language_model_filename'], + 'trg_language_model_order': 3, + 'trg_language_model_type': 9}) >> \ + components['mert'] + + + # + # The input to the pipeline + # + value = {'src_filename': src_filename, + 'trg_filename': trg_filename} + + # + # Evaluate the pipeline + # + logger.info("Evaluating pipeline with input [%s]..." 
% value) + new_value = eval_pipeline(executor, pipeline, value, component_config) + + # + # Wait for all components to finish + # + executor.shutdown(True) + + logger.info("Pipeline evaluated to %s" % new_value) + + +if __name__ == '__main__': + import sys + + main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) diff --git a/contrib/arrow-pipelines/python/test/__init__.py b/contrib/arrow-pipelines/python/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/test/test.py b/contrib/arrow-pipelines/python/test/test.py new file mode 100644 index 000000000..628796f7d --- /dev/null +++ b/contrib/arrow-pipelines/python/test/test.py @@ -0,0 +1,11 @@ +import subprocess + +def cat(filename, content): + fh = open(filename, "w") + for line in content: + #print(line, file=fh) + print >> fh, line + fh.close() + +def diff(filename1, filename2): + subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT) diff --git a/contrib/arrow-pipelines/python/training/__init__.py b/contrib/arrow-pipelines/python/training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/__init__.py b/contrib/arrow-pipelines/python/training/components/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py b/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py new file mode 100644 index 000000000..cb2e057ce --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py @@ -0,0 +1,125 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['segment_length'] = args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print >>ofh1, l1, + print >>ofh2, l2, + + def _make_cleaned_filename(filename): + bits = filename.split(".") + bits[-1] = "clean" + return ".".join(bits) + + def _filter_main(value, config): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + input_trg_filename = value['trg_filename'] + + print "Cleanup: Cleaning [%s] and [%s]..." 
% (input_src_filename, input_trg_filename) + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + + cleaned_src_filename = _make_cleaned_filename(input_src_filename) + cleaned_trg_filename = _make_cleaned_filename(input_trg_filename) + ofh1 = open(cleaned_src_filename, "w") + ofh2 = open(cleaned_trg_filename, "w") + + _filter(limit, ifh1, ofh1, ifh2, ofh2) + + return {'cleaned_src_filename': cleaned_src_filename, + 'cleaned_trg_filename': cleaned_trg_filename} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _filter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + try: + thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename']) + finally: + os.unlink(output['cleaned_src_filename']) + os.unlink(output['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py new file mode 100644 index 000000000..27625c612 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py @@ -0,0 +1,109 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['segment_length'] = args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print(l1, end='', file=ofh1) + print(l2, end='', file=ofh2) + + def _filter_main(config, value): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + ifh1 = open(value['src_filename'], "r") + ifh2 = open(value['trg_filename'], "r") + ofh1 = open(value['cleaned_src_filename'], "w") + ofh2 = open(value['cleaned_trg_filename'], "w") + + _filter(limit, ifh1, ofh1, 
ifh2, ofh2) + + return {'cleaned_src_filename': value['cleaned_src_filename'], + 'cleaned_trg_filename': value['cleaned_trg_filename']} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return cons_function_component(_filter_main) + + +if __name__ == '__main__': + import os + import tempfile + import training.components.shared.test as thelp + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_filename': src_filename[1] + ".clean", + 'cleaned_trg_filename': trg_filename[1] + ".clean", + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + + run_pipeline(box, box_config, box_eval) + thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/data_split/__init__.py b/contrib/arrow-pipelines/python/training/components/data_split/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/data_split/data_split.py b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py new file mode 100644 index 000000000..b8469cbf6 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py @@ -0,0 +1,146 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['evaluate_size'] = args['evaluation_data_size'] + result['development_size'] = args['development_data_size'] + return result + +def initialise(config): + + def _copy(size, inp, ofh1, ofh2): + try: + while size != 0: + (l1, l2) = inp.next() + print >>ofh1, l1, + print >>ofh2, l2, + size -= 1 + except StopIteration: + pass + + def _make_split_filename(filename, data_set): + bits = filename.split(".") + last = bits.pop() + lang_code = bits.pop() + + bits.append(last) + bits.append(data_set) + bits.append(lang_code) + + new_filename = ".".join(bits) + return new_filename + + def _splitter_main(value, config): + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + input_trg_filename = value['trg_filename'] + + ifh1 = open(input_src_filename, "r") + ifh2 = 
open(input_trg_filename, "r") + inp = iter(zip(ifh1, ifh2)) + + result = {} + for (data_set, size) in [ + ('devel', config['development_size']), + ('eval', config['evaluate_size']), + ('train', -1) + ]: + output_src_filename = _make_split_filename(input_src_filename, data_set) + output_trg_filename = _make_split_filename(input_trg_filename, data_set) + ofh1 = open(output_src_filename, "w") + ofh2 = open(output_trg_filename, "w") + + _copy(size, inp, ofh1, ofh2) + result[data_set + '_src_filename'] = output_src_filename + result[data_set + '_trg_filename'] = output_trg_filename + + return result + + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _splitter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = { + 'evaluation_data_size': 7, + 'development_data_size': 13, + } + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'devel_src_expected': src_filename[1] + ".devel.expected", + 'devel_trg_expected': trg_filename[1] + ".devel.expected", + 'eval_src_expected': src_filename[1] + ".eval.expected", + 'eval_trg_expected': trg_filename[1] + ".eval.expected", + 'train_src_expected': src_filename[1] + ".train.expected", + 'train_trg_expected': trg_filename[1] + ".train.expected", + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + for data_set in ['devel', 'eval', 'train']: + for lang in ['src', 'trg']: + filename = output[data_set + '_' + lang + '_filename'] + filename_expected = box_eval[data_set + '_' + lang + '_expected'] + thelp.diff(filename_expected, filename) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line(range(50))) + thelp.cat(box_eval['trg_filename'], _line(range(50))) + #expected output: + thelp.cat(box_eval['devel_src_expected'], _line(range(0,13))) + thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13))) + thelp.cat(box_eval['eval_src_expected'], _line(range(13,20))) + thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20))) + thelp.cat(box_eval['train_src_expected'], _line(range(20,50))) + thelp.cat(box_eval['train_trg_expected'], _line(range(20,50))) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py new file mode 100644 index 000000000..f65d61973 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py @@ -0,0 +1,106 @@ +import 
os +import shutil +import subprocess +import tempfile + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + config = dict() + config['irstlm_install_directory'] = args['irstlm_installation_dir'] + config['smoothing_method'] = args['irstlm_smoothing_method'] + config['lm_directory'] = args['language_model_directory'] + return config + +def initialise(config): + def process(a, s): + # Create the LM directory if we need to + if os.path.exists(s['lm_directory']) is False: + os.makedirs(s['lm_directory']) + + # The filename of the file to chew through + start_end_input_filename = a['input_filename'] + if os.path.exists(start_end_input_filename) is False: + raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename) + + # Derive the output file name for the add start-end marker processor + filename_bits = os.path.basename(start_end_input_filename).split(".") + filename_bits[2] = "sb"; + start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the output file name of the LM build + filename_bits[2] = "lm" + lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the compiled LM file name + filename_bits[2] = "arpa" + compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # First thing to do is add start and end markers + start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")] + infile = open(start_end_input_filename, 'r') + outfile = open(start_end_output_filename, 'w') + print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline) + return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile) + if return_code: + raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \ + start_end_input_filename, start_end_output_filename, return_code) + + # Next build the language model + tmp_dir = tempfile.mkdtemp(dir = "/tmp") + try: + build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"), + "-i", start_end_output_filename, + "-t", tmp_dir, + "-p", + "-s", s['smoothing_method'], + "-o", lm_filename] + print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline) + return_code = subprocess.check_call(build_lm_cmdline) + if return_code: + raise Exception("IRST language model failed to build: return code = [%d]" % return_code) + finally: + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + # Compile the LM + lm_filename = lm_filename + ".gz" + compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"), + "--text", "yes", + lm_filename, + compiled_lm_filename] + print "IRSTLM Build: Invoking [%s]..." 
% " ".join(compile_lm_cmdline) + return_code = subprocess.check_call(compile_lm_cmdline) + if return_code: + raise Exception("IRST language model compilation failed: return code = [%d]" % return_code) + + output = {'add_start_end_filename': start_end_output_filename, + 'lm_filename': lm_filename, + 'compiled_lm_filename': compiled_lm_filename} + + print "IRSTLM Build: Output = %s" % output + + return output + + return process + + +if __name__ == '__main__': + from pypeline.helpers.helpers import eval_pipeline + + lm_dir = os.environ["PWD"] + configuration = {'irstlm_root': os.environ["IRSTLM"], + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': lm_dir} + component_config = configure(configuration) + component = initialise(component_config) + + value = eval_pipeline(component, + {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'}, + component_config) + target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'), + 'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'), + 'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')} + print "Target: %s" % target + if value != target: + raise Exception("Massive fail!") diff --git a/contrib/arrow-pipelines/python/training/components/mert/__init__.py b/contrib/arrow-pipelines/python/training/components/mert/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/mert/mert.py b/contrib/arrow-pipelines/python/training/components/mert/mert.py new file mode 100755 index 000000000..2b60b1720 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/mert/mert.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['mert_working_dir'] = args['mert_working_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['development_data_filename']) + lm_file = os.path.abspath(a['trg_language_model_filename']) + lm_order = int(a['trg_language_model_order']) + lm_type = int(a['trg_language_model_type']) + orig_moses_ini = os.path.abspath(a['moses_ini_file']) + + if not os.path.exists(orig_moses_ini): + raise Exception, "Error: Input moses.ini does not exist" + + workdir = os.path.abspath(config['mert_working_dir']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + moses_install_dir = os.path.abspath(config['moses_installation_dir']) + mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl') + bin_dir = os.path.join(moses_install_dir, 'bin') + moses_bin = os.path.join(moses_install_dir, 'bin', 'moses') + src_file = infilename + '.' + config['src_lang'] + ref_file = infilename + '.' 
+ config['trg_lang'] + logfile = os.path.join(workdir, 'log') + #change lm configuration in moses ini + moses_ini = os.path.join(workdir, 'trained-moses.ini') + cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s" + cmd = cmd % locals() + os.system(cmd) + + #the command + cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s' + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + new_mosesini = os.path.join(workdir, 'moses.ini') + if not os.path.exists(new_mosesini): + raise Exception, 'Failed MERT' + + return {'moses_ini_file':new_mosesini} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.path.abspath('../../../../'), + 'mert_working_dir':'../../../../../tuning'} + values = {'development_data_filename':'../../../../../corpus/tune', + 'moses_ini_file':'../../../../../model/model/moses.ini', + 'trg_language_model_filename':'../../../../../corpus/train.lt.lm', + 'trg_language_model_type':9, + 'trg_language_model_order':4} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/model_training/__init__.py b/contrib/arrow-pipelines/python/training/components/model_training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/model_training/model_training.py b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py new file mode 100755 index 000000000..e990307d2 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['external_bin_dir'] = args['giza_installation_dir'] + result['model_directory'] = args['translation_model_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['training_data_filename']) + workdir = os.path.abspath(config['model_directory']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl' + src_lang = config['src_lang'].lower() + trg_lang = config['trg_lang'].lower() + external_bin = os.path.abspath(config['external_bin_dir']) + #create a dummy lm file + dummy_lmfile = workdir + os.sep + 'dummy.lm' + f = open(dummy_lmfile, 'w') + print >> f, "dummy lm file" + f.close() + logfile = workdir + os.sep + 'log' + + #the command + cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 
0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s' + + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini' + if not os.path.exists(mosesini): + raise Exception, 'Failed training model' + + return {'moses_ini_file':mosesini} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.environ['MOSES_HOME'], + 'giza_installation_dir':os.environ['GIZA_HOME'], + 'translation_model_directory':'model-dir'} + values = {'training_data_filename':'/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py b/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py new file mode 100755 index 000000000..57f8771df --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['src_tokenisation_dir'] = args['src_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['src_filename'] + outfilename = Tokenizer.batch_tokenise( + config['src_lang'], + config['moses_installation_dir'], + infilename, + config['src_tokenisation_dir']) + return {'tokenised_src_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'de', + 'src_tokenisation_dir':'tmptok', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'src_filename':'tmp.de'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de new file mode 100644 index 000000000..c6b41edbe --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de @@ -0,0 +1,3 @@ +asdfweoih +awfwoeijf awefo +what's this diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py new file mode 100644 index 000000000..354ec1abc --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +import sys, os, subprocess + +class Tokenizer: + + @staticmethod + def batch_tokenise(lang, mosesdir, infilename, workdir): + print "Tokenizing [%s] in working directory [%s]..." 
% (infilename, workdir) + if not os.path.exists(workdir): + os.makedirs(workdir) + tok = Tokenizer(lang, mosesdir) + basefilename = os.path.basename(infilename) + outfilename = workdir + os.sep + basefilename + '.tok' + tok.file_tokenise(infilename, outfilename) + return outfilename + + def __init__(self, lang, mosesdir): + self.arrows = None + self.lang = lang + #check the perl tokenizer is here + #path = os.path.dirname(os.path.abspath(__file__)) + path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer' + self.perltok = path + os.sep + 'tokenizer.perl' + if not os.path.exists(path): + raise Exception, "Perl tokenizer does not exists" + + def file_tokenise(self, infilename, outfilename): + cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename) + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + +if __name__ == '__main__': + #do some test + pass + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py new file mode 100755 index 000000000..3852e296f --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['trg_lang'] = args['trg_lang'] + result['trg_tokenisation_dir'] = args['trg_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['trg_filename'] + outfilename = Tokenizer.batch_tokenise( + config['trg_lang'], + config['moses_installation_dir'], + infilename, + config['trg_tokenisation_dir']) + return {'tokenised_trg_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'trg_lang':'de', + 'trg_tokenisation_dir':'tmptoktrg', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'trg_filename':'tmp.de'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject index e135b8886..f551380fd 100644 --- a/contrib/other-builds/OnDiskPt/.cproject +++ b/contrib/other-builds/OnDiskPt/.cproject @@ -24,7 +24,7 @@ - + @@ -133,8 +133,13 @@ - - + + + + + + + diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject index 7529a7799..fc08b4c3d 100644 --- a/contrib/other-builds/extractor/.cproject +++ b/contrib/other-builds/extractor/.cproject @@ -18,11 +18,14 @@ - + @@ -119,5 +122,13 @@ - + + + + + + + + + diff --git a/contrib/other-builds/lm/.cproject b/contrib/other-builds/lm/.cproject index 2036e6b18..e3e47fd7e 100644 --- a/contrib/other-builds/lm/.cproject +++ b/contrib/other-builds/lm/.cproject @@ -24,7 +24,7 @@ - + @@ -131,7 +131,14 @@ - + + + + + + + + diff --git a/contrib/other-builds/lm/.project b/contrib/other-builds/lm/.project index e75388ac1..a1bde37c2 100644 --- a/contrib/other-builds/lm/.project +++ b/contrib/other-builds/lm/.project @@ -141,11 +141,6 @@ 1 PARENT-3-PROJECT_LOC/lm/build_binary - - build_binary.cc - 1 - PARENT-3-PROJECT_LOC/lm/build_binary.cc - clean.sh 1 @@ -176,11 +171,6 @@ 1 
PARENT-3-PROJECT_LOC/lm/facade.hh - - fragment.cc - 1 - PARENT-3-PROJECT_LOC/lm/fragment.cc - left.hh 1 @@ -211,11 +201,6 @@ 1 PARENT-3-PROJECT_LOC/lm/lm_exception.hh - - max_order.cc - 1 - PARENT-3-PROJECT_LOC/lm/max_order.cc - max_order.hh 1 @@ -241,11 +226,6 @@ 1 PARENT-3-PROJECT_LOC/lm/model_type.hh - - ngram_query.cc - 1 - PARENT-3-PROJECT_LOC/lm/ngram_query.cc - ngram_query.hh 1 diff --git a/contrib/other-builds/mert_lib/.cproject b/contrib/other-builds/mert_lib/.cproject index 41a471cd1..e1c19b822 100644 --- a/contrib/other-builds/mert_lib/.cproject +++ b/contrib/other-builds/mert_lib/.cproject @@ -7,7 +7,7 @@ - + @@ -23,13 +23,14 @@ - + @@ -45,11 +46,8 @@ - - - - + @@ -61,7 +59,7 @@ - + @@ -119,5 +117,13 @@ - + + + + + + + + + diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject index fedda926b..90a730cf7 100644 --- a/contrib/other-builds/moses-chart-cmd/.cproject +++ b/contrib/other-builds/moses-chart-cmd/.cproject @@ -19,7 +19,7 @@ - + - - + + + + + + + diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject index 10b6784d4..573fe715f 100644 --- a/contrib/other-builds/moses-cmd/.cproject +++ b/contrib/other-builds/moses-cmd/.cproject @@ -19,7 +19,7 @@ - + - - + + + + + + + diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject index e54a1385b..787024533 100644 --- a/contrib/other-builds/moses/.cproject +++ b/contrib/other-builds/moses/.cproject @@ -1,7 +1,5 @@ - - - + @@ -9,7 +7,7 @@ - + @@ -26,7 +24,7 @@ - + - - + + + + + + + + diff --git a/contrib/other-builds/search/.cproject b/contrib/other-builds/search/.cproject index 9ccb8f8e9..2de36fecd 100644 --- a/contrib/other-builds/search/.cproject +++ b/contrib/other-builds/search/.cproject @@ -24,7 +24,7 @@ - + - + + + + + + + + diff --git a/contrib/other-builds/search/.project b/contrib/other-builds/search/.project index efad842ea..95f074aae 100644 --- a/contrib/other-builds/search/.project +++ b/contrib/other-builds/search/.project @@ -156,11 +156,6 @@ 1 PARENT-3-PROJECT_LOC/search/vertex.hh - - vertex_generator.cc - 1 - PARENT-3-PROJECT_LOC/search/vertex_generator.cc - vertex_generator.hh 1 diff --git a/contrib/other-builds/util/.cproject b/contrib/other-builds/util/.cproject index ab37362a4..2fd4d2dfb 100644 --- a/contrib/other-builds/util/.cproject +++ b/contrib/other-builds/util/.cproject @@ -24,7 +24,7 @@ - + @@ -136,8 +136,13 @@ - - + + + + + + + diff --git a/contrib/rpm/README b/contrib/rpm/README new file mode 100644 index 000000000..8ba7ef4da --- /dev/null +++ b/contrib/rpm/README @@ -0,0 +1,42 @@ +Building Moses RPM +================== + +*** WARNING *** +Before completing *any* of the tasks outlined in this README, please commit and push any changes you wish to be included in your installer. +*** WARNING *** + + +Building the RPM SPEC file +-------------------------- + +The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build. This script needs the following information: + + - The Git repository from which an installer will be built, + - The branch in the Git repository to build, and + - The version of the installed Moses distribution. 
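+
+These are given with the -r, -b and -v options respectively; the script's usage message summarises the invocation as:
+
+$ build_source.sh -r [Moses Git repo] -b [Moses Git branch] -v [RPM version]
+
+(A -n flag additionally skips generating the spec file and staging the tarball into $HOME/rpmbuild; -h prints the usage message.)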
+
+For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git):
+
+$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0
+
+This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS.
+
+
+Building the RPM
+----------------
+
+Change directory to $HOME/rpmbuild, and build the binary RPM with:
+
+$ rpmbuild -bb SPECS/moses.spec
+
+This will download IRSTLM v5.70.04 and GIZA++ v2, then build them along with Moses and make the RPM in the directory $HOME/rpmbuild/RPMS/<arch>/moses-<version>-1.<arch>.rpm.
+
+For example, building v1.0 on a 64 bit Intel architecture, the RPM would be called moses-1.0-1.x86_64.rpm.
+
+
+Building a Debian package
+-------------------------
+
+The Alien tool converts RPM packages to Debian packages. If a Debian package is required then follow the instructions on the following web-page:
+
+https://help.ubuntu.com/community/RPM/AlienHowto
diff --git a/contrib/rpm/build_source.sh b/contrib/rpm/build_source.sh
new file mode 100755
index 000000000..d0fac6a33
--- /dev/null
+++ b/contrib/rpm/build_source.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+BRANCH="master"
+declare -i NO_RPM_BUILD=0
+declare -r RPM_VERSION_TAG="___RPM_VERSION__"
+
+function usage() {
+  echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]"
+  exit 1
+}
+
+if [ $# -lt 4 ]; then
+  usage
+fi
+
+while getopts r:b:v:nh OPTION
+do
+  case "$OPTION" in
+    r) REPO="${OPTARG}";;
+    b) BRANCH="${OPTARG}";;
+    v) VERSION="${OPTARG}";;
+    n) NO_RPM_BUILD=1;;
+    [h\?]) usage;;
+  esac
+done
+
+if [ ! -d ./rpmbuild ]; then
+  echo "RPM build directory not in current working directory"
+  exit 1
+fi
+
+declare -r MOSES_DIR="moses-${VERSION}"
+git clone ${REPO} ${MOSES_DIR}
+if [ $? -ne 0 ]; then
+  echo "Failed to clone Git repository ${REPO}"
+  exit 3
+fi
+
+cd ${MOSES_DIR}
+
+git checkout ${BRANCH}
+if [ $? -ne 0 ]; then
+  echo "Failed to checkout branch ${BRANCH}"
+  exit 3
+fi
+
+cd ..
+
+tar -cf moses-${VERSION}.tar ${MOSES_DIR}
+gzip -f9 moses-${VERSION}.tar
+
+if [ ${NO_RPM_BUILD} -eq 0 ]; then
+  if [ ! -d ${HOME}/rpmbuild/SPECS ]; then
+    mkdir -p ${HOME}/rpmbuild/SPECS
+  fi
+  eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
+  if [ ! -d ${HOME}/rpmbuild/SOURCES ]; then
+    mkdir -p ${HOME}/rpmbuild/SOURCES
+  fi
+  mv moses-${VERSION}.tar.gz ${HOME}/rpmbuild/SOURCES
+fi
+
+rm -Rf ${MOSES_DIR}
diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec
new file mode 100644
index 000000000..0f4a6c6ec
--- /dev/null
+++ b/contrib/rpm/rpmbuild/SPECS/moses.spec
@@ -0,0 +1,65 @@
+Name: moses
+Summary: Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair.
+Version: ___RPM_VERSION__
+Release: 1
+URL: http://www.statmt.org/moses/
+Source0: %{name}-%{version}.tar.gz
+License: LGPL
+Group: Development/Tools
+Vendor: Capita Translation and Interpreting
+Packager: Ian Johnson
+Requires: boost >= 1.48, python >= 2.6, perl >= 5
+BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release}
+%description
+Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus).
An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices. +%prep +%setup -q + +mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 + +wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz +wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz + +cd $RPM_BUILD_DIR + +tar -zxf irstlm-5.70.04.tgz +tar -zxf giza-pp-v1.0.7.tgz + +cd irstlm-5.70.04 +bash regenerate-makefiles.sh --force +./configure --prefix $RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 +make +make install + +cd ../giza-pp +make +cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 +%build +./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2 +%install +mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts +cp -R bin $RPM_BUILD_ROOT/opt/moses +cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts +%clean +%files +%defattr(-,root,root) +/opt/moses/bin/* +/opt/moses/scripts/analysis/* +/opt/moses/scripts/ems/* +/opt/moses/scripts/generic/* +/opt/moses/scripts/other/* +/opt/moses/scripts/recaser/* +/opt/moses/scripts/regression-testing/* +/opt/moses/scripts/share/* +/opt/moses/scripts/tokenizer/* +/opt/moses/scripts/training/* +/opt/moses/irstlm-5.70.04/* +/opt/moses/giza++-v1.0.7/* diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp index 09e06fcf6..b65873881 100644 --- a/moses-chart-cmd/IOWrapper.cpp +++ b/moses-chart-cmd/IOWrapper.cpp @@ -620,12 +620,29 @@ void IOWrapper::FixPrecision(std::ostream &stream, size_t size) template void ShiftOffsets(vector &offsets, T shift) { + T currPos = shift; for (size_t i = 0; i < offsets.size(); ++i) { - shift += offsets[i]; - offsets[i] += shift; + if (offsets[i] == 0) { + offsets[i] = currPos; + ++currPos; + } + else { + currPos += offsets[i]; + } } } +size_t CalcSourceSize(const Moses::ChartHypothesis *hypo) +{ + size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered(); + const std::vector &prevHypos = hypo->GetPrevHypos(); + for (size_t i = 0; i < prevHypos.size(); ++i) { + size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered(); + ret -= (childSize - 1); + } + return ret; +} + size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget) { const ChartHypothesis *hypo = &node.GetHypothesis(); @@ -635,7 +652,11 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT const TargetPhrase &tp = hypo->GetCurrTargetPhrase(); - vector sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0); + size_t thisSourceSize = CalcSourceSize(hypo); + + // position of each terminal word in translation rule, irrespective of alignment + // if non-term, number is undefined + vector sourceOffsets(thisSourceSize, 0); vector targetOffsets(tp.GetSize(), 0); const ChartTrellisNode::NodeChildren &prevNodes = 
node.GetChildren(); @@ -655,11 +676,12 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT const ChartTrellisNode &prevNode = *prevNodes[sourceInd]; - // 1st. calc source size + // calc source size size_t sourceSize = prevNode.GetHypothesis().GetCurrSourceRange().GetNumWordsCovered(); sourceOffsets[sourcePos] = sourceSize; - // 2nd. calc target size. Recursively look thru child hypos + // calc target size. + // Recursively look thru child hypos size_t currStartTarget = startTarget + totalTargetSize; size_t targetSize = OutputAlignmentNBest(retAlign, prevNode, currStartTarget); targetOffsets[targetPos] = targetSize; @@ -672,27 +694,26 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT } } - // 3rd. shift offsets + // convert position within translation rule to absolute position within + // source sentence / output sentence ShiftOffsets(sourceOffsets, startSource); ShiftOffsets(targetOffsets, startTarget); // get alignments from this hypo - vector< set > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered()); const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm(); - OutputAlignment(retAlignmentsS2T, aiTerm); // add to output arg, offsetting by source & target - for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) { - const set &targets = retAlignmentsS2T[source]; - set::const_iterator iter; - for (iter = targets.begin(); iter != targets.end(); ++iter) { - size_t target = *iter; - pair alignPoint(source + sourceOffsets[source] - ,target + targetOffsets[target]); - pair ret = retAlign.insert(alignPoint); - CHECK(ret.second); + AlignmentInfo::const_iterator iter; + for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) { + const std::pair &align = *iter; + size_t relSource = align.first; + size_t relTarget = align.second; + size_t absSource = sourceOffsets[relSource]; + size_t absTarget = targetOffsets[relTarget]; - } + pair alignPoint(absSource, absTarget); + pair ret = retAlign.insert(alignPoint); + CHECK(ret.second); } return totalTargetSize; @@ -702,14 +723,16 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe { ostringstream out; - Alignments retAlign; - OutputAlignment(retAlign, hypo, 0); + if (hypo) { + Alignments retAlign; + OutputAlignment(retAlign, hypo, 0); - // output alignments - Alignments::const_iterator iter; - for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) { - const pair &alignPoint = *iter; - out << alignPoint.first << "-" << alignPoint.second << " "; + // output alignments + Alignments::const_iterator iter; + for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) { + const pair &alignPoint = *iter; + out << alignPoint.first << "-" << alignPoint.second << " "; + } } out << endl; @@ -723,7 +746,11 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth const TargetPhrase &tp = hypo->GetCurrTargetPhrase(); - vector sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0); + size_t thisSourceSize = CalcSourceSize(hypo); + + // position of each terminal word in translation rule, irrespective of alignment + // if non-term, number is undefined + vector sourceOffsets(thisSourceSize, 0); vector targetOffsets(tp.GetSize(), 0); const vector &prevHypos = hypo->GetPrevHypos(); @@ -743,11 +770,12 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth const ChartHypothesis *prevHypo = prevHypos[sourceInd]; - // 1st. 
calc source size + // calc source size size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered(); sourceOffsets[sourcePos] = sourceSize; - // 2nd. calc target size. Recursively look thru child hypos + // calc target size. + // Recursively look thru child hypos size_t currStartTarget = startTarget + totalTargetSize; size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget); targetOffsets[targetPos] = targetSize; @@ -760,27 +788,27 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth } } - // 3rd. shift offsets + // convert position within translation rule to absolute position within + // source sentence / output sentence ShiftOffsets(sourceOffsets, startSource); ShiftOffsets(targetOffsets, startTarget); // get alignments from this hypo - vector< set > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered()); const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm(); - OutputAlignment(retAlignmentsS2T, aiTerm); // add to output arg, offsetting by source & target - for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) { - const set &targets = retAlignmentsS2T[source]; - set::const_iterator iter; - for (iter = targets.begin(); iter != targets.end(); ++iter) { - size_t target = *iter; - pair alignPoint(source + sourceOffsets[source] - ,target + targetOffsets[target]); - pair ret = retAlign.insert(alignPoint); - CHECK(ret.second); + AlignmentInfo::const_iterator iter; + for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) { + const std::pair &align = *iter; + size_t relSource = align.first; + size_t relTarget = align.second; + size_t absSource = sourceOffsets[relSource]; + size_t absTarget = targetOffsets[relTarget]; + + pair alignPoint(absSource, absTarget); + pair ret = retAlign.insert(alignPoint); + CHECK(ret.second); - } } return totalTargetSize; diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp index f11516839..2da30f380 100644 --- a/moses-cmd/IOWrapper.cpp +++ b/moses-cmd/IOWrapper.cpp @@ -189,6 +189,15 @@ InputType*IOWrapper::GetInput(InputType* inputType) } } + ofstream* IOWrapper::GetOutputSearchGraphHypergraphWeightsStream() { + const StaticData &staticData = StaticData::Instance(); + stringstream fileName; + fileName << staticData.GetParam("output-search-graph-hypergraph")[1]; + std::ofstream *file = new std::ofstream; + file->open(fileName.str().c_str()); + return file; + } + /*** * print surface factor only for the given phrase */ @@ -262,6 +271,19 @@ void OutputAlignment(ostream &out, const vector &edges) out << std::endl; } +void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo) +{ + std::vector edges; + const Hypothesis *currentHypo = hypo; + while (currentHypo) { + edges.push_back(currentHypo); + currentHypo = currentHypo->GetPrevHypo(); + } + + OutputAlignment(out, edges); + +} + void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector &edges) { ostringstream out; diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h index 8f164dfb3..267a3a0bc 100644 --- a/moses-cmd/IOWrapper.h +++ b/moses-cmd/IOWrapper.h @@ -117,6 +117,8 @@ public: return *m_outputSearchGraphStream; } + std::ofstream *GetOutputSearchGraphHypergraphWeightsStream(); + std::ostream &GetDetailedTranslationReportingStream() { assert (m_detailedTranslationReportingStream); return *m_detailedTranslationReportingStream; @@ -137,7 +139,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool void OutputInput(std::ostream& os, const 
Moses::Hypothesis* hypo); void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo); void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path); - +void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo); } diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index ac4527aae..117cac3f9 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -83,14 +83,18 @@ public: OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector, OutputCollector* detailedTranslationCollector, OutputCollector* alignmentInfoCollector, - OutputCollector* unknownsCollector) : + OutputCollector* unknownsCollector, + bool outputSearchGraphSLF, + bool outputSearchGraphHypergraph) : m_source(source), m_lineNumber(lineNumber), m_outputCollector(outputCollector), m_nbestCollector(nbestCollector), m_latticeSamplesCollector(latticeSamplesCollector), m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector), m_detailedTranslationCollector(detailedTranslationCollector), m_alignmentInfoCollector(alignmentInfoCollector), - m_unknownsCollector(unknownsCollector) {} + m_unknownsCollector(unknownsCollector), + m_outputSearchGraphSLF(outputSearchGraphSLF), + m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {} /** Translate one sentence * gets called by main function implemented at end of this source file */ @@ -143,6 +147,42 @@ public: #endif } + // Output search graph in HTK standard lattice format (SLF) + if (m_outputSearchGraphSLF) { + stringstream fileName; + fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf"; + std::ofstream *file = new std::ofstream; + file->open(fileName.str().c_str()); + if (file->is_open() && file->good()) { + ostringstream out; + fix(out,PRECISION); + manager.OutputSearchGraphAsSLF(m_lineNumber, out); + *file << out.str(); + file -> flush(); + } else { + TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); + } + } + + // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder + if (m_outputSearchGraphHypergraph) { + stringstream fileName; + fileName << staticData.GetParam("output-search-graph-hypergraph")[0] << "/" << m_lineNumber; + std::ofstream *file = new std::ofstream; + file->open(fileName.str().c_str()); + if (file->is_open() && file->good()) { + ostringstream out; + fix(out,PRECISION); + manager.OutputSearchGraphAsHypergraph(m_lineNumber, out); + *file << out.str(); + file -> flush(); + } else { + TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); + } + file -> close(); + delete file; + } + // apply decision rule and output best translation(s) if (m_outputCollector) { ostringstream out; @@ -157,7 +197,7 @@ public: // MAP decoding: best hypothesis const Hypothesis* bestHypo = NULL; if (!staticData.UseMBR()) - { + { bestHypo = manager.GetBestHypothesis(); if (bestHypo) { if (staticData.IsPathRecoveryEnabled()) { @@ -174,13 +214,18 @@ public: staticData.GetOutputFactorOrder(), staticData.GetReportSegmentation(), staticData.GetReportAllFactors()); + if (staticData.PrintAlignmentInfo()) { + out << "||| "; + OutputAlignment(out, bestHypo); + } + OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo); IFVERBOSE(1) { debug << "BEST TRANSLATION: " << *bestHypo << endl; 
} } out << endl; - } + } // MBR decoding (n-best MBR, lattice MBR, consensus) else @@ -311,6 +356,8 @@ private: OutputCollector* m_detailedTranslationCollector; OutputCollector* m_alignmentInfoCollector; OutputCollector* m_unknownsCollector; + bool m_outputSearchGraphSLF; + bool m_outputSearchGraphHypergraph; std::ofstream *m_alignmentStream; @@ -367,6 +414,63 @@ static void ShowWeights() } +size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) +{ + size_t numScoreComps = ff->GetNumScoreComponents(); + if (numScoreComps != ScoreProducer::unlimited) { + vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + if (numScoreComps > 1) { + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() + << i + << "=" << values[i] << endl; + } + } else { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() + << "=" << values[0] << endl; + } + return index+numScoreComps; + } else { + cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl; + assert(false); + return 0; + } +} + +void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector& slf =system.GetStatelessFeatureFunctions(); + const vector& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream); + } + } + const vector& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i& gds = system.GetGenerationDictionaries(); + for( size_t i=0; iLoadParam(argc,argv)) { + Parameter params; + if (!params.LoadParam(argc,argv)) { exit(1); } // initialize all "global" variables, which are stored in StaticData // note: this also loads models such as the language model, etc. 
- if (!StaticData::LoadDataStatic(params, argv[0])) { + if (!StaticData::LoadDataStatic(&params, argv[0])) { exit(1); } // setting "-show-weights" -> just dump out weights and exit - if (params->isParamSpecified("show-weights")) { + if (params.isParamSpecified("show-weights")) { ShowWeights(); exit(0); } @@ -430,6 +534,14 @@ int main(int argc, char** argv) TRACE_ERR(weights); TRACE_ERR("\n"); } + if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 1) { + ofstream* weightsOut = ioWrapper->GetOutputSearchGraphHypergraphWeightsStream(); + OutputFeatureWeightsForHypergraph(*weightsOut); + weightsOut->flush(); + weightsOut->close(); + delete weightsOut; + } + // initialize output streams // note: we can't just write to STDOUT or files @@ -533,7 +645,9 @@ int main(int argc, char** argv) searchGraphCollector.get(), detailedTranslationCollector.get(), alignmentInfoCollector.get(), - unknownsCollector.get() ); + unknownsCollector.get(), + staticData.GetOutputSearchGraphSLF(), + staticData.GetOutputSearchGraphHypergraph()); // execute task #ifdef WITH_THREADS pool.Submit(task); @@ -551,6 +665,8 @@ int main(int argc, char** argv) pool.Stop(true); //flush remaining jobs #endif + delete ioWrapper; + } catch (const std::exception &e) { std::cerr << "Exception: " << e.what() << std::endl; return EXIT_FAILURE; diff --git a/moses/AlignmentInfoCollection.cpp b/moses/AlignmentInfoCollection.cpp index 5daba9ba1..53b83d8cd 100644 --- a/moses/AlignmentInfoCollection.cpp +++ b/moses/AlignmentInfoCollection.cpp @@ -30,6 +30,9 @@ AlignmentInfoCollection::AlignmentInfoCollection() m_emptyAlignmentInfo = Add(pairs); } +AlignmentInfoCollection::~AlignmentInfoCollection() +{} + const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const { return *m_emptyAlignmentInfo; diff --git a/moses/AlignmentInfoCollection.h b/moses/AlignmentInfoCollection.h index 9c7f75e13..de0949f8f 100644 --- a/moses/AlignmentInfoCollection.h +++ b/moses/AlignmentInfoCollection.h @@ -55,6 +55,7 @@ class AlignmentInfoCollection //! Only a single static variable should be created.
AlignmentInfoCollection(); + ~AlignmentInfoCollection(); static AlignmentInfoCollection s_instance; diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp index 506193d5b..5bd3a4e2b 100644 --- a/moses/Hypothesis.cpp +++ b/moses/Hypothesis.cpp @@ -462,7 +462,7 @@ void Hypothesis::CleanupArcList() */ const StaticData &staticData = StaticData::Instance(); size_t nBestSize = staticData.GetNBestSize(); - bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.UseLatticeMBR() ; + bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ; if (!distinctNBest && m_arcList->size() > nBestSize * 5) { // prune arc list only if there too many arcs diff --git a/moses/LM/SingleFactor.cpp b/moses/LM/SingleFactor.cpp index 3418aefe2..c061d0fed 100644 --- a/moses/LM/SingleFactor.cpp +++ b/moses/LM/SingleFactor.cpp @@ -36,8 +36,9 @@ using namespace std; namespace Moses { -LanguageModelSingleFactor::~LanguageModelSingleFactor() {} - +LanguageModelSingleFactor::~LanguageModelSingleFactor() +{ +} struct PointerState : public FFState { const void* lmstate; @@ -58,7 +59,11 @@ LanguageModelPointerState::LanguageModelPointerState() m_beginSentenceState = new PointerState(NULL); } -LanguageModelPointerState::~LanguageModelPointerState() {} +LanguageModelPointerState::~LanguageModelPointerState() +{ + delete m_nullContextState; + delete m_beginSentenceState; +} const FFState *LanguageModelPointerState::GetNullContextState() const { diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 468db0de3..011187cda 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -26,8 +26,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #endif #include -#include #include +#include +#include +#include #include "Manager.h" #include "TypeDef.h" #include "Util.h" @@ -46,17 +48,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "rule.pb.h" #endif +#include "util/exception.hh" + using namespace std; namespace Moses { Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system) - :m_lineNumber(lineNumber) - ,m_system(system) + :m_system(system) ,m_transOptColl(source.CreateTranslationOptionCollection(system)) ,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl)) ,interrupted_flag(0) ,m_hypoId(0) + ,m_lineNumber(lineNumber) ,m_source(source) { m_system->InitializeBeforeSentenceProcessing(source); @@ -628,6 +632,420 @@ void Manager::GetSearchGraph(vector& searchGraph) const } +void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector& slf =system.GetStatelessFeatureFunctions(); + const vector& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" 
&& + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, slf[i], outputSearchGraphStream); + } + } + const vector& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i& gds = system.GetGenerationDictionaries(); + for( size_t i=0; iGetScoreBreakdown(); + // outputSearchGraphStream << scoreCollection << endl; + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector& slf =system.GetStatelessFeatureFunctions(); + const vector& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, slf[i], outputSearchGraphStream); + } + } + const vector& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i& gds = system.GetGenerationDictionaries(); + for( size_t i=0; i& slf =system.GetStatelessFeatureFunctions(); + const vector& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, slf[i], outputSearchGraphStream); + } + } + const vector& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i& gds = system.GetGenerationDictionaries(); + for( size_t i=0; iGetNumScoreComponents(); + if (numScoreComps != ScoreProducer::unlimited) { + vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << "# " << ff->GetScoreProducerDescription() + << " " << ff->GetScoreProducerWeightShortName() + << " " << (i+1) << " of " << numScoreComps << endl + << "x" << (index+i) << "scale=" << values[i] << endl; + } + return index+numScoreComps; + } else { + cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl; + assert(false); + return 0; + } +} + +size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const +{ + + // { const FeatureFunction* sp = ff; + // const FVector& m_scores = scoreCollection.GetScoresVector(); + // FVector& scores = const_cast(m_scores); + // std::string prefix = sp->GetScoreProducerDescription() + FName::SEP; + // // std::cout << "prefix==" << prefix << endl; + // // cout << "m_scores==" << m_scores << endl; + // // cout << "m_scores.size()==" << m_scores.size() << endl; + // // cout << "m_scores.coreSize()==" << m_scores.coreSize() << endl; + // // cout << "m_scores.cbegin() ?= m_scores.cend()\t" << (m_scores.cbegin() == m_scores.cend()) << 
endl; + + + // // for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) { + // // std::cout<first) << "\t" << (i->second) << std::endl; + // // } + // for(int i=0, n=v.size(); iGetScoreBreakdown(); + + vector featureValues = scoreCollection.GetScoresForProducer(ff); + size_t numScoreComps = featureValues.size();//featureValues.coreSize(); + // if (numScoreComps != ScoreProducer::unlimited) { + // vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 0.0 : featureValues[i]) << " "; + } + return index+numScoreComps; + // } else { + // cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl; + // assert(false); + // return 0; + // } +} + +size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const +{ + + ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown(); + const Hypothesis *prevHypo = hypo->GetPrevHypo(); + if (prevHypo) { + scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() ); + } + vector featureValues = scoreCollection.GetScoresForProducer(ff); + size_t numScoreComps = featureValues.size(); + + if (numScoreComps > 1) { + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << i << "=" << featureValues[i] << " "; + } + } else { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << "=" << featureValues[0] << " "; + } + + return index+numScoreComps; +} + +/**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */ +void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const +{ + vector searchGraph; + GetSearchGraph(searchGraph); + + map mosesIDToHypergraphID; + // map hypergraphIDToMosesID; + set terminalNodes; + multimap hypergraphIDToArcs; + + long numNodes = 0; + long endNode = 0; + { + long hypergraphHypothesisID = 0; + for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) { + + // Get an id number for the previous hypothesis + const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo(); + if (prevHypo!=NULL) { + int mosesPrevHypothesisID = prevHypo->GetId(); + if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) { + mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID; + // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID; + hypergraphHypothesisID += 1; + } + } + + // Get an id number for this hypothesis + int mosesHypothesisID; + if (searchGraph[arcNumber].recombinationHypo) { + mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId(); + } else { + mosesHypothesisID = searchGraph[arcNumber].hypo->GetId(); + } + + if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) { + + mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID; + // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID; + + bool terminalNode = (searchGraph[arcNumber].forward == -1); + if (terminalNode) { + // Final arc to end node, representing the end of the sentence + terminalNodes.insert(hypergraphHypothesisID); + } + + hypergraphHypothesisID += 1; + } + + // Record that this arc ends at this node + hypergraphIDToArcs.insert(pair(mosesIDToHypergraphID[mosesHypothesisID],arcNumber)); + + } + + // Unique end 
node + endNode = hypergraphHypothesisID; + // mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID; + numNodes = endNode + 1; + + } + + + long numArcs = searchGraph.size() + terminalNodes.size(); + + // Print number of nodes and arcs + outputSearchGraphStream << numNodes << " " << numArcs << endl; + + for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) { + // int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID]; + size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID); + if (count > 0) { + outputSearchGraphStream << count << endl; + + pair::iterator, multimap::iterator> range = + hypergraphIDToArcs.equal_range(hypergraphHypothesisID); + for (multimap::iterator it=range.first; it!=range.second; ++it) { + int lineNumber = (*it).second; + const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; + int mosesHypothesisID;// = thisHypo->GetId(); + if (searchGraph[lineNumber].recombinationHypo) { + mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId(); + } else { + mosesHypothesisID = searchGraph[lineNumber].hypo->GetId(); + } + // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID]; + UTIL_THROW_IF( + (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]), + util::Exception, + "Error while writing search lattice as hypergraph for sentence " << translationId << ". " << + "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID << + ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] << + ". There are " << numNodes << " nodes in the search lattice." + ); + + const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); + if (prevHypo==NULL) { + outputSearchGraphStream << " ||| " << endl; + } else { + int startNode = mosesIDToHypergraphID[prevHypo->GetId()]; + + UTIL_THROW_IF( + (startNode >= hypergraphHypothesisID), + util::Exception, + "Error while writing search lattice as hypergraph for sentence" << translationId << ". " << + "The nodes must be output in topological order. The code attempted to violate this restriction." + ); + + const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase(); + int targetWordCount = targetPhrase.GetSize(); + + outputSearchGraphStream << "[" << startNode << "]"; + for (int targetWordIndex=0; targetWordIndex + outputSearchGraphStream << terminalNodes.size() << endl; + for (set::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) { + outputSearchGraphStream << "[" << (*it) << "] ||| " << endl; + } + +} + + +/**! 
Output search graph in HTK standard lattice format (SLF) */ +void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const +{ + + vector searchGraph; + GetSearchGraph(searchGraph); + + long numArcs = 0; + long numNodes = 0; + + map nodes; + set terminalNodes; + + // Unique start node + nodes[0] = 0; + + for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) { + + int targetWordCount = searchGraph[arcNumber].hypo->GetCurrTargetPhrase().GetSize(); + numArcs += targetWordCount; + + int hypothesisID = searchGraph[arcNumber].hypo->GetId(); + if (nodes.count(hypothesisID) == 0) { + + numNodes += targetWordCount; + nodes[hypothesisID] = numNodes; + //numNodes += 1; + + bool terminalNode = (searchGraph[arcNumber].forward == -1); + if (terminalNode) { + numArcs += 1; + } + } + + } + numNodes += 1; + + // Unique end node + nodes[numNodes] = numNodes; + + outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl; + outputSearchGraphStream << "VERSION=1.1" << endl; + outputSearchGraphStream << "base=2.71828182845905" << endl; + outputSearchGraphStream << "NODES=" << (numNodes+1) << endl; + outputSearchGraphStream << "LINKS=" << numArcs << endl; + + OutputFeatureWeightsForSLF(outputSearchGraphStream); + + for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) { + const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; + const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); + if (prevHypo) { + + int startNode = nodes[prevHypo->GetId()]; + int endNode = nodes[thisHypo->GetId()]; + bool terminalNode = (searchGraph[lineNumber].forward == -1); + const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase(); + int targetWordCount = targetPhrase.GetSize(); + + for (int targetWordIndex=0; targetWordIndexGetParam("sort-word-alignment").size()) { - m_wordAlignmentSort = (WordAlignmentSort) Scan(m_parameter->GetParam("sort-word-alignment")[0]); - } - // factor delimiter if (m_parameter->GetParam("factor-delimiter").size() > 0) { m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0]; @@ -175,6 +171,16 @@ bool StaticData::LoadData(Parameter *parameter) SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false ); //word-to-word alignment + // alignments + SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false ); + if (m_PrintAlignmentInfo) { + m_needAlignmentInfo = true; + } + + if(m_parameter->GetParam("sort-word-alignment").size()) { + m_wordAlignmentSort = (WordAlignmentSort) Scan(m_parameter->GetParam("sort-word-alignment")[0]); + } + SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false ); if (m_PrintAlignmentInfoNbest) { m_needAlignmentInfo = true; @@ -235,8 +241,19 @@ bool StaticData::LoadData(Parameter *parameter) } m_outputSearchGraph = true; m_outputSearchGraphExtended = true; - } else + } else { m_outputSearchGraph = false; + } + if (m_parameter->GetParam("output-search-graph-slf").size() > 0) { + m_outputSearchGraphSLF = true; + } else { + m_outputSearchGraphSLF = false; + } + if (m_parameter->GetParam("output-search-graph-hypergraph").size() > 0) { + m_outputSearchGraphHypergraph = true; + } else { + m_outputSearchGraphHypergraph = false; + } #ifdef HAVE_PROTOBUF if (m_parameter->GetParam("output-search-graph-pb").size() > 0) { if (m_parameter->GetParam("output-search-graph-pb").size() != 1) { diff --git a/moses/StaticData.h b/moses/StaticData.h index 448f1a4e7..20d36e4b8 100644 --- a/moses/StaticData.h +++ 
b/moses/StaticData.h @@ -171,6 +171,7 @@ protected: bool m_reportAllFactorsNBest; std::string m_detailedTranslationReportingFilePath; bool m_onlyDistinctNBest; + bool m_PrintAlignmentInfo; bool m_needAlignmentInfo; bool m_PrintAlignmentInfoNbest; @@ -216,6 +217,8 @@ protected: bool m_outputWordGraph; //! whether to output word graph bool m_outputSearchGraph; //! whether to output search graph bool m_outputSearchGraphExtended; //! ... in extended format + bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF) + bool m_outputSearchGraphHypergraph; //! whether to output search graph in hypergraph #ifdef HAVE_PROTOBUF bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf #endif @@ -458,7 +461,7 @@ public: return m_nBestFilePath; } bool IsNBestEnabled() const { - return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty() + return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_outputSearchGraphSLF || m_outputSearchGraphHypergraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty() #ifdef HAVE_PROTOBUF || m_outputSearchGraphPB #endif @@ -631,6 +634,12 @@ public: bool GetOutputSearchGraphExtended() const { return m_outputSearchGraphExtended; } + bool GetOutputSearchGraphSLF() const { + return m_outputSearchGraphSLF; + } + bool GetOutputSearchGraphHypergraph() const { + return m_outputSearchGraphHypergraph; + } #ifdef HAVE_PROTOBUF bool GetOutputSearchGraphPB() const { return m_outputSearchGraphPB; @@ -722,6 +731,9 @@ public: const std::string &GetAlignmentOutputFile() const { return m_alignmentOutputFile; } + bool PrintAlignmentInfo() const { + return m_PrintAlignmentInfo; + } bool PrintAlignmentInfoInNbest() const { return m_PrintAlignmentInfoNbest; } diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index 70de9678b..fd33907de 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC if (kneserNeyFlag) { float D = kneserNey_D3; if (countEF < 2) D = kneserNey_D1; - if (countEF < 3) D = kneserNey_D2; + else if (countEF < 3) D = kneserNey_D2; if (D > countEF) D = countEF - 0.01; // sanity constraint float p_b_E = n1_E / totalCount; // target phrase prob based on distinct diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index 92c8a470e..cab91e92d 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) { if (m_options.isOrientationFlag()) outextractstrOrientation << orientationInfo; + if (m_options.isIncludeSentenceIdFlag()) { + outextractstr << " ||| " << sentence.sentenceID; + } + if (m_options.getInstanceWeightsFile().length()) { if (m_options.isTranslationFlag()) { outextractstr << " ||| " << sentence.weightString; @@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) { } } - if (m_options.isIncludeSentenceIdFlag()) { - outextractstr << " ||| " << sentence.sentenceID; - } if (m_options.isTranslationFlag()) outextractstr << "\n"; if (m_options.isTranslationFlag()) outextractstrInv << "\n"; diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index e941aa95b..4ef6a1de6 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -13,10 
+13,10 @@ chomp(@OUT); while() { chomp; if (/^) { my $line = shift(@OUT); $line = "" if $line =~ /NO BEST TRANSLATION/; if (/<\/seg>/) { - s/(]+> *).*(<\/seg>)/$1$line$2/; + s/(]+> *).*(<\/seg>)/$1$line$2/i; } else { - s/(]+> *)[^<]*/$1$line/; + s/(]+> *)[^<]*/$1$line/i; } } print $_."\n"; diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index 8f82ab8d9..beca70eb0 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -16,15 +16,15 @@ $HELP = 1 unless &GetOptions('corpus=s' => \$CORPUS, 'model=s' => \$MODEL, 'filler=s' => \$FILLER, - 'factored' => \$FACTORED, + 'factored' => \$FACTORED, 'min-size=i' => \$MIN_SIZE, 'min-count=i' => \$MIN_COUNT, 'max-count=i' => \$MAX_COUNT, 'help' => \$HELP, 'verbose' => \$VERBOSE, - 'syntax' => \$SYNTAX, - 'binarize' => \$BINARIZE, - 'mark-split' => \$MARK_SPLIT, + 'syntax' => \$SYNTAX, + 'binarize' => \$BINARIZE, + 'mark-split' => \$MARK_SPLIT, 'train' => \$TRAIN); if ($HELP || @@ -155,34 +155,37 @@ sub apply { next if defined($COUNT{$lc}) && $COUNT{$lc} > $count; $COUNT{$lc} = $count; $TRUECASE{$lc} = $factored_word; - $LABEL{$lc} = $label if $SYNTAX; + $LABEL{$lc} = $label if $SYNTAX; } close(MODEL); while() { my $first = 1; chop; s/\s+/ /g; s/^ //; s/ $//; - my @BUFFER; # for xml tags + my @BUFFER; # for xml tags foreach my $factored_word (split) { print " " unless $first; $first = 0; - # syntax: don't split xml - if ($SYNTAX && ($factored_word =~ /^$/)) { - push @BUFFER,$factored_word; - $first = 1; - next; - } - - # get case class - my $word = $factored_word; - $word =~ s/\|.+//g; # just first factor - my $lc = lc($word); - + # syntax: don't split xml + if ($SYNTAX && ($factored_word =~ /^$/)) { + push @BUFFER,$factored_word; + $first = 1; + next; + } + + # get case class + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + my $lc = lc($word); + + print STDERR "considering $word ($lc)...\n" if $VERBOSE; # don't split frequent words - if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) { - print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) || + $lc !~ /[a-zA-Z]/) {; # has to have at least one letter + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer print $factored_word; + print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE; next; } diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl index 879212e6e..f1f8f9ef6 100755 --- a/scripts/generic/mteval-v13a.pl +++ b/scripts/generic/mteval-v13a.pl @@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span sub extract_sgml_tag_attribute { my ($name, $data) = @_; - ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : (); + ($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? 
($1) : (); } ################################# diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl index 49c89c299..012c143ac 100755 --- a/scripts/recaser/detruecase.perl +++ b/scripts/recaser/detruecase.perl @@ -6,11 +6,12 @@ use Getopt::Long "GetOptions"; binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); - -my ($SRC,$INFILE); +my ($SRC,$INFILE,$UNBUFFERED); die("detruecase.perl < in > out") unless &GetOptions('headline=s' => \$SRC, - 'in=s' => \$INFILE); + 'in=s' => \$INFILE, + 'b|unbuffered' => \$UNBUFFERED); +if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"""=>1,"'"=>1,"["=>1,"]"=>1); diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl index c83c30daa..2858cda61 100755 --- a/scripts/recaser/recase.perl +++ b/scripts/recaser/recase.perl @@ -4,7 +4,7 @@ use strict; use Getopt::Long "GetOptions"; -my ($SRC,$INFILE,$RECASE_MODEL); +my ($SRC,$INFILE,$RECASE_MODEL,$UNBUFFERED); my $MOSES = "moses"; my $LANGUAGE = "en"; # English by default; die("recase.perl --in file --model ini-file > out") @@ -12,9 +12,11 @@ die("recase.perl --in file --model ini-file > out") 'headline=s' => \$SRC, 'lang=s' => \$LANGUAGE, 'moses=s' => \$MOSES, - 'model=s' => \$RECASE_MODEL) + 'model=s' => \$RECASE_MODEL, + 'b|unbuffered' => \$UNBUFFERED) && defined($INFILE) && defined($RECASE_MODEL); +if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } my %treated_languages = map { ($_,1) } qw/en cs/; die "I don't know any rules for $LANGUAGE. Use 'en' as the default." diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index 0e2df27a2..517f5c7a1 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -8,9 +8,11 @@ binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); # apply switches -my $MODEL; -die("truecase.perl --model truecaser < in > out") - unless &GetOptions('model=s' => \$MODEL); +my ($MODEL, $UNBUFFERED); +die("truecase.perl --model MODEL [-b] < in > out") + unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED) + && defined($MODEL); +if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } my (%BEST,%KNOWN); open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index f59cd5f86..986a2dfb5 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -171,7 +171,7 @@ if ($TIMING) # tokenize a batch of texts saved in an array # input: an array containing a batch of texts -# return: another array cotaining a batch of tokenized texts for the input array +# return: another array containing a batch of tokenized texts for the input array sub tokenize_batch { my(@text_list) = @_; diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl index bea32052a..2865fe391 100755 --- a/scripts/training/clean-corpus-n.perl +++ b/scripts/training/clean-corpus-n.perl @@ -47,7 +47,7 @@ my $l1input = "$corpus.$l1"; if (-e $l1input) { $opn = $l1input; } elsif (-e $l1input.".gz") { - $opn = "zcat $l1input.gz |"; + $opn = "gunzip -c $l1input.gz |"; } else { die "Error: $l1input does not exist"; } @@ -57,7 +57,7 @@ my $l2input = "$corpus.$l2"; if (-e $l2input) { $opn = $l2input; } elsif (-e $l2input.".gz") { - $opn = "zcat $l2input.gz |"; + $opn = "gunzip -c $l2input.gz |"; } else { die "Error: $l2input does not exist"; } @@ -160,3 +160,4 @@ sub word_count { my @w = split(/ /,$line); 
return scalar @w; } + diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py index 8bef034de..86c8b300e 100755 --- a/scripts/training/filter-rule-table.py +++ b/scripts/training/filter-rule-table.py @@ -40,7 +40,8 @@ def printUsage(): def main(): parser = optparse.OptionParser() parser.add_option("-c", "--min-non-initial-rule-count", - action="store", dest="minCount", type="int", default="1", + action="store", dest="minCount", + type="float", default="0.0", help="prune non-initial rules where count is below N", metavar="N") (options, args) = parser.parse_args()
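
Note on the word-alignment changes in IOWrapper above: the patch drops the per-rule alignment sets and instead walks the AlignTerm() pairs directly, after ShiftOffsets() has turned the per-position sizes in sourceOffsets/targetOffsets into absolute sentence positions. The standalone C++ sketch below illustrates only that relative-to-absolute conversion; ShiftOffsets here is a simplified analogue written for this note (not the routine in the Moses source), and the toy rule sizes and alignment point are invented for the example.

#include <cstddef>
#include <iostream>
#include <set>
#include <utility>
#include <vector>

// Stand-in for the Alignments typedef used in the patch: a set of (source, target) points.
typedef std::set< std::pair<size_t, size_t> > Alignments;

// Simplified analogue of the offset shifting: turn per-position span sizes
// into absolute start positions, beginning at 'start'.
void ShiftOffsets(std::vector<size_t> &offsets, size_t start)
{
  size_t cumulative = start;
  for (size_t i = 0; i < offsets.size(); ++i) {
    size_t size = offsets[i];
    offsets[i] = cumulative;   // position i of the rule starts here
    cumulative += size;        // the next position starts after this span
  }
}

int main()
{
  // Toy rule covering 3 positions on each side; position 1 is a
  // non-terminal of width 2, the others are single words.
  std::vector<size_t> sourceOffsets;
  sourceOffsets.push_back(1); sourceOffsets.push_back(2); sourceOffsets.push_back(1);
  std::vector<size_t> targetOffsets(sourceOffsets);

  ShiftOffsets(sourceOffsets, 0);  // absolute starts: 0 1 3
  ShiftOffsets(targetOffsets, 0);

  // A terminal alignment point stated relative to the rule...
  std::pair<size_t, size_t> rel(2, 0);

  // ...becomes absolute by looking up the shifted offsets, as in the patch.
  Alignments retAlign;
  retAlign.insert(std::make_pair(sourceOffsets[rel.first], targetOffsets[rel.second]));

  // prints "3-0": rule position 2 starts at absolute source position 3
  std::cout << retAlign.begin()->first << "-" << retAlign.begin()->second << std::endl;
  return 0;
}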
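
Note on the hypergraph output added in Manager.cpp: OutputSearchGraphAsHypergraph numbers hypotheses consecutively in the order they are first seen, checks with UTIL_THROW_IF that this numbering is topological, and adds one extra end node that every terminal arc points to. The sketch below is a hedged, self-contained illustration of that numbering scheme only; the Arc struct and the toy hypothesis ids are invented for this example and stand in for the Moses search-graph entries and their recombination handling.

#include <iostream>
#include <map>
#include <vector>

// Hypothetical, simplified stand-in for a search-graph arc: each arc
// connects a previous hypothesis to the hypothesis it extends.
struct Arc {
  int prevHypoId;   // -1 when the arc starts at the initial (empty) hypothesis
  int hypoId;       // id of the hypothesis this arc ends at
  bool terminal;    // true when the hypothesis covers the whole sentence
  Arc(int p, int h, bool t) : prevHypoId(p), hypoId(h), terminal(t) {}
};

int main()
{
  // Toy search graph, listed so that a hypothesis's predecessor always appears
  // before the hypothesis itself (the stack-ordered Moses graph has this property).
  std::vector<Arc> searchGraph;
  searchGraph.push_back(Arc(-1, 7, false));
  searchGraph.push_back(Arc(-1, 9, false));
  searchGraph.push_back(Arc( 7, 3, true));
  searchGraph.push_back(Arc( 9, 3, true));

  // Assign consecutive hypergraph ids in first-seen order.  With the listing
  // order above, each predecessor is numbered no later than the hypothesis it
  // leads to, so every edge runs from a lower id to a higher id - the
  // topological-order invariant the UTIL_THROW_IF checks assert in the patch.
  std::map<int, int> mosesIDToHypergraphID;
  int nextId = 0;
  size_t numTerminal = 0;
  for (size_t i = 0; i < searchGraph.size(); ++i) {
    const Arc &arc = searchGraph[i];
    if (arc.prevHypoId != -1 && mosesIDToHypergraphID.count(arc.prevHypoId) == 0)
      mosesIDToHypergraphID[arc.prevHypoId] = nextId++;
    if (mosesIDToHypergraphID.count(arc.hypoId) == 0)
      mosesIDToHypergraphID[arc.hypoId] = nextId++;
    if (arc.terminal)
      ++numTerminal;
  }

  // One extra end node collects an arc from every terminal hypothesis.
  int endNode = nextId;
  int numNodes = endNode + 1;
  size_t numArcs = searchGraph.size() + numTerminal;

  std::cout << numNodes << " " << numArcs << std::endl;  // header: node count, arc count
  for (std::map<int, int>::const_iterator it = mosesIDToHypergraphID.begin();
       it != mosesIDToHypergraphID.end(); ++it)
    std::cout << "hypothesis " << it->first << " -> hypergraph node " << it->second << std::endl;
  return 0;
}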
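
Note on the weights file written for the hypergraph output: OutputFeatureWeightsForHypergraph emits one weight per line at fixed precision 6, naming the components of a multi-valued feature shortname0, shortname1, ... and a single-valued feature just shortname. A minimal sketch of that naming convention follows; the feature short names and weight values are made-up examples, and sparse (unlimited-component) features, which the patch rejects with an assertion, are ignored here.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main()
{
  // Hypothetical per-feature weights, keyed by the feature's short name;
  // in Moses these would come from the global weight collection.
  std::vector< std::pair<std::string, std::vector<float> > > features;
  features.push_back(std::make_pair("lm", std::vector<float>(1, 0.5f)));
  std::vector<float> tmWeights(5, 0.2f);
  features.push_back(std::make_pair("tm", tmWeights));
  features.push_back(std::make_pair("d", std::vector<float>(1, 0.3f)));
  features.push_back(std::make_pair("w", std::vector<float>(1, -1.0f)));

  // Same stream formatting as the patch: fixed notation, 6 decimal places.
  std::cout.setf(std::ios::fixed);
  std::cout.precision(6);

  // One line per weight: "name=w" for single-valued features,
  // "name0=w0", "name1=w1", ... for multi-valued ones.
  for (size_t f = 0; f < features.size(); ++f) {
    const std::string &name = features[f].first;
    const std::vector<float> &values = features[f].second;
    if (values.size() > 1) {
      for (size_t i = 0; i < values.size(); ++i)
        std::cout << name << i << "=" << values[i] << std::endl;
    } else {
      std::cout << name << "=" << values[0] << std::endl;
    }
  }
  return 0;
}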