Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Hieu Hoang 2013-03-15 20:38:42 +00:00
commit 18e8f12d5e
65 changed files with 2028 additions and 165 deletions

3
.gitmodules vendored
View File

@ -0,0 +1,3 @@
[submodule "contrib/arrow-pipelines/python/libs/pypeline"]
path = contrib/arrow-pipelines/python/libs/pypeline
url = git://github.com/ianj-als/pypeline.git

View File

@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES
Generally, for trouble installing external libraries, you should get support
directly from the library maker:
Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html
Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html
IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm
SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user

2
NOTICE
View File

@ -1,3 +1,5 @@
This code includes data from Daniel Naber's Language Tools (czech abbreviations).
This code includes data from czech wiktionary (also czech abbreviations).

View File

@ -0,0 +1,32 @@
Arrow Based Moses Training Pipeline
===================================
To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command:
$ git submodule init
$ git submodule update
This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline:
$ cd libs/pypeline
$ python setup.py install
Alternatively, you can set the PYTHONPATH environment variable so that it includes the Pypeline library.
This demonstration implements a training pipeline that is shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia.
Three environment variables must be set before the manager.py script can be run:
- MOSES_HOME : The directory where Moses has been cloned, or installed,
- IRSTLM : The installation directory of your IRSTLM, and
- GIZA_HOME : The installation directory of GIZA++.
The manager.py script takes four positional command-line arguments:
- The source language code,
- The target language code,
- The source corpus file. This file *must* be cleaned prior to use, and
- The target corpus file. This file *must* be cleaned prior to use.
For example, run the manager.py script with:
$ python manager.py en lt cleantrain.en cleantrain.lt

@ -0,0 +1 @@
Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb

View File

@ -0,0 +1,192 @@
import logging
import os
from concurrent.futures import Future, ThreadPoolExecutor
from functools import partial
from pypeline.helpers.parallel_helpers import eval_pipeline, \
cons_function_component, \
cons_wire, \
cons_split_wire, \
cons_unsplit_wire, \
cons_dictionary_wire
#
# Logging setup: every record carries a timestamp, the name of the emitting
# thread and the level, so messages logged from different threads can be
# told apart.
#
FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s'
# DEBUG level on the root logger.
logging.basicConfig(format = FORMAT, level = logging.DEBUG)
# Module-level logger used by build_components() and main().
logger = logging.getLogger("manager")
# Build the pipeline components
def build_components(components, configuration, executor):
    """Load, configure and arrowize every pipeline component.

    `components` maps a component identifier to the dotted name of a module
    that exposes `configure(configuration)` and `initialise(component_config)`.
    `configuration` is the global configuration handed to each `configure`.
    `executor` is accepted but not used by this function.

    Returns a `(pipeline_components, pipeline_configuration)` tuple: the
    first maps each identifier to the component built by
    `cons_function_component`, the second is the union of all per-component
    configurations (later components overwrite clashing keys).
    """
    def _with_logging(inner_function, comp_id, mod_name):
        # The identifiers are bound as arguments so each wrapper reports its
        # own component, not whichever one the loop visited last.
        def component_function_wrapper(a, s):
            logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \
                        (comp_id, mod_name, a, s))
            return inner_function(a, s)
        return component_function_wrapper

    built = {}
    merged_configuration = {}
    for component_id, module_name in components.items():
        logger.info("Loading [%s] component from [%s]..." % (component_id, module_name))
        module = __import__(module_name, fromlist = ['configure', 'initialise'])

        # The component derives its own configuration from the global one.
        component_config = module.configure(configuration)
        merged_configuration.update(component_config)

        # Build the component function, wrap it with logging and arrowize it.
        component_function = module.initialise(component_config)
        built[component_id] = cons_function_component(
            _with_logging(component_function, component_id, module_name))

    return built, merged_configuration
# Go!
# Entry point: builds the global configuration, loads every component module,
# wires the components into one pipeline and evaluates it on the two corpus
# files.
#
# Parameters:
#   src_lang, trg_lang         - stored in the configuration as 'src_lang'
#                                and 'trg_lang'
#   src_filename, trg_filename - corpus files that form the pipeline's input
#                                value
#
# Raises KeyError when MOSES_HOME, IRSTLM or GIZA_HOME is missing from the
# environment, because they are read with os.environ[...].
def main(src_lang, trg_lang, src_filename, trg_filename):
# Global configuration
# One day, this configuration shall be constructed from
# command line options, or a properties file.
configuration = {
'moses_installation_dir': os.environ['MOSES_HOME'],
'irstlm_installation_dir': os.environ['IRSTLM'],
'giza_installation_dir': os.environ['GIZA_HOME'],
'src_lang': src_lang,
'src_tokenisation_dir': './tokenisation',
'trg_lang': trg_lang,
'trg_tokenisation_dir': './tokenisation',
'segment_length_limit': 60,
'irstlm_smoothing_method': 'improved-kneser-ney',
'language_model_directory': './language-model',
'translation_model_directory': './translation-model',
'mert_working_directory': './mert',
'evaluation_data_size': 100,
'development_data_size': 100
}
# The modules to load: component identifier -> dotted module name.
# In the future, the components shall be specified in some kind of
# pipeline description file.
component_modules = {
'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
'cleanup': 'training.components.cleanup.cleanup',
'data_split': 'training.components.data_split.data_split',
'irstlm_build': 'training.components.irstlm_build.irstlm_build',
'model_training': 'training.components.model_training.model_training',
'mert': 'training.components.mert.mert'
}
# The thread pool
executor = ThreadPoolExecutor(max_workers = 3)
# Phew, build the required components
components, component_config = build_components(component_modules, configuration, executor)
#
# Wire up components
# Description of wiring should be, in the future, alongside the component
# specification in some kind of configuration file. Components shall be
# declared then used, i.e., bind a component instance to a unique component
# identifier, then wire component instances together by identifier.
#
# NOTE(review): the meaning of `>>`, `&`, `.first()` and `.second()` below is
# defined by pypeline and is not visible in this file; presumably they are
# arrow composition, fan-out and branch selection - confirm against the
# pypeline documentation.
#
#
# Tokenisation of source and target...
#
# IRSTLM Build components: the tokenised target file name is renamed to
# 'input_filename' for the irstlm_build component, and the compiled LM file
# name is merged back in as 'trg_language_model_filename'.
irstlm_build_component = cons_split_wire() >> \
(cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \
components['irstlm_build']).second() >> \
cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'],
'trg_language_model_filename': b['compiled_lm_filename']})
# The complete tokenisation component: its result carries 'src_filename',
# 'trg_filename' (both tokenised) and 'trg_language_model_filename'.
tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \
irstlm_build_component.second() >> \
cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'],
'trg_filename': b['tokenised_trg_filename'],
'trg_language_model_filename': b['trg_language_model_filename']})
#
# Cleanup and Data Splitting...
#
#
# A function that clips off the last '.' delimited string
# of the file's base name; the directory part is kept unchanged.
#
def clip_last_bit(filename):
bn = os.path.basename(filename)
directory = os.path.dirname(filename)
bits = bn.split(".")
bits.pop()
return os.path.join(directory, ".".join(bits))
# The training file name has its last dot-delimited piece clipped before it
# is passed on as 'training_data_filename'; the evaluation file names are
# forwarded unchanged.
cleanup_datasplit_component = components['cleanup'] >> \
cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'],
'trg_filename': a['cleaned_trg_filename']}) >> \
components['data_split'] >> \
cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']),
'eval_src_filename': a['eval_src_filename'],
'eval_trg_filename': a['eval_trg_filename']})
#
# Translation model training
#
# The evaluation source file name is handed on as
# 'development_data_filename' next to the trained 'moses_ini_file'.
translation_model_component = cons_split_wire() >> \
components['model_training'].first() >> \
cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
'development_data_filename': b['eval_src_filename']})
#
# The whole pipeline
#
# NOTE(review): the language model order (3) and type (9) are hard-coded
# here; their meaning is defined by the consumer of these keys - confirm.
pipeline = tokenisation_component >> \
cons_split_wire() >> \
(cleanup_datasplit_component >> translation_model_component).first() >> \
cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
'development_data_filename': clip_last_bit(t['development_data_filename']),
'trg_language_model_filename': b['trg_language_model_filename'],
'trg_language_model_order': 3,
'trg_language_model_type': 9}) >> \
components['mert']
#
# The input to the pipeline
#
value = {'src_filename': src_filename,
'trg_filename': trg_filename}
#
# Evaluate the pipeline
#
logger.info("Evaluating pipeline with input [%s]..." % value)
new_value = eval_pipeline(executor, pipeline, value, component_config)
#
# Wait for all components to finish
#
executor.shutdown(True)
logger.info("Pipeline evaluated to %s" % new_value)
if __name__ == '__main__':
    import sys

    # Four positional arguments are required. Without this check a missing
    # argument surfaces as a bare IndexError with no hint about usage.
    # Extra arguments are still ignored, as before.
    if len(sys.argv) < 5:
        sys.stderr.write("Usage: %s <src-lang> <trg-lang> <src-corpus> <trg-corpus>\n" % sys.argv[0])
        sys.exit(1)

    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])

View File

@ -0,0 +1,11 @@
import subprocess
def cat(filename, content):
    """Write every item of `content` to `filename`, one item per line.

    The file is created or truncated. Each item is converted with `%s` and
    followed by a newline. An empty `content` produces an empty file.
    """
    # `with` closes the handle even when iterating `content` or a write
    # raises; the previous open()/close() pair leaked it in that case.
    with open(filename, "w") as fh:
        for line in content:
            fh.write("%s\n" % (line,))
def diff(filename1, filename2):
    """Compare two files with the external `diff` program.

    Returns None. `subprocess.check_output` raises
    `subprocess.CalledProcessError` when `diff` exits with a non-zero status;
    the program's output (stdout and stderr combined) is captured and not
    returned.
    """
    command = ["diff", filename1, filename2]
    subprocess.check_output(command, stderr = subprocess.STDOUT)

View File

@ -0,0 +1,125 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Pick this component's settings out of the global configuration.

    Returns a dict whose only key, 'segment_length', holds the value of
    `args['segment_length_limit']`. Raises KeyError if that key is absent.
    """
    return {'segment_length': args['segment_length_limit']}
def initialise(config):
    """Build the function that filters over-long segment pairs from a corpus.

    `config` is not read here. The returned function takes
    `(value, config)`: `value` supplies 'src_filename' and 'trg_filename',
    `config` supplies 'segment_length'. A line pair is kept only when both
    lines contain fewer than 'segment_length' spaces. The function returns a
    dict with 'cleaned_src_filename' and 'cleaned_trg_filename', each being
    the input path with its final extension replaced by ".clean".
    """
    import os
    try:
        # Python 2: the built-in zip() would read both corpora fully into
        # memory before the first pair is examined.
        from itertools import izip as _pairs
    except ImportError:
        _pairs = zip

    def _short(line, limit):
        # Segment length is measured by counting space characters.
        return line.count(" ") < limit

    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        for (l1, l2) in _pairs(ifh1, ifh2):
            if _short(l1, limit) and _short(l2, limit):
                # Lines keep their own line terminator, so write them as-is.
                ofh1.write(l1)
                ofh2.write(l2)

    def _make_cleaned_filename(filename):
        # Only the extension of the file name itself is replaced. Splitting
        # the whole path on "." broke extension-less files located inside a
        # directory whose name contains a dot.
        return os.path.splitext(filename)[0] + ".clean"

    def _filter_main(value, config):
        limit = config['segment_length']
        input_src_filename = value['src_filename']
        input_trg_filename = value['trg_filename']

        print("Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename))

        cleaned_src_filename = _make_cleaned_filename(input_src_filename)
        cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)

        # Every handle is closed on both the normal and the error path.
        with open(input_src_filename, "r") as ifh1:
            with open(input_trg_filename, "r") as ifh2:
                with open(cleaned_src_filename, "w") as ofh1:
                    with open(cleaned_trg_filename, "w") as ofh2:
                        _filter(limit, ifh1, ofh1, ifh2, ofh2)

        return {'cleaned_src_filename': cleaned_src_filename,
                'cleaned_trg_filename': cleaned_trg_filename}

    return _filter_main
if __name__ == '__main__':
    # Self-test: builds a small parallel corpus in /tmp, runs the clean-up
    # component over it and diffs the result against the expected files.
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline

    def _temp_file(suffix):
        # mkstemp() returns an open descriptor as well as the name; only the
        # name is needed, so close the descriptor instead of leaking it.
        fd, name = tempfile.mkstemp(suffix = suffix, dir = "/tmp")
        os.close(fd)
        return name

    def _test_main():
        configuration = {'segment_length_limit': 20}

        src_filename = _temp_file(".src")
        trg_filename = _temp_file(".trg")

        box_eval = {
            'src_filename': src_filename,
            'trg_filename': trg_filename,
            'cleaned_src_file_expected': src_filename + ".expected",
            'cleaned_trg_file_expected': trg_filename + ".expected"
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        try:
            thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
            thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
        finally:
            os.unlink(output['cleaned_src_filename'])
            os.unlink(output['cleaned_trg_filename'])

    def _line(line_lengths):
        # One line per requested length, made of tokens "tok0 tok1 ...".
        def _gen_line(tokens):
            return " ".join(["tok" + str(n) for n in range(tokens)])
        return [_gen_line(length) for length in line_lengths]

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))

        #expected output:
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))

    def _cleanup_files(box_eval):
        # Remove each file independently: one missing file must not stop the
        # others from being deleted, and only file-system errors are ignored.
        for filename in box_eval.values():
            try:
                os.unlink(filename)
            except OSError:
                pass

    _test_main()

View File

@ -0,0 +1,109 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Derive the component configuration from the global configuration.

    The result has a single entry, 'segment_length', copied from
    `args['segment_length_limit']`; a missing key raises KeyError.
    """
    segment_length = args['segment_length_limit']
    return dict(segment_length = segment_length)
def initialise(config):
    """Build the clean-up component for a parallel corpus.

    `config` is not read here. The wrapped function takes `(config, value)`
    in that order: `config['segment_length']` is the limit, and `value`
    names the two input files ('src_filename', 'trg_filename') and the two
    output files ('cleaned_src_filename', 'cleaned_trg_filename'). A line
    pair is copied only when both lines contain fewer than the limit's
    number of spaces. The function returns a dict with the two output file
    names, and is handed to `cons_function_component` before being returned.
    """
    def _is_short(line, limit):
        # Length is measured in space characters.
        return line.count(" ") < limit

    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        for src_line, trg_line in zip(ifh1, ifh2):
            if not (_is_short(src_line, limit) and _is_short(trg_line, limit)):
                continue
            # Lines already end with their own terminator.
            ofh1.write(src_line)
            ofh2.write(trg_line)

    def _filter_main(config, value):
        limit = config['segment_length']
        with open(value['src_filename'], "r") as ifh1, \
             open(value['trg_filename'], "r") as ifh2, \
             open(value['cleaned_src_filename'], "w") as ofh1, \
             open(value['cleaned_trg_filename'], "w") as ofh2:
            _filter(limit, ifh1, ofh1, ifh2, ofh2)
        return {'cleaned_src_filename': value['cleaned_src_filename'],
                'cleaned_trg_filename': value['cleaned_trg_filename']}

    return cons_function_component(_filter_main)
if __name__ == '__main__':
    # Self-test: writes a small parallel corpus to /tmp, runs the clean-up
    # component and diffs its output against the expected files.
    import os
    import tempfile
    import training.components.shared.test as thelp

    def _temp_file(suffix):
        # mkstemp() also returns an open descriptor; close it so that it is
        # not leaked - only the file name is used.
        fd, name = tempfile.mkstemp(suffix = suffix, dir = "/tmp")
        os.close(fd)
        return name

    def _test_main():
        configuration = {'segment_length_limit': 20}

        src_filename = _temp_file("src")
        trg_filename = _temp_file("trg")

        box_eval = {
            'src_filename': src_filename,
            'trg_filename': trg_filename,
            'cleaned_src_filename': src_filename + ".clean",
            'cleaned_trg_filename': trg_filename + ".clean",
            'cleaned_src_file_expected': src_filename + ".expected",
            'cleaned_trg_file_expected': trg_filename + ".expected"
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        box = initialise(box_config)

        run_pipeline(box, box_config, box_eval)
        thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename'])
        thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename'])

    def _line(line_lengths):
        # One line per requested length, made of tokens "tok0 tok1 ...".
        def _gen_line(tokens):
            return " ".join(["tok" + str(n) for n in range(tokens)])
        return [_gen_line(length) for length in line_lengths]

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))

        #expected output:
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))

    def _cleanup_files(box_eval):
        # Delete every file on its own so one failure (for example an output
        # that was never written) does not leave the remaining files behind.
        for filename in box_eval.values():
            try:
                os.unlink(filename)
            except OSError:
                pass

    _test_main()

View File

@ -0,0 +1,146 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Select the data-split settings from the global configuration.

    Returns a dict with 'evaluate_size' (from `args['evaluation_data_size']`)
    and 'development_size' (from `args['development_data_size']`). A missing
    key raises KeyError.
    """
    evaluate_size = args['evaluation_data_size']
    development_size = args['development_data_size']
    return {'evaluate_size': evaluate_size,
            'development_size': development_size}
def initialise(config):
    """Build the function that splits a parallel corpus into three data sets.

    `config` is not read here. The returned function takes `(value, config)`:
    `value` supplies 'src_filename' and 'trg_filename'; `config` supplies
    'development_size' and 'evaluate_size'. The first 'development_size' line
    pairs go to the "devel" files, the next 'evaluate_size' pairs to the
    "eval" files and everything that remains to the "train" files. The
    result maps '<set>_src_filename' / '<set>_trg_filename' to the files
    written, for <set> in devel, eval and train.
    """
    try:
        # Python 2: avoid the built-in zip(), which would load both corpora
        # into memory before anything is written.
        from itertools import izip as _pairs
    except ImportError:
        _pairs = zip

    def _copy(size, inp, ofh1, ofh2):
        # A negative `size` never counts down to zero, so the loop runs until
        # `inp` is exhausted - that is how the "train" set takes the rest.
        try:
            while size != 0:
                (l1, l2) = next(inp)
                ofh1.write(l1)
                ofh2.write(l2)
                size -= 1
        except StopIteration:
            pass

    def _make_split_filename(filename, data_set):
        # "<stem>.<lang>.<last>" becomes "<stem>.<last>.<data_set>.<lang>".
        # The name must contain at least two dots, otherwise pop() fails.
        bits = filename.split(".")
        last = bits.pop()
        lang_code = bits.pop()
        bits.extend([last, data_set, lang_code])
        return ".".join(bits)

    def _splitter_main(value, config):
        input_src_filename = value['src_filename']
        input_trg_filename = value['trg_filename']

        result = {}
        with open(input_src_filename, "r") as ifh1:
            with open(input_trg_filename, "r") as ifh2:
                # A single shared iterator: each data set continues where the
                # previous one stopped.
                inp = iter(_pairs(ifh1, ifh2))
                for (data_set, size) in [
                    ('devel', config['development_size']),
                    ('eval', config['evaluate_size']),
                    ('train', -1)
                ]:
                    output_src_filename = _make_split_filename(input_src_filename, data_set)
                    output_trg_filename = _make_split_filename(input_trg_filename, data_set)
                    # Each output pair is closed before the next is opened.
                    # Previously only the last pair was closed explicitly.
                    with open(output_src_filename, "w") as ofh1:
                        with open(output_trg_filename, "w") as ofh2:
                            _copy(size, inp, ofh1, ofh2)
                    result[data_set + '_src_filename'] = output_src_filename
                    result[data_set + '_trg_filename'] = output_trg_filename

        return result

    return _splitter_main
if __name__ == '__main__':
    # Self-test: writes a 50-line parallel corpus to /tmp, splits it and
    # diffs each data set against the expected files.
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline

    def _temp_file(suffix):
        # mkstemp() also returns an open descriptor; close it so that it is
        # not leaked - only the file name is used.
        fd, name = tempfile.mkstemp(suffix = suffix, dir = "/tmp")
        os.close(fd)
        return name

    def _unlink_quietly(filename):
        try:
            os.unlink(filename)
        except OSError:
            pass

    def _test_main():
        configuration = {
            'evaluation_data_size': 7,
            'development_data_size': 13,
        }

        # The splitter derives output names from the last TWO dot-delimited
        # pieces of the input path, so the temporary names need two
        # extensions. With a single one (".src") the directory part ended up
        # at the tail of the generated name, which is not a usable path.
        src_filename = _temp_file(".src.txt")
        trg_filename = _temp_file(".trg.txt")

        box_eval = {
            'src_filename': src_filename,
            'trg_filename': trg_filename,
            'devel_src_expected': src_filename + ".devel.expected",
            'devel_trg_expected': trg_filename + ".devel.expected",
            'eval_src_expected': src_filename + ".eval.expected",
            'eval_trg_expected': trg_filename + ".eval.expected",
            'train_src_expected': src_filename + ".train.expected",
            'train_trg_expected': trg_filename + ".train.expected",
        }

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        produced = []
        try:
            for data_set in ['devel', 'eval', 'train']:
                for lang in ['src', 'trg']:
                    filename = output[data_set + '_' + lang + '_filename']
                    produced.append(filename)
                    filename_expected = box_eval[data_set + '_' + lang + '_expected']
                    thelp.diff(filename_expected, filename)
        finally:
            # The split files are test artefacts too; do not leave them in
            # /tmp.
            for filename in produced:
                _unlink_quietly(filename)

    def _line(line_lengths):
        # One line per requested length, made of tokens "tok0 tok1 ...".
        def _gen_line(tokens):
            return " ".join(["tok" + str(n) for n in range(tokens)])
        return [_gen_line(length) for length in line_lengths]

    def _prep_files(box_eval):
        thelp.cat(box_eval['src_filename'], _line(range(50)))
        thelp.cat(box_eval['trg_filename'], _line(range(50)))
        #expected output:
        thelp.cat(box_eval['devel_src_expected'], _line(range(0,13)))
        thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13)))
        thelp.cat(box_eval['eval_src_expected'], _line(range(13,20)))
        thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20)))
        thelp.cat(box_eval['train_src_expected'], _line(range(20,50)))
        thelp.cat(box_eval['train_trg_expected'], _line(range(20,50)))

    def _cleanup_files(box_eval):
        # Delete every file on its own so one failure does not leave the
        # remaining files behind.
        for filename in box_eval.values():
            _unlink_quietly(filename)

    _test_main()

View File

@ -0,0 +1,106 @@
import os
import shutil
import subprocess
import tempfile
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Select the IRSTLM build settings from the global configuration.

    Maps 'irstlm_installation_dir' to 'irstlm_install_directory',
    'irstlm_smoothing_method' to 'smoothing_method' and
    'language_model_directory' to 'lm_directory'. A missing key raises
    KeyError.
    """
    install_directory = args['irstlm_installation_dir']
    smoothing_method = args['irstlm_smoothing_method']
    lm_directory = args['language_model_directory']
    return {'irstlm_install_directory': install_directory,
            'smoothing_method': smoothing_method,
            'lm_directory': lm_directory}
def initialise(config):
    """Build the function that creates an IRSTLM language model.

    `config` is not read here. The returned `process(a, s)` reads
    `a['input_filename']` and, from `s`, 'lm_directory',
    'irstlm_install_directory' and 'smoothing_method'. It runs
    `add-start-end.sh`, `build-lm.sh` and `compile-lm` from the IRSTLM `bin`
    directory and returns a dict with 'add_start_end_filename',
    'lm_filename' (ending in ".gz") and 'compiled_lm_filename'.

    Raises Exception when the input file does not exist, IndexError when the
    input base name has fewer than three dot-delimited pieces, and
    subprocess.CalledProcessError when any of the three tools exits with a
    non-zero status.
    """
    def process(a, s):
        # Create the LM directory if we need to
        if not os.path.exists(s['lm_directory']):
            os.makedirs(s['lm_directory'])

        # The filename of the file to chew through
        start_end_input_filename = a['input_filename']
        if not os.path.exists(start_end_input_filename):
            raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename)

        # All output names are the input base name with its third
        # dot-delimited piece (index 2) replaced by a tag.
        filename_bits = os.path.basename(start_end_input_filename).split(".")

        def _derived_filename(tag):
            bits = list(filename_bits)
            bits[2] = tag
            return os.path.join(s['lm_directory'], ".".join(bits))

        start_end_output_filename = _derived_filename("sb")
        lm_filename = _derived_filename("lm")
        compiled_lm_filename = _derived_filename("arpa")

        # First thing to do is add start and end markers. check_call()
        # raises CalledProcessError on failure, so no return code needs to be
        # inspected; the old check was unreachable and its message was built
        # with a malformed format expression. The handles are now closed
        # whether or not the tool succeeds.
        start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")]
        print("IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline))
        with open(start_end_input_filename, 'r') as infile:
            with open(start_end_output_filename, 'w') as outfile:
                subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile)

        # Next build the language model
        tmp_dir = tempfile.mkdtemp(dir = "/tmp")
        try:
            build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"),
                                "-i", start_end_output_filename,
                                "-t", tmp_dir,
                                "-p",
                                "-s", s['smoothing_method'],
                                "-o", lm_filename]
            print("IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline))
            subprocess.check_call(build_lm_cmdline)
        finally:
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)

        # Compile the LM; the compile step reads the ".gz" name.
        lm_filename = lm_filename + ".gz"
        compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"),
                              "--text", "yes",
                              lm_filename,
                              compiled_lm_filename]
        print("IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline))
        subprocess.check_call(compile_lm_cmdline)

        output = {'add_start_end_filename': start_end_output_filename,
                  'lm_filename': lm_filename,
                  'compiled_lm_filename': compiled_lm_filename}

        print("IRSTLM Build: Output = %s" % output)

        return output

    return process
if __name__ == '__main__':
    # Manual self-test; needs IRSTLM in the environment and the corpus file
    # named below.
    from pypeline.helpers.helpers import eval_pipeline

    # The current directory is used for the language model output.
    lm_dir = os.getcwd()
    # configure() reads 'irstlm_installation_dir'; the previous key
    # 'irstlm_root' made it fail with KeyError before anything ran.
    configuration = {'irstlm_installation_dir': os.environ["IRSTLM"],
                     'irstlm_smoothing_method': 'improved-kneser-ney',
                     'language_model_directory': lm_dir}
    component_config = configure(configuration)
    component = initialise(component_config)

    # NOTE(review): hard-coded path from the original author's machine.
    value = eval_pipeline(component,
                          {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
                          component_config)
    target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
              'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'),
              'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')}
    print("Target: %s" % target)
    if value != target:
        raise Exception("Massive fail!")

View File

@ -0,0 +1,83 @@
#!/usr/bin/env python
import os, shutil, subprocess
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Select the MERT settings from the global configuration.

    Copies 'src_lang', 'trg_lang' and 'moses_installation_dir' unchanged and
    stores `args['mert_working_directory']` under 'mert_working_dir'. A
    missing key raises KeyError.
    """
    result = dict((key, args[key]) for key in ('src_lang', 'trg_lang'))
    result['moses_installation_dir'] = args['moses_installation_dir']
    result['mert_working_dir'] = args['mert_working_directory']
    return result
def initialise(config):
    """Build the function that tunes a Moses model with mert-moses.pl.

    `config` supplies 'mert_working_dir', 'moses_installation_dir',
    'src_lang' and 'trg_lang'. The returned `process(a, s)` reads
    'development_data_filename', 'trg_language_model_filename',
    'trg_language_model_order', 'trg_language_model_type' and
    'moses_ini_file' from `a`; `s` is not used. The working directory is
    deleted and recreated on every call.

    Returns {'moses_ini_file': <working dir>/moses.ini}. Raises Exception
    when the input moses.ini is missing or when no moses.ini exists in the
    working directory after the run.
    """
    def process(a, s):
        infilename = os.path.abspath(a['development_data_filename'])
        lm_file = os.path.abspath(a['trg_language_model_filename'])
        lm_order = int(a['trg_language_model_order'])
        lm_type = int(a['trg_language_model_type'])
        orig_moses_ini = os.path.abspath(a['moses_ini_file'])

        if not os.path.exists(orig_moses_ini):
            raise Exception("Error: Input moses.ini does not exist")

        workdir = os.path.abspath(config['mert_working_dir'])
        #simply call the training perl script
        #remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        #local vars
        moses_install_dir = os.path.abspath(config['moses_installation_dir'])
        mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl')
        bin_dir = os.path.join(moses_install_dir, 'bin')
        moses_bin = os.path.join(moses_install_dir, 'bin', 'moses')
        # The development corpus is addressed as <stem>.<language code>.
        src_file = infilename + '.' + config['src_lang']
        ref_file = infilename + '.' + config['trg_lang']
        logfile = os.path.join(workdir, 'log')

        # Both command templates are filled in by name from this mapping.
        substitutions = {'orig_moses_ini': orig_moses_ini,
                         'lm_type': lm_type,
                         'lm_order': lm_order,
                         'lm_file': lm_file,
                         'mert_perl': mert_perl,
                         'bin_dir': bin_dir,
                         'workdir': workdir,
                         'src_file': src_file,
                         'ref_file': ref_file,
                         'moses_bin': moses_bin,
                         'logfile': logfile}

        #change lm configuration in moses ini
        moses_ini = os.path.join(workdir, 'trained-moses.ini')
        substitutions['moses_ini'] = moses_ini
        cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s"
        cmd = cmd % substitutions
        os.system(cmd)

        #the command
        cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s'
        cmd = cmd % substitutions

        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        # communicate() drains stdout while waiting. A plain wait() can
        # block forever once the child fills the stdout pipe.
        pipe.communicate()

        #check the moses ini
        new_mosesini = os.path.join(workdir, 'moses.ini')
        if not os.path.exists(new_mosesini):
            raise Exception('Failed MERT')

        return {'moses_ini_file':new_mosesini}

    return process
if __name__ == '__main__':
    # Manual self-test; the relative paths assume a particular checkout
    # layout around this file.
    def __test():
        # configure() reads 'mert_working_directory'; the previous key
        # 'mert_working_dir' made it fail with KeyError.
        configuration = {'src_lang':'en',
                         'trg_lang':'lt',
                         'moses_installation_dir':os.path.abspath('../../../../'),
                         'mert_working_directory':'../../../../../tuning'}
        values = {'development_data_filename':'../../../../../corpus/tune',
                  'moses_ini_file':'../../../../../model/model/moses.ini',
                  'trg_language_model_filename':'../../../../../corpus/train.lt.lm',
                  'trg_language_model_type':9,
                  'trg_language_model_order':4}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        # Initialise from the component configuration, not the raw global
        # one: process() looks up 'mert_working_dir', which only
        # configure() produces.
        box = initialise(box_config)
        print(run_pipeline(box, values, None))

    #do some test
    __test()

View File

@ -0,0 +1,72 @@
#!/usr/bin/env python
import os, shutil, subprocess
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Select the model-training settings from the global configuration.

    Copies 'src_lang', 'trg_lang' and 'moses_installation_dir' unchanged,
    stores `args['giza_installation_dir']` under 'external_bin_dir' and
    `args['translation_model_directory']` under 'model_directory'. A missing
    key raises KeyError.
    """
    return {'src_lang': args['src_lang'],
            'trg_lang': args['trg_lang'],
            'moses_installation_dir': args['moses_installation_dir'],
            'external_bin_dir': args['giza_installation_dir'],
            'model_directory': args['translation_model_directory']}
def initialise(config):
    """Build the function that trains a translation model with train-model.perl.

    `config` supplies 'model_directory', 'moses_installation_dir',
    'src_lang', 'trg_lang' and 'external_bin_dir'. The returned
    `process(a, s)` reads `a['training_data_filename']`; `s` is not used.
    The model directory is deleted and recreated on every call, and a
    placeholder language model file is written into it.

    Returns {'moses_ini_file': <model dir>/model/moses.ini}. Raises Exception
    when that file does not exist after the training script has run.
    """
    def process(a, s):
        infilename = os.path.abspath(a['training_data_filename'])
        workdir = os.path.abspath(config['model_directory'])
        #simply call the training perl script
        #remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        #local vars
        train_model_perl = os.path.join(os.path.abspath(config['moses_installation_dir']),
                                        'scripts', 'training', 'train-model.perl')
        src_lang = config['src_lang'].lower()
        trg_lang = config['trg_lang'].lower()
        external_bin = os.path.abspath(config['external_bin_dir'])

        #create a dummy lm file; `with` closes it even if the write fails
        dummy_lmfile = os.path.join(workdir, 'dummy.lm')
        with open(dummy_lmfile, 'w') as f:
            f.write("dummy lm file\n")

        logfile = os.path.join(workdir, 'log')

        #the command
        cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s'
        cmd = cmd % {'train_model_perl': train_model_perl,
                     'workdir': workdir,
                     'infilename': infilename,
                     'src_lang': src_lang,
                     'trg_lang': trg_lang,
                     'dummy_lmfile': dummy_lmfile,
                     'external_bin': external_bin,
                     'logfile': logfile}

        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        # communicate() drains stdout while waiting. A plain wait() can
        # block forever once the child fills the stdout pipe.
        pipe.communicate()

        #check the moses ini
        mosesini = os.path.join(workdir, 'model', 'moses.ini')
        if not os.path.exists(mosesini):
            raise Exception('Failed training model')

        return {'moses_ini_file':mosesini}

    return process
# Manual self-test, run only when this module is executed as a script.
if __name__ == '__main__':
def __test():
# MOSES_HOME and GIZA_HOME must be set; os.environ[...] raises KeyError
# otherwise.
configuration = {'src_lang':'en',
'trg_lang':'lt',
'moses_installation_dir':os.environ['MOSES_HOME'],
'giza_installation_dir':os.environ['GIZA_HOME'],
'translation_model_directory':'model-dir'}
# NOTE(review): hard-coded corpus path from the original author's machine -
# adjust before running.
values = {'training_data_filename':'/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
from pypeline.helpers.helpers import run_pipeline
# Build the component from its own configuration and print the result.
box_config = configure(configuration)
box = initialise(box_config)
print run_pipeline(box, values, None)
#do some test
__test()

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
import os
from tokenizer import Tokenizer
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Select the source-tokeniser settings from the global configuration.

    Returns a dict holding exactly 'src_lang', 'src_tokenisation_dir' and
    'moses_installation_dir', each copied from `args`. A missing key raises
    KeyError.
    """
    wanted = ('src_lang', 'src_tokenisation_dir', 'moses_installation_dir')
    return dict((key, args[key]) for key in wanted)
def initialise(config):
    """Build the function that tokenises the source side of the corpus.

    The returned `process(a, s)` passes `a['src_filename']` to
    `Tokenizer.batch_tokenise` together with 'src_lang',
    'moses_installation_dir' and 'src_tokenisation_dir' from `config`, and
    returns the resulting file name under 'tokenised_src_filename'. `s` is
    not used.
    """
    def process(a, s):
        source_path = a['src_filename']
        language = config['src_lang']
        moses_dir = config['moses_installation_dir']
        work_dir = config['src_tokenisation_dir']
        tokenised_path = Tokenizer.batch_tokenise(language, moses_dir, source_path, work_dir)
        return {'tokenised_src_filename': tokenised_path}

    return process
if __name__ == '__main__':
    # Manual self-test; expects a file named tmp.de in the current directory
    # and a Moses checkout four levels up.
    def __test():
        configuration = {'src_lang':'de',
                         'src_tokenisation_dir':'tmptok',
                         'moses_installation_dir':os.path.abspath('../../../../')}
        values = {'src_filename':'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        # Initialise from the component configuration. box_config used to be
        # computed and then ignored in favour of the raw configuration.
        box = initialise(box_config)
        print(run_pipeline(box, values, None))

    #do some test
    __test()

View File

@ -0,0 +1,3 @@
asdfweoih
awfwoeijf awefo
what's this

View File

@ -0,0 +1,36 @@
#!/usr/bin/env python
import sys, os, subprocess
class Tokenizer:
    """Wrapper around the `tokenizer.perl` script of a Moses installation."""

    @staticmethod
    def batch_tokenise(lang, mosesdir, infilename, workdir):
        """Tokenise `infilename` into `workdir` and return the output path.

        `workdir` is created when missing. The output file is named after
        the base name of `infilename` with ".tok" appended. Raises Exception
        when the tokenizer script cannot be found under `mosesdir`.
        """
        print("Tokenizing [%s] in working directory [%s]..." % (infilename, workdir))
        if not os.path.exists(workdir):
            os.makedirs(workdir)
        tok = Tokenizer(lang, mosesdir)
        basefilename = os.path.basename(infilename)
        outfilename = workdir + os.sep + basefilename + '.tok'
        tok.file_tokenise(infilename, outfilename)
        return outfilename

    def __init__(self, lang, mosesdir):
        """Remember the language and locate the script under `mosesdir`.

        Raises Exception when <mosesdir>/scripts/tokenizer/tokenizer.perl
        does not exist.
        """
        self.arrows = None
        self.lang = lang

        #check the perl tokenizer is here
        path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer'
        self.perltok = path + os.sep + 'tokenizer.perl'
        # Test for the script itself. Checking only its directory let a
        # missing script through, and the failure then showed up later as an
        # empty tokenised file.
        if not os.path.exists(self.perltok):
            raise Exception("Perl tokenizer does not exists")

    def file_tokenise(self, infilename, outfilename):
        """Run the script with `infilename` on stdin, writing `outfilename`.

        The redirections are done by the shell. The script's exit status is
        not inspected.
        """
        cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename)
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        # communicate() drains the pipes while waiting, so the child can
        # never block on a full pipe as it could with a plain wait().
        pipe.communicate()
if __name__ == '__main__':
# No self-test is implemented: running this module directly does nothing.
pass

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
import os
from tokenizer import Tokenizer
from pypeline.helpers.helpers import cons_function_component
def configure(args):
    """Select the target-tokeniser settings from the global configuration.

    Returns a dict holding exactly 'trg_lang', 'trg_tokenisation_dir' and
    'moses_installation_dir', each copied from `args`. A missing key raises
    KeyError.
    """
    wanted = ('trg_lang', 'trg_tokenisation_dir', 'moses_installation_dir')
    return dict((key, args[key]) for key in wanted)
def initialise(config):
    """Build the function that tokenises the target side of the corpus.

    The returned `process(a, s)` passes `a['trg_filename']` to
    `Tokenizer.batch_tokenise` together with 'trg_lang',
    'moses_installation_dir' and 'trg_tokenisation_dir' from `config`, and
    returns the resulting file name under 'tokenised_trg_filename'. `s` is
    not used.
    """
    def process(a, s):
        target_path = a['trg_filename']
        language = config['trg_lang']
        moses_dir = config['moses_installation_dir']
        work_dir = config['trg_tokenisation_dir']
        tokenised_path = Tokenizer.batch_tokenise(language, moses_dir, target_path, work_dir)
        return {'tokenised_trg_filename': tokenised_path}

    return process
if __name__ == '__main__':
    # Manual self-test; expects a file named tmp.de in the current directory
    # and a Moses checkout four levels up.
    def __test():
        configuration = {'trg_lang':'de',
                         'trg_tokenisation_dir':'tmptoktrg',
                         'moses_installation_dir':os.path.abspath('../../../../')}
        values = {'trg_filename':'tmp.de'}
        from pypeline.helpers.helpers import run_pipeline
        box_config = configure(configuration)
        # Initialise from the component configuration. box_config used to be
        # computed and then ignored in favour of the raw configuration.
        box = initialise(box_config)
        print(run_pipeline(box, values, None))

    #do some test
    __test()

View File

@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.846397978." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.725420545" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1586272140" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.30521110" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.478334849" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.1328561226" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@ -133,8 +133,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>

View File

@ -18,11 +18,14 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1133345948." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1405862229" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.605722566" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1956867596" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1512268277" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2143789149" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.285958391" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.966722418" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1839105433" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.554846982" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
@ -119,5 +122,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/extractor"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/extractor"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.351042750." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.640882096" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.793478365" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.1252826468" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1024598065" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.139111896" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@ -131,7 +131,14 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/lm"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/lm"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -141,11 +141,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/build_binary</locationURI>
</link>
<link>
<name>build_binary.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/build_binary.cc</locationURI>
</link>
<link>
<name>clean.sh</name>
<type>1</type>
@ -176,11 +171,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/facade.hh</locationURI>
</link>
<link>
<name>fragment.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/fragment.cc</locationURI>
</link>
<link>
<name>left.hh</name>
<type>1</type>
@ -211,11 +201,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/lm_exception.hh</locationURI>
</link>
<link>
<name>max_order.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/max_order.cc</locationURI>
</link>
<link>
<name>max_order.hh</name>
<type>1</type>
@ -241,11 +226,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/model_type.hh</locationURI>
</link>
<link>
<name>ngram_query.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/ngram_query.cc</locationURI>
</link>
<link>
<name>ngram_query.hh</name>
<type>1</type>

View File

@ -7,7 +7,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Debug"/>
<entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
<entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@ -23,13 +23,14 @@
<folderInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.lib.debug.1932340583" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.lib.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.lib.debug.296711714" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.lib.debug"/>
<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.lib.debug.89397980" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.lib.debug"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug">
<option id="gnu.cpp.compiler.lib.debug.option.optimization.level.469164841" name="Optimization Level" superClass="gnu.cpp.compiler.lib.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.lib.debug.option.debugging.level.1050747398" name="Debug Level" superClass="gnu.cpp.compiler.lib.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1565260476" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1183866856" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
@ -45,11 +46,8 @@
</tool>
</toolChain>
</folderInfo>
<fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.626295813" name="extractor.cpp" rcbsApplicability="disable" resourcePath="mert/extractor.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460">
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/>
</fileInfo>
<sourceEntries>
<entry excluding="mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
<entry excluding="mert/UtilTest.cpp|mert/TimerTest.cpp|mert/SingletonTest.cpp|mert/PointTest.cpp|mert/OptimizerFactoryTest.cpp|mert/NgramTest.cpp|mert/FeatureDataTest.cpp|mert/DataTest.cpp|mert/ReferenceTest.cpp|mert/VocabularyTest.cpp|mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
@ -61,7 +59,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Release"/>
<entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
<entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@ -119,5 +117,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/mert_lib"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/mert_lib"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -19,7 +19,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.162355801." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1633424067" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1437309068" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1247128100" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1163099464" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -46,6 +46,7 @@
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.816413868" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.paths.330225535" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
@ -154,8 +155,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>

View File

@ -19,7 +19,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.461114338." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1896491482" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.2144309834" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1278274354" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.626095182" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2084031389" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -46,6 +46,8 @@
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1546774818" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.paths.523170942" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686&quot;"/>
@ -155,8 +157,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>

View File

@ -1,7 +1,5 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?>
<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.656913512">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.656913512" moduleId="org.eclipse.cdt.core.settings" name="Debug">
@ -9,7 +7,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/moses"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/moses/Debug"/>
<entry flags="RESOLVED" kind="libraryFile" name="moses"/>
<entry flags="RESOLVED" kind="libraryFile" name="moses" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@ -26,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1793369992" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1051650049" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1976472988" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1759650532" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -152,8 +150,14 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/moses"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/moses"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.722547278." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1512691763" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.633526059" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.854512708" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1096845166" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.240381177" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -127,6 +127,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/search"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/search"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>

View File

@ -156,11 +156,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/search/vertex.hh</locationURI>
</link>
<link>
<name>vertex_generator.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/search/vertex_generator.cc</locationURI>
</link>
<link>
<name>vertex_generator.hh</name>
<type>1</type>

View File

@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1869657447." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.1388624938" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1873607607" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.589471640" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1543780089" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.635667684" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@ -136,8 +136,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/util"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/util"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>

42
contrib/rpm/README Normal file
View File

@ -0,0 +1,42 @@
Building Moses RPM
==================
*** WARNING ***
Before completing *any* of the tasks outlined in this README, please commit and push any changes you wish to be included in your installer.
*** WARNING ***
Building the RPM SPEC file
--------------------------
The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build. This script needs the following information:
- The Git repository from which an installer will be built,
- The branch in the Git repository to build, and
- The version of the installed Moses distribution.
For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git):
$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0
This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS.
Building the RPM
----------------
Change directory to $HOME/rpmbuild, and build the binary RPM with:
$ rpmbuild -bb SPECS/moses.spec
This will download IRSTLM v5.70.04 and GIZA++ v2, then build them along with Moses and make the RPM in the directory $HOME/rpmbuild/RPMS/<architecture>/moses-<version>-1.<architecture>.rpm.
For example, building v1.0 on a 64-bit Intel architecture produces an RPM called moses-1.0-1.x86_64.rpm.
Building a Debian package
-------------------------
The Alien tool converts RPM packages to Debian packages. If a Debian package is required then follow the instructions on the following web-page:
https://help.ubuntu.com/community/RPM/AlienHowto

63
contrib/rpm/build_source.sh Executable file
View File

@ -0,0 +1,63 @@
#!/bin/bash
#
# Stage the inputs needed to build a Moses RPM.
#
# Clones a Moses Git repository, checks out the requested branch and packs
# the working tree into moses-<version>.tar.gz. Unless -n is given, it also
# writes a copy of ./rpmbuild/SPECS/moses.spec with the version placeholder
# filled in to $HOME/rpmbuild/SPECS, and moves the tarball to
# $HOME/rpmbuild/SOURCES.
#
# Options:
#   -r <repo>     Git repository to clone (required)
#   -b <branch>   branch to check out (default: master)
#   -v <version>  version used for the tarball name and the spec (required)
#   -n            only create the tarball; leave $HOME/rpmbuild untouched
#   -h            print usage
#
# Must be run from a directory that contains the rpmbuild/ template tree.
#
# Exit status: 0 on success, 1 on usage or environment errors, 3 when a Git
# or packaging step fails.

BRANCH="master"
declare -i NO_RPM_BUILD=0
declare -r RPM_VERSION_TAG="___RPM_VERSION__"

# Print the command-line synopsis and abort with status 1.
function usage() {
    echo "$(basename "$0") -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]"
    exit 1
}

while getopts r:b:v:nh OPTION
do
    case "$OPTION" in
        r) REPO="${OPTARG}";;
        b) BRANCH="${OPTARG}";;
        v) VERSION="${OPTARG}";;
        n) NO_RPM_BUILD=1;;
        [h\?]) usage;;
    esac
done

# The repository and version are mandatory. Checking the parsed values
# (rather than counting arguments) also accepts the attached form
# "-rURL -v1.0" and rejects a run with no version at all.
if [ -z "${REPO}" ] || [ -z "${VERSION}" ]; then
    usage
fi

if [ ! -d ./rpmbuild ]; then
    echo "RPM build directory not in current working directory"
    exit 1
fi

declare -r MOSES_DIR="moses-${VERSION}"
if ! git clone "${REPO}" "${MOSES_DIR}"; then
    echo "Failed to clone Git repository ${REPO}"
    exit 3
fi

# The subshell keeps this script's working directory unchanged.
if ! ( cd "${MOSES_DIR}" && git checkout "${BRANCH}" ); then
    echo "Failed to checkout branch ${BRANCH}"
    exit 3
fi

if ! tar -cf "moses-${VERSION}.tar" "${MOSES_DIR}" \
    || ! gzip -f9 "moses-${VERSION}.tar"; then
    echo "Failed to create source tarball moses-${VERSION}.tar.gz"
    exit 3
fi

if [ ${NO_RPM_BUILD} -eq 0 ]; then
    # mkdir -p is a no-op for directories that already exist.
    mkdir -p "${HOME}/rpmbuild/SPECS" "${HOME}/rpmbuild/SOURCES"

    # Fill in the version placeholder. sed is invoked directly (no eval) so
    # the version string is never re-parsed by the shell. The version is
    # assumed to contain no '/' or '&', which are special in a sed
    # replacement.
    sed "s/${RPM_VERSION_TAG}/${VERSION}/" ./rpmbuild/SPECS/moses.spec \
        > "${HOME}/rpmbuild/SPECS/moses.spec"

    mv "moses-${VERSION}.tar.gz" "${HOME}/rpmbuild/SOURCES"
fi

# The clone is only needed to produce the tarball.
rm -Rf "${MOSES_DIR}"

View File

@ -0,0 +1,65 @@
# RPM spec for packaging Moses together with IRSTLM and GIZA++ under /opt/moses.
# The Version value below is a placeholder; presumably it is substituted by
# build_source.sh before rpmbuild is run (confirm against that script).
Name: moses
Summary: Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair.
Version: ___RPM_VERSION__
Release: 1
URL: http://www.statmt.org/moses/
Source0: %{name}-%{version}.tar.gz
License: LGPL
Group: Development/Tools
Vendor: Capita Translation and Interpreting
Packager: Ian Johnson <ian.johnson@capita-ti.com>
Requires: boost >= 1.48, python >= 2.6, perl >= 5
# NOTE(review): the build root is an absolute path inside one user's home
# directory; confirm whether it should be made independent of the packager.
BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release}
%description
Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices.
%prep
%setup -q
# Download the IRSTLM and GIZA++ source archives into the build directory
# (requires network access and wget at build time).
mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz
wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz
cd $RPM_BUILD_DIR
tar -zxf irstlm-5.70.04.tgz
tar -zxf giza-pp-v1.0.7.tgz
# Build IRSTLM and install it straight into the build root.
cd irstlm-5.70.04
bash regenerate-makefiles.sh --force
./configure --prefix $RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04
make
make install
# Build GIZA++ and copy the three binaries Moses needs into the build root.
cd ../giza-pp
make
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
%build
# Build Moses against the IRSTLM and GIZA++ copies placed in the build root above.
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
%install
# Copy the built binaries and a fixed list of script directories into the build root.
mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
cp -R bin $RPM_BUILD_ROOT/opt/moses
cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
%clean
# No clean-up commands are defined for this package.
%files
# Everything is packaged as owned by root, with on-disk permissions kept.
%defattr(-,root,root)
/opt/moses/bin/*
/opt/moses/scripts/analysis/*
/opt/moses/scripts/ems/*
/opt/moses/scripts/generic/*
/opt/moses/scripts/other/*
/opt/moses/scripts/recaser/*
/opt/moses/scripts/regression-testing/*
/opt/moses/scripts/share/*
/opt/moses/scripts/tokenizer/*
/opt/moses/scripts/training/*
/opt/moses/irstlm-5.70.04/*
/opt/moses/giza++-v1.0.7/*

View File

@ -620,10 +620,27 @@ void IOWrapper::FixPrecision(std::ostream &stream, size_t size)
template <class T>
void ShiftOffsets(vector<T> &offsets, T shift)
{
T currPos = shift;
for (size_t i = 0; i < offsets.size(); ++i) {
shift += offsets[i];
offsets[i] += shift;
if (offsets[i] == 0) {
offsets[i] = currPos;
++currPos;
}
else {
currPos += offsets[i];
}
}
}
/**
 * Compute the source-side size used for the offset table of a hypothesis.
 *
 * Starts from the number of source words covered by the hypothesis' own
 * range and, for every previous (child) hypothesis, removes all but one of
 * the words that child covers, so each child occupies exactly one slot.
 *
 * @param hypo hypothesis to measure; must not be NULL (it is dereferenced)
 * @return covered word count with each child span collapsed to one position
 */
size_t CalcSourceSize(const Moses::ChartHypothesis *hypo)
{
  typedef std::vector<const ChartHypothesis*> ChildList;

  size_t slotCount = hypo->GetCurrSourceRange().GetNumWordsCovered();
  const ChildList &children = hypo->GetPrevHypos();

  for (ChildList::const_iterator child = children.begin(); child != children.end(); ++child) {
    const size_t coveredByChild = (*child)->GetCurrSourceRange().GetNumWordsCovered();
    // a child covering N words collapses into a single slot
    slotCount -= (coveredByChild - 1);
  }

  return slotCount;
}
size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget)
@ -635,7 +652,11 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
size_t thisSourceSize = CalcSourceSize(hypo);
// position of each terminal word in translation rule, irrespective of alignment
// if non-term, number is undefined
vector<size_t> sourceOffsets(thisSourceSize, 0);
vector<size_t> targetOffsets(tp.GetSize(), 0);
const ChartTrellisNode::NodeChildren &prevNodes = node.GetChildren();
@ -655,11 +676,12 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
const ChartTrellisNode &prevNode = *prevNodes[sourceInd];
// 1st. calc source size
// calc source size
size_t sourceSize = prevNode.GetHypothesis().GetCurrSourceRange().GetNumWordsCovered();
sourceOffsets[sourcePos] = sourceSize;
// 2nd. calc target size. Recursively look thru child hypos
// calc target size.
// Recursively look thru child hypos
size_t currStartTarget = startTarget + totalTargetSize;
size_t targetSize = OutputAlignmentNBest(retAlign, prevNode, currStartTarget);
targetOffsets[targetPos] = targetSize;
@ -672,27 +694,26 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
}
}
// 3rd. shift offsets
// convert position within translation rule to absolute position within
// source sentence / output sentence
ShiftOffsets(sourceOffsets, startSource);
ShiftOffsets(targetOffsets, startTarget);
// get alignments from this hypo
vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
OutputAlignment(retAlignmentsS2T, aiTerm);
// add to output arg, offsetting by source & target
for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
const set<size_t> &targets = retAlignmentsS2T[source];
set<size_t>::const_iterator iter;
for (iter = targets.begin(); iter != targets.end(); ++iter) {
size_t target = *iter;
pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
,target + targetOffsets[target]);
AlignmentInfo::const_iterator iter;
for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
const std::pair<size_t,size_t> &align = *iter;
size_t relSource = align.first;
size_t relTarget = align.second;
size_t absSource = sourceOffsets[relSource];
size_t absTarget = targetOffsets[relTarget];
pair<size_t, size_t> alignPoint(absSource, absTarget);
pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
CHECK(ret.second);
}
}
return totalTargetSize;
@ -702,6 +723,7 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe
{
ostringstream out;
if (hypo) {
Alignments retAlign;
OutputAlignment(retAlign, hypo, 0);
@ -711,6 +733,7 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe
const pair<size_t, size_t> &alignPoint = *iter;
out << alignPoint.first << "-" << alignPoint.second << " ";
}
}
out << endl;
m_alignmentInfoCollector->Write(translationId, out.str());
@ -723,7 +746,11 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
size_t thisSourceSize = CalcSourceSize(hypo);
// position of each terminal word in translation rule, irrespective of alignment
// if non-term, number is undefined
vector<size_t> sourceOffsets(thisSourceSize, 0);
vector<size_t> targetOffsets(tp.GetSize(), 0);
const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
@ -743,11 +770,12 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
const ChartHypothesis *prevHypo = prevHypos[sourceInd];
// 1st. calc source size
// calc source size
size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered();
sourceOffsets[sourcePos] = sourceSize;
// 2nd. calc target size. Recursively look thru child hypos
// calc target size.
// Recursively look thru child hypos
size_t currStartTarget = startTarget + totalTargetSize;
size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget);
targetOffsets[targetPos] = targetSize;
@ -760,28 +788,28 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
}
}
// 3rd. shift offsets
// convert position within translation rule to absolute position within
// source sentence / output sentence
ShiftOffsets(sourceOffsets, startSource);
ShiftOffsets(targetOffsets, startTarget);
// get alignments from this hypo
vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
OutputAlignment(retAlignmentsS2T, aiTerm);
// add to output arg, offsetting by source & target
for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
const set<size_t> &targets = retAlignmentsS2T[source];
set<size_t>::const_iterator iter;
for (iter = targets.begin(); iter != targets.end(); ++iter) {
size_t target = *iter;
pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
,target + targetOffsets[target]);
AlignmentInfo::const_iterator iter;
for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
const std::pair<size_t,size_t> &align = *iter;
size_t relSource = align.first;
size_t relTarget = align.second;
size_t absSource = sourceOffsets[relSource];
size_t absTarget = targetOffsets[relTarget];
pair<size_t, size_t> alignPoint(absSource, absTarget);
pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
CHECK(ret.second);
}
}
return totalTargetSize;
}

View File

@ -189,6 +189,15 @@ InputType*IOWrapper::GetInput(InputType* inputType)
}
}
/**
 * Open the file that will receive the feature weights for hypergraph output.
 *
 * The file name is the second value ([1]) of the
 * "output-search-graph-hypergraph" parameter. The stream is heap-allocated
 * and handed to the caller, who must close and delete it. No check is made
 * here that the file was opened successfully.
 */
ofstream* IOWrapper::GetOutputSearchGraphHypergraphWeightsStream()
{
  stringstream weightsPath;
  weightsPath << StaticData::Instance().GetParam("output-search-graph-hypergraph")[1];

  // constructing with a path opens the file in the default output mode
  return new std::ofstream(weightsPath.str().c_str());
}
/***
* print surface factor only for the given phrase
*/
@ -262,6 +271,19 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
out << std::endl;
}
/**
 * Write the alignment of a complete hypothesis chain to a stream.
 *
 * Follows GetPrevHypo() from the given hypothesis until it returns NULL,
 * collecting every hypothesis on the way (last one first), and passes the
 * collected list to the vector-based OutputAlignment overload.
 *
 * @param out  destination stream
 * @param hypo last hypothesis of the chain; may be NULL (empty edge list)
 */
void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
{
  std::vector<const Hypothesis *> chain;
  for (const Hypothesis *node = hypo; node; node = node->GetPrevHypo()) {
    chain.push_back(node);
  }

  OutputAlignment(out, chain);
}
void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
{
ostringstream out;

View File

@ -117,6 +117,8 @@ public:
return *m_outputSearchGraphStream;
}
std::ofstream *GetOutputSearchGraphHypergraphWeightsStream();
std::ostream &GetDetailedTranslationReportingStream() {
assert (m_detailedTranslationReportingStream);
return *m_detailedTranslationReportingStream;
@ -137,7 +139,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool
void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
}

View File

@ -83,14 +83,18 @@ public:
OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
OutputCollector* detailedTranslationCollector,
OutputCollector* alignmentInfoCollector,
OutputCollector* unknownsCollector) :
OutputCollector* unknownsCollector,
bool outputSearchGraphSLF,
bool outputSearchGraphHypergraph) :
m_source(source), m_lineNumber(lineNumber),
m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
m_latticeSamplesCollector(latticeSamplesCollector),
m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
m_detailedTranslationCollector(detailedTranslationCollector),
m_alignmentInfoCollector(alignmentInfoCollector),
m_unknownsCollector(unknownsCollector) {}
m_unknownsCollector(unknownsCollector),
m_outputSearchGraphSLF(outputSearchGraphSLF),
m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {}
/** Translate one sentence
* gets called by main function implemented at end of this source file */
@ -143,6 +147,42 @@ public:
#endif
}
// Output search graph in HTK standard lattice format (SLF).
// One file per sentence: <output-search-graph-slf[0]>/<lineNumber>.slf
if (m_outputSearchGraphSLF) {
  stringstream fileName;
  fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf";
  // Stack-allocated stream: it is closed and released when this scope ends,
  // also if writing the lattice throws. The previous heap-allocated stream
  // was never closed or deleted, leaking one file handle per sentence.
  std::ofstream file(fileName.str().c_str());
  if (file.is_open() && file.good()) {
    // build the lattice in memory first, then write it in one go
    ostringstream out;
    fix(out,PRECISION);
    manager.OutputSearchGraphAsSLF(m_lineNumber, out);
    file << out.str();
    file.flush();
  } else {
    TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
  }
}
// Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
if (m_outputSearchGraphHypergraph) {
stringstream fileName;
fileName << staticData.GetParam("output-search-graph-hypergraph")[0] << "/" << m_lineNumber;
std::ofstream *file = new std::ofstream;
file->open(fileName.str().c_str());
if (file->is_open() && file->good()) {
ostringstream out;
fix(out,PRECISION);
manager.OutputSearchGraphAsHypergraph(m_lineNumber, out);
*file << out.str();
file -> flush();
} else {
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
}
file -> close();
delete file;
}
// apply decision rule and output best translation(s)
if (m_outputCollector) {
ostringstream out;
@ -174,6 +214,11 @@ public:
staticData.GetOutputFactorOrder(),
staticData.GetReportSegmentation(),
staticData.GetReportAllFactors());
if (staticData.PrintAlignmentInfo()) {
out << "||| ";
OutputAlignment(out, bestHypo);
}
OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
IFVERBOSE(1) {
debug << "BEST TRANSLATION: " << *bestHypo << endl;
@ -311,6 +356,8 @@ private:
OutputCollector* m_detailedTranslationCollector;
OutputCollector* m_alignmentInfoCollector;
OutputCollector* m_unknownsCollector;
bool m_outputSearchGraphSLF;
bool m_outputSearchGraphHypergraph;
std::ofstream *m_alignmentStream;
@ -367,6 +414,63 @@ static void ShowWeights()
}
size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream)
{
size_t numScoreComps = ff->GetNumScoreComponents();
if (numScoreComps != ScoreProducer::unlimited) {
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
if (numScoreComps > 1) {
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
<< i
<< "=" << values[i] << endl;
}
} else {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
<< "=" << values[0] << endl;
}
return index+numScoreComps;
} else {
cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl;
assert(false);
return 0;
}
}
void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream);
}
}
} //namespace
/** main function of the command line version of the decoder **/
@ -391,20 +495,20 @@ int main(int argc, char** argv)
// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
Parameter* params = new Parameter();
if (!params->LoadParam(argc,argv)) {
Parameter params;
if (!params.LoadParam(argc,argv)) {
exit(1);
}
// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
if (!StaticData::LoadDataStatic(params, argv[0])) {
if (!StaticData::LoadDataStatic(&params, argv[0])) {
exit(1);
}
// setting "-show-weights" -> just dump out weights and exit
if (params->isParamSpecified("show-weights")) {
if (params.isParamSpecified("show-weights")) {
ShowWeights();
exit(0);
}
@ -430,6 +534,14 @@ int main(int argc, char** argv)
TRACE_ERR(weights);
TRACE_ERR("\n");
}
if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 1) {
ofstream* weightsOut = ioWrapper->GetOutputSearchGraphHypergraphWeightsStream();
OutputFeatureWeightsForHypergraph(*weightsOut);
weightsOut->flush();
weightsOut->close();
delete weightsOut;
}
// initialize output streams
// note: we can't just write to STDOUT or files
@ -533,7 +645,9 @@ int main(int argc, char** argv)
searchGraphCollector.get(),
detailedTranslationCollector.get(),
alignmentInfoCollector.get(),
unknownsCollector.get() );
unknownsCollector.get(),
staticData.GetOutputSearchGraphSLF(),
staticData.GetOutputSearchGraphHypergraph());
// execute task
#ifdef WITH_THREADS
pool.Submit(task);
@ -551,6 +665,8 @@ int main(int argc, char** argv)
pool.Stop(true); //flush remaining jobs
#endif
delete ioWrapper;
} catch (const std::exception &e) {
std::cerr << "Exception: " << e.what() << std::endl;
return EXIT_FAILURE;

View File

@ -30,6 +30,9 @@ AlignmentInfoCollection::AlignmentInfoCollection()
m_emptyAlignmentInfo = Add(pairs);
}
// Destructor with an empty body: no explicit cleanup is performed here.
AlignmentInfoCollection::~AlignmentInfoCollection()
{}
const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
{
return *m_emptyAlignmentInfo;

View File

@ -55,6 +55,7 @@ class AlignmentInfoCollection
//! Only a single static variable should be created.
AlignmentInfoCollection();
~AlignmentInfoCollection();
static AlignmentInfoCollection s_instance;

View File

@ -462,7 +462,7 @@ void Hypothesis::CleanupArcList()
*/
const StaticData &staticData = StaticData::Instance();
size_t nBestSize = staticData.GetNBestSize();
bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.UseLatticeMBR() ;
bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;
if (!distinctNBest && m_arcList->size() > nBestSize * 5) {
// prune arc list only if there too many arcs

View File

@ -36,8 +36,9 @@ using namespace std;
namespace Moses
{
LanguageModelSingleFactor::~LanguageModelSingleFactor() {}
LanguageModelSingleFactor::~LanguageModelSingleFactor()
{
}
struct PointerState : public FFState {
const void* lmstate;
@ -58,7 +59,11 @@ LanguageModelPointerState::LanguageModelPointerState()
m_beginSentenceState = new PointerState(NULL);
}
LanguageModelPointerState::~LanguageModelPointerState() {}
LanguageModelPointerState::~LanguageModelPointerState()
{
delete m_nullContextState;
delete m_beginSentenceState;
}
const FFState *LanguageModelPointerState::GetNullContextState() const
{

View File

@ -26,8 +26,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#endif
#include <algorithm>
#include <limits>
#include <cmath>
#include <limits>
#include <map>
#include <set>
#include "Manager.h"
#include "TypeDef.h"
#include "Util.h"
@ -46,17 +48,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "rule.pb.h"
#endif
#include "util/exception.hh"
using namespace std;
namespace Moses
{
Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system)
:m_lineNumber(lineNumber)
,m_system(system)
:m_system(system)
,m_transOptColl(source.CreateTranslationOptionCollection(system))
,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl))
,interrupted_flag(0)
,m_hypoId(0)
,m_lineNumber(lineNumber)
,m_source(source)
{
m_system->InitializeBeforeSentenceProcessing(source);
@ -628,6 +632,420 @@ void Manager::GetSearchGraph(vector<SearchGraphNode>& searchGraph) const
}
void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureWeightsForSLF(featureIndex, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureWeightsForSLF(featureIndex, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForSLF(featureIndex, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForSLF(featureIndex, gds[i], outputSearchGraphStream);
}
}
void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
// outputSearchGraphStream << endl;
// outputSearchGraphStream << (*hypo) << endl;
// const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
// outputSearchGraphStream << scoreCollection << endl;
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, gds[i], outputSearchGraphStream);
}
}
void Manager::OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, gds[i], outputSearchGraphStream);
}
}
size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
size_t numScoreComps = ff->GetNumScoreComponents();
if (numScoreComps != ScoreProducer::unlimited) {
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << "# " << ff->GetScoreProducerDescription()
<< " " << ff->GetScoreProducerWeightShortName()
<< " " << (i+1) << " of " << numScoreComps << endl
<< "x" << (index+i) << "scale=" << values[i] << endl;
}
return index+numScoreComps;
} else {
cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
assert(false);
return 0;
}
}
size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
// { const FeatureFunction* sp = ff;
// const FVector& m_scores = scoreCollection.GetScoresVector();
// FVector& scores = const_cast<FVector&>(m_scores);
// std::string prefix = sp->GetScoreProducerDescription() + FName::SEP;
// // std::cout << "prefix==" << prefix << endl;
// // cout << "m_scores==" << m_scores << endl;
// // cout << "m_scores.size()==" << m_scores.size() << endl;
// // cout << "m_scores.coreSize()==" << m_scores.coreSize() << endl;
// // cout << "m_scores.cbegin() ?= m_scores.cend()\t" << (m_scores.cbegin() == m_scores.cend()) << endl;
// // for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
// // std::cout<<prefix << "\t" << (i->first) << "\t" << (i->second) << std::endl;
// // }
// for(int i=0, n=v.size(); i<n; i+=1) {
// // outputSearchGraphStream << prefix << i << "==" << v[i] << std::endl;
// }
// }
// FVector featureValues = scoreCollection.GetVectorForProducer(ff);
// outputSearchGraphStream << featureValues << endl;
const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
size_t numScoreComps = featureValues.size();//featureValues.coreSize();
// if (numScoreComps != ScoreProducer::unlimited) {
// vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 0.0 : featureValues[i]) << " ";
}
return index+numScoreComps;
// } else {
// cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
// assert(false);
// return 0;
// }
}
/**
 * Write the values of one feature function for a hypergraph edge as
 * "name=value " entries.
 *
 * The values are taken from the hypothesis' score breakdown minus the score
 * breakdown of its previous hypothesis (when there is one), i.e. the
 * difference between the two. A feature with several components gets the
 * component number appended to its short name (name0, name1, ...); a
 * single-component feature uses the bare short name. A feature for which no
 * values are returned writes nothing.
 *
 * @param index index of the first component of this feature
 * @param hypo hypothesis forming the edge; must not be NULL
 * @param ff feature function whose values are written
 * @param outputSearchGraphStream destination stream
 * @return index advanced by the number of values returned for the feature
 */
size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
  // work on a copy so the hypothesis' own breakdown is left untouched
  ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
  const Hypothesis *prevHypo = hypo->GetPrevHypo();
  if (prevHypo) {
    scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() );
  }
  vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
  size_t numScoreComps = featureValues.size();
  if (numScoreComps > 1) {
    for (size_t i = 0; i < numScoreComps; ++i) {
      outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << i << "=" << featureValues[i] << " ";
    }
  } else if (numScoreComps == 1) {
    outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << "=" << featureValues[0] << " ";
  }
  // numScoreComps == 0: nothing to print. Reading featureValues[0] on an
  // empty vector, as the unguarded else-branch used to do, is undefined behaviour.
  return index+numScoreComps;
}
/**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder
 *
 * Layout written to the stream:
 *   - one line "<numNodes> <numArcs>"
 *   - for every node that has incoming arcs, in increasing hypergraph id:
 *     a line with the number of incoming arcs, then one line per arc
 *     ("<s> ||| " for an arc without a previous hypothesis, otherwise
 *     "[<start node>] <target words> ||| <feature values>")
 *   - a final node for the end of the sentence: the number of terminal nodes,
 *     then one "[<node>] </s> ||| " line per terminal node
 *
 * \param translationId only used in the text of the exceptions thrown here
 * \param outputSearchGraphStream stream the hypergraph is written to
 *
 * Throws util::Exception (via UTIL_THROW_IF) when the id bookkeeping is
 * inconsistent or when an arc would start at a node that has not been
 * written before the node it ends in.
 */
void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const
{
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
// Moses hypothesis id -> consecutive hypergraph node id (assigned in order of first appearance)
map<int,int> mosesIDToHypergraphID;
// map<int,int> hypergraphIDToMosesID;
// hypergraph ids of nodes whose search graph entry has forward == -1
set<int> terminalNodes;
// hypergraph node id -> indices into searchGraph of the arcs ending at that node
multimap<int,int> hypergraphIDToArcs;
long numNodes = 0;
long endNode = 0;
// First pass: assign hypergraph ids and group the arcs by the node they end in
{
long hypergraphHypothesisID = 0;
for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
// Get an id number for the previous hypothesis
const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo();
if (prevHypo!=NULL) {
int mosesPrevHypothesisID = prevHypo->GetId();
if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) {
mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID;
// hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID;
hypergraphHypothesisID += 1;
}
}
// Get an id number for this hypothesis
// (an arc with a recombination hypothesis ends at that hypothesis' node)
int mosesHypothesisID;
if (searchGraph[arcNumber].recombinationHypo) {
mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
} else {
mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
}
if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
// hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID;
// The terminal check only runs when the node id is first assigned here;
// a node that first received its id as a previous hypothesis is not checked.
bool terminalNode = (searchGraph[arcNumber].forward == -1);
if (terminalNode) {
// Final arc to end node, representing the end of the sentence </s>
terminalNodes.insert(hypergraphHypothesisID);
}
hypergraphHypothesisID += 1;
}
// Record that this arc ends at this node
hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber));
}
// Unique end node
endNode = hypergraphHypothesisID;
// mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID;
numNodes = endNode + 1;
}
// One arc per search graph entry plus one </s> arc per terminal node
long numArcs = searchGraph.size() + terminalNodes.size();
// Print number of nodes and arcs
outputSearchGraphStream << numNodes << " " << numArcs << endl;
// Second pass: write the nodes in increasing hypergraph id, each followed by its incoming arcs
// NOTE(review): a node without incoming arcs (count == 0) writes nothing here
// although it is included in numNodes -- confirm this cannot happen in practice.
for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) {
// int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID];
size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID);
if (count > 0) {
outputSearchGraphStream << count << endl;
pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range =
hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
// index of this arc in searchGraph
int lineNumber = (*it).second;
const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
int mosesHypothesisID;// = thisHypo->GetId();
if (searchGraph[lineNumber].recombinationHypo) {
mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
} else {
mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
}
// int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
// Sanity check: the arc must end in the node that is currently being written
UTIL_THROW_IF(
(hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
util::Exception,
"Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
"Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
". There are " << numNodes << " nodes in the search lattice."
);
const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
if (prevHypo==NULL) {
// no previous hypothesis: written as the sentence start, without feature values
outputSearchGraphStream << "<s> ||| " << endl;
} else {
int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
// The start node must have a smaller id than the node being written
// NOTE(review): the message below lacks a space between "sentence" and the id.
UTIL_THROW_IF(
(startNode >= hypergraphHypothesisID),
util::Exception,
"Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
"The nodes must be output in topological order. The code attempted to violate this restriction."
);
const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
int targetWordCount = targetPhrase.GetSize();
// arc line: "[start node] word word ... ||| feature values"
outputSearchGraphStream << "[" << startNode << "]";
for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
}
outputSearchGraphStream << " ||| ";
OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
outputSearchGraphStream << endl;
}
}
}
}
// Print node and arc(s) for end of sentence </s>
outputSearchGraphStream << terminalNodes.size() << endl;
for (set<int>::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) {
outputSearchGraphStream << "[" << (*it) << "] </s> ||| " << endl;
}
}
/**! Output search graph in HTK standard lattice format (SLF)
 *
 * Writes a header (UTTERANCE, VERSION, base, NODES, LINKS and the feature
 * weights) followed by one "J=" link line per target word of every search
 * graph entry that has a previous hypothesis. A phrase of several words is
 * expanded into a chain of links through consecutive node numbers. Every
 * distinct terminal node additionally gets one link to a single end node.
 *
 * \param translationId used for the UTTERANCE header line
 * \param outputSearchGraphStream stream the lattice is written to
 */
void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const
{
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
long numArcs = 0;
long numNodes = 0;
// hypothesis id -> number of the lattice node in which that hypothesis' phrase ends
map<int,int> nodes;
// end nodes that already have their link to the final node (filled while writing)
set<int> terminalNodes;
// Unique start node
// NOTE(review): presumably hypothesis id 0 is the initial empty hypothesis -- confirm.
nodes[0] = 0;
// First pass: count links and assign node numbers
for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) {
// one link per target word
int targetWordCount = searchGraph[arcNumber].hypo->GetCurrTargetPhrase().GetSize();
numArcs += targetWordCount;
int hypothesisID = searchGraph[arcNumber].hypo->GetId();
if (nodes.count(hypothesisID) == 0) {
// reserve one node per target word; the hypothesis maps to the last of them
numNodes += targetWordCount;
nodes[hypothesisID] = numNodes;
//numNodes += 1;
bool terminalNode = (searchGraph[arcNumber].forward == -1);
if (terminalNode) {
// extra link from this node to the unique end node
numArcs += 1;
}
}
}
numNodes += 1;
// Unique end node
// NOTE(review): this key is a node number, while the other keys of 'nodes' are
// hypothesis ids -- check that it cannot overwrite the entry of a hypothesis
// whose id happens to equal numNodes.
nodes[numNodes] = numNodes;
// SLF header
outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl;
outputSearchGraphStream << "VERSION=1.1" << endl;
outputSearchGraphStream << "base=2.71828182845905" << endl;
// node numbers run from 0 to numNodes inclusive
outputSearchGraphStream << "NODES=" << (numNodes+1) << endl;
outputSearchGraphStream << "LINKS=" << numArcs << endl;
OutputFeatureWeightsForSLF(outputSearchGraphStream);
// Second pass: write the links. arcNumber numbers the written links,
// lineNumber indexes the search graph entries.
for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) {
const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
// entries without a previous hypothesis produce no links
if (prevHypo) {
int startNode = nodes[prevHypo->GetId()];
int endNode = nodes[thisHypo->GetId()];
bool terminalNode = (searchGraph[lineNumber].forward == -1);
const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
int targetWordCount = targetPhrase.GetSize();
for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
// number of words from this one to the end of the phrase (inclusive)
int x = (targetWordCount-targetWordIndex);
outputSearchGraphStream << "J=" << arcNumber;
if (targetWordIndex==0) {
// the first word starts at the previous hypothesis' node
outputSearchGraphStream << " S=" << startNode;
} else {
// later words start at the intermediate node where the preceding word ended
outputSearchGraphStream << " S=" << endNode - x;
}
outputSearchGraphStream << " E=" << endNode - (x-1)
<< " W=" << targetPhrase.GetWord(targetWordIndex);
// only the link of the first word carries the feature values;
// the links of the remaining words are written with zeros
OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);
outputSearchGraphStream << endl;
arcNumber += 1;
}
// connect each terminal node to the unique end node, once per node
if (terminalNode && terminalNodes.count(endNode) == 0) {
terminalNodes.insert(endNode);
outputSearchGraphStream << "J=" << arcNumber
<< " S=" << endNode
<< " E=" << numNodes
<< endl;
arcNumber += 1;
}
}
}
}
void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
const SearchGraphNode& searchNode)
{

View File

@ -93,6 +93,19 @@ class Manager
Manager(Manager const&);
void operator=(Manager const&);
const TranslationSystem* m_system;
private:
// Helper functions to output search graph in HTK standard lattice format
void OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const;
size_t OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
void OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const;
size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
// Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder
void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const;
size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
protected:
// data
// InputType const& m_source; /**< source sentence to be translated */
@ -103,6 +116,7 @@ protected:
size_t interrupted_flag;
std::auto_ptr<SentenceStats> m_sentenceStats;
int m_hypoId; //used to number the hypos as they are created.
size_t m_lineNumber;
void GetConnectedGraph(
std::map< int, bool >* pConnected,
@ -113,7 +127,6 @@ protected:
public:
size_t m_lineNumber;
InputType const& m_source; /**< source sentence to be translated */
Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system);
~Manager();
@ -137,6 +150,8 @@ public:
#endif
void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const;
void OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const;
void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const;
const InputType& GetSource() const {
return m_source;

View File

@ -130,6 +130,8 @@ Parameter::Parameter()
AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)");
AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder)");
AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
#ifdef HAVE_PROTOBUF
AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
@ -177,6 +179,7 @@ Parameter::Parameter()
AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
AddParam("minphr-memory", "Load phrase table in minphr format into memory");
AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. Default is false");
AddParam("include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false");
AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. Default is false");
AddParam("alignment-output-file", "print output word alignments into given file");

View File

@ -162,10 +162,6 @@ bool StaticData::LoadData(Parameter *parameter)
}
}
if(m_parameter->GetParam("sort-word-alignment").size()) {
m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
}
// factor delimiter
if (m_parameter->GetParam("factor-delimiter").size() > 0) {
m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
@ -175,6 +171,16 @@ bool StaticData::LoadData(Parameter *parameter)
SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false );
//word-to-word alignment
// alignments
SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false );
if (m_PrintAlignmentInfo) {
m_needAlignmentInfo = true;
}
if(m_parameter->GetParam("sort-word-alignment").size()) {
m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
}
SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
if (m_PrintAlignmentInfoNbest) {
m_needAlignmentInfo = true;
@ -235,8 +241,19 @@ bool StaticData::LoadData(Parameter *parameter)
}
m_outputSearchGraph = true;
m_outputSearchGraphExtended = true;
} else
} else {
m_outputSearchGraph = false;
}
if (m_parameter->GetParam("output-search-graph-slf").size() > 0) {
m_outputSearchGraphSLF = true;
} else {
m_outputSearchGraphSLF = false;
}
if (m_parameter->GetParam("output-search-graph-hypergraph").size() > 0) {
m_outputSearchGraphHypergraph = true;
} else {
m_outputSearchGraphHypergraph = false;
}
#ifdef HAVE_PROTOBUF
if (m_parameter->GetParam("output-search-graph-pb").size() > 0) {
if (m_parameter->GetParam("output-search-graph-pb").size() != 1) {

View File

@ -171,6 +171,7 @@ protected:
bool m_reportAllFactorsNBest;
std::string m_detailedTranslationReportingFilePath;
bool m_onlyDistinctNBest;
bool m_PrintAlignmentInfo;
bool m_needAlignmentInfo;
bool m_PrintAlignmentInfoNbest;
@ -216,6 +217,8 @@ protected:
bool m_outputWordGraph; //! whether to output word graph
bool m_outputSearchGraph; //! whether to output search graph
bool m_outputSearchGraphExtended; //! ... in extended format
bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF)
bool m_outputSearchGraphHypergraph; //! whether to output search graph in hypergraph
#ifdef HAVE_PROTOBUF
bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf
#endif
@ -458,7 +461,7 @@ public:
return m_nBestFilePath;
}
bool IsNBestEnabled() const {
return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_outputSearchGraphSLF || m_outputSearchGraphHypergraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
#ifdef HAVE_PROTOBUF
|| m_outputSearchGraphPB
#endif
@ -631,6 +634,12 @@ public:
bool GetOutputSearchGraphExtended() const {
return m_outputSearchGraphExtended;
}
bool GetOutputSearchGraphSLF() const {
return m_outputSearchGraphSLF;
}
bool GetOutputSearchGraphHypergraph() const {
return m_outputSearchGraphHypergraph;
}
#ifdef HAVE_PROTOBUF
bool GetOutputSearchGraphPB() const {
return m_outputSearchGraphPB;
@ -722,6 +731,9 @@ public:
const std::string &GetAlignmentOutputFile() const {
return m_alignmentOutputFile;
}
bool PrintAlignmentInfo() const {
return m_PrintAlignmentInfo;
}
bool PrintAlignmentInfoInNbest() const {
return m_PrintAlignmentInfoNbest;
}

View File

@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
if (kneserNeyFlag) {
float D = kneserNey_D3;
if (countEF < 2) D = kneserNey_D1;
if (countEF < 3) D = kneserNey_D2;
else if (countEF < 3) D = kneserNey_D2;
if (D > countEF) D = countEF - 0.01; // sanity constraint
float p_b_E = n1_E / totalCount; // target phrase prob based on distinct

View File

@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) {
if (m_options.isOrientationFlag())
outextractstrOrientation << orientationInfo;
if (m_options.isIncludeSentenceIdFlag()) {
outextractstr << " ||| " << sentence.sentenceID;
}
if (m_options.getInstanceWeightsFile().length()) {
if (m_options.isTranslationFlag()) {
outextractstr << " ||| " << sentence.weightString;
@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) {
}
}
if (m_options.isIncludeSentenceIdFlag()) {
outextractstr << " ||| " << sentence.sentenceID;
}
if (m_options.isTranslationFlag()) outextractstr << "\n";
if (m_options.isTranslationFlag()) outextractstrInv << "\n";

View File

@ -13,10 +13,10 @@ chomp(@OUT);
while(<SRC>) {
chomp;
if (/^<srcset/) {
s/<srcset/<tstset trglang="$language"/;
s/<srcset/<tstset trglang="$language"/i;
}
elsif (/^<\/srcset/) {
s/<\/srcset/<\/tstset/;
s/<\/srcset/<\/tstset/i;
}
elsif (/^<doc/i) {
s/ *sysid="[^\"]+"//;
@ -26,10 +26,10 @@ while(<SRC>) {
my $line = shift(@OUT);
$line = "" if $line =~ /NO BEST TRANSLATION/;
if (/<\/seg>/) {
s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/;
s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i;
}
else {
s/(<seg[^>]+> *)[^<]*/$1$line/;
s/(<seg[^>]+> *)[^<]*/$1$line/i;
}
}
print $_."\n";

View File

@ -179,10 +179,13 @@ sub apply {
$word =~ s/\|.+//g; # just first factor
my $lc = lc($word);
print STDERR "considering $word ($lc)...\n" if $VERBOSE;
# don't split frequent words
if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) {
if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
$lc !~ /[a-zA-Z]/) {; # has to have at least one letter
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
print $factored_word;
print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
next;
}

View File

@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span
sub extract_sgml_tag_attribute
{
my ($name, $data) = @_;
($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : ();
}
#################################

View File

@ -6,11 +6,12 @@ use Getopt::Long "GetOptions";
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
my ($SRC,$INFILE);
my ($SRC,$INFILE,$UNBUFFERED);
die("detruecase.perl < in > out")
unless &GetOptions('headline=s' => \$SRC,
'in=s' => \$INFILE);
'in=s' => \$INFILE,
'b|unbuffered' => \$UNBUFFERED);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);

View File

@ -4,7 +4,7 @@
use strict;
use Getopt::Long "GetOptions";
my ($SRC,$INFILE,$RECASE_MODEL);
my ($SRC,$INFILE,$RECASE_MODEL,$UNBUFFERED);
my $MOSES = "moses";
my $LANGUAGE = "en"; # English by default;
die("recase.perl --in file --model ini-file > out")
@ -12,9 +12,11 @@ die("recase.perl --in file --model ini-file > out")
'headline=s' => \$SRC,
'lang=s' => \$LANGUAGE,
'moses=s' => \$MOSES,
'model=s' => \$RECASE_MODEL)
'model=s' => \$RECASE_MODEL,
'b|unbuffered' => \$UNBUFFERED)
&& defined($INFILE)
&& defined($RECASE_MODEL);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
my %treated_languages = map { ($_,1) } qw/en cs/;
die "I don't know any rules for $LANGUAGE. Use 'en' as the default."

View File

@ -8,9 +8,11 @@ binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
# apply switches
my $MODEL;
die("truecase.perl --model truecaser < in > out")
unless &GetOptions('model=s' => \$MODEL);
my ($MODEL, $UNBUFFERED);
die("truecase.perl --model MODEL [-b] < in > out")
unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
&& defined($MODEL);
if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
my (%BEST,%KNOWN);
open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");

View File

@ -171,7 +171,7 @@ if ($TIMING)
# tokenize a batch of texts saved in an array
# input: an array containing a batch of texts
# return: another array cotaining a batch of tokenized texts for the input array
# return: another array containing a batch of tokenized texts for the input array
sub tokenize_batch
{
my(@text_list) = @_;

View File

@ -47,7 +47,7 @@ my $l1input = "$corpus.$l1";
if (-e $l1input) {
$opn = $l1input;
} elsif (-e $l1input.".gz") {
$opn = "zcat $l1input.gz |";
$opn = "gunzip -c $l1input.gz |";
} else {
die "Error: $l1input does not exist";
}
@ -57,7 +57,7 @@ my $l2input = "$corpus.$l2";
if (-e $l2input) {
$opn = $l2input;
} elsif (-e $l2input.".gz") {
$opn = "zcat $l2input.gz |";
$opn = "gunzip -c $l2input.gz |";
} else {
die "Error: $l2input does not exist";
}
@ -160,3 +160,4 @@ sub word_count {
my @w = split(/ /,$line);
return scalar @w;
}

View File

@ -40,7 +40,8 @@ def printUsage():
def main():
parser = optparse.OptionParser()
parser.add_option("-c", "--min-non-initial-rule-count",
action="store", dest="minCount", type="int", default="1",
action="store", dest="minCount",
type="float", default="0.0",
help="prune non-initial rules where count is below N",
metavar="N")
(options, args) = parser.parse_args()