Merge branch 'master' of github.com:moses-smt/mosesdecoder

Hieu Hoang 2013-03-15 20:38:42 +00:00
commit 18e8f12d5e
65 changed files with 2028 additions and 165 deletions

.gitmodules

@ -0,0 +1,3 @@
[submodule "contrib/arrow-pipelines/python/libs/pypeline"]
path = contrib/arrow-pipelines/python/libs/pypeline
url = git://github.com/ianj-als/pypeline.git


@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES
Generally, for trouble installing external libraries, you should get support
directly from the library maker:
-Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html
+Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html
IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm
SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user

NOTICE

@ -1,3 +1,5 @@
This code includes data from Daniel Naber's Language Tools (czech abbreviations).
This code includes data from czech wiktionary (also czech abbreviations).


@ -0,0 +1,32 @@
Arrow Based Moses Training Pipeline
===================================
To use the demonstration, you must first initialise the git submodules for this clone. Return to the top-level directory and issue the following commands:
$ git submodule init
$ git submodule update
This clones the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline:
$ cd libs/pypeline
$ python setup.py install
Alternatively, you can add the Pypeline library to an appropriate PYTHONPATH environment variable.
This demonstration implements the training pipeline shown in the Dia diagram at ../documentation/training-pipeline/moses-pypeline.dia.
Three environment variables need to be set before the manager.py script can be run:
- MOSES_HOME : The directory where Moses has been cloned, or installed,
- IRSTLM : The installation directory of your IRSTLM, and
- GIZA_HOME : The installation directory of GIZA++.
The manager.py script takes four positional command-line arguments:
- The source language code,
- The target language code,
- The source corpus file. This file *must* be cleaned prior to use, and
- The target corpus file. This file *must* be cleaned prior to use.
For example, run the manager.py script with:
$ python manager.py en lt cleantrain.en cleantrain.lt
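Each component module that manager.py loads (see the component_modules table in manager.py below) is expected to expose two functions: configure(), which picks the component's settings out of the global configuration dictionary, and initialise(), which returns the component's worker function. The following minimal sketch only illustrates that contract; the dictionary keys are illustrative and the module is not part of this commit.

def configure(args):
    # Select the global settings this component cares about.
    return {'segment_length': args['segment_length_limit']}

def initialise(config):
    # Return a function of (value, state). build_components() in manager.py
    # wraps it with cons_function_component() to turn it into a pipeline
    # component.
    def process(a, s):
        # 'a' is the value dictionary flowing through the pipeline; return
        # whatever keys the downstream components expect.
        return {'cleaned_src_filename': a['src_filename'] + '.clean'}
    return process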

@ -0,0 +1 @@
Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb


@ -0,0 +1,192 @@
import logging
import os
from concurrent.futures import Future, ThreadPoolExecutor
from functools import partial
from pypeline.helpers.parallel_helpers import eval_pipeline, \
cons_function_component, \
cons_wire, \
cons_split_wire, \
cons_unsplit_wire, \
cons_dictionary_wire
#
# Some logging please
#
FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s'
logging.basicConfig(format = FORMAT, level = logging.DEBUG)
logger = logging.getLogger("manager")
# Build the pipeline components
def build_components(components, configuration, executor):
pipeline_components = dict()
pipeline_configuration = dict()
for component_id, module_name in components.items():
logger.info("Loading [%s] component from [%s]..." % (component_id, module_name))
module = __import__(module_name, fromlist = ['configure', 'initialise'])
# Component builds its own configuration object
config_func = getattr(module, 'configure')
component_config = config_func(configuration)
pipeline_configuration.update(component_config)
# Now build the component
init_func = getattr(module, 'initialise')
component_function = init_func(component_config)
# A wrapper for the component's function that submits to the executor
def get_component_function_wrapper(inner_function, comp_id, mod_name):
def component_function_wrapper(a, s):
logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \
(comp_id, mod_name, a, s))
return inner_function(a, s)
return component_function_wrapper
# Arrowize the component
component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name))
# And store
pipeline_components[component_id] = component
return pipeline_components, pipeline_configuration
# Go!
def main(src_lang, trg_lang, src_filename, trg_filename):
# Global configuration
# One day, this configuration shall be constructed from
# command line options, or a properties file.
configuration = {
'moses_installation_dir': os.environ['MOSES_HOME'],
'irstlm_installation_dir': os.environ['IRSTLM'],
'giza_installation_dir': os.environ['GIZA_HOME'],
'src_lang': src_lang,
'src_tokenisation_dir': './tokenisation',
'trg_lang': trg_lang,
'trg_tokenisation_dir': './tokenisation',
'segment_length_limit': 60,
'irstlm_smoothing_method': 'improved-kneser-ney',
'language_model_directory': './language-model',
'translation_model_directory': './translation-model',
'mert_working_directory': './mert',
'evaluation_data_size': 100,
'development_data_size': 100
}
# The modules to load
# In the future, the components shall be specified in some kind of
# pipeline description file.
component_modules = {
'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
'cleanup': 'training.components.cleanup.cleanup',
'data_split': 'training.components.data_split.data_split',
'irstlm_build': 'training.components.irstlm_build.irstlm_build',
'model_training': 'training.components.model_training.model_training',
'mert': 'training.components.mert.mert'
}
# The thread pool
executor = ThreadPoolExecutor(max_workers = 3)
# Phew, build the required components
components, component_config = build_components(component_modules, configuration, executor)
#
# Wire up components
# Description of wiring should be, in the future, alongside the component
# specification in some kind of configuration file. Components shall be
# declared then used, i.e., bind a component instance to a unique component
# identifier, then wire component instances together by identifier.
#
#
# Tokenisation of source and target...
#
# IRSTLM Build components
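# A note on the wiring combinators used below: cons_split_wire() appears to
# duplicate the value into a (top, bottom) pair, .second() applies a component
# to the bottom half only, and cons_unsplit_wire() merges the two halves with
# the given function (t = top, b = bottom). This reading is inferred from the
# usage in this file rather than from the pypeline documentation.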
irstlm_build_component = cons_split_wire() >> \
(cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \
components['irstlm_build']).second() >> \
cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'],
'trg_language_model_filename': b['compiled_lm_filename']})
# The complete tokenisation component
tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \
irstlm_build_component.second() >> \
cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'],
'trg_filename': b['tokenised_trg_filename'],
'trg_language_model_filename': b['trg_language_model_filename']})
#
# Cleanup and Data Splitting...
#
#
# A function that clips off the last '.' delimited string
#
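# For example, 'cleantrain.clean.train.en' becomes 'cleantrain.clean.train',
# i.e. the corpus stem that train-model.perl expects alongside the -f/-e
# language extensions (filenames here are illustrative).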
def clip_last_bit(filename):
bn = os.path.basename(filename)
directory = os.path.dirname(filename)
bits = bn.split(".")
bits.pop()
return os.path.join(directory, ".".join(bits))
cleanup_datasplit_component = components['cleanup'] >> \
cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'],
'trg_filename': a['cleaned_trg_filename']}) >> \
components['data_split'] >> \
cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']),
'eval_src_filename': a['eval_src_filename'],
'eval_trg_filename': a['eval_trg_filename']})
#
# Translation model training
#
translation_model_component = cons_split_wire() >> \
components['model_training'].first() >> \
cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
'development_data_filename': b['eval_src_filename']})
#
# The whole pipeline
#
pipeline = tokenisation_component >> \
cons_split_wire() >> \
(cleanup_datasplit_component >> translation_model_component).first() >> \
cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
'development_data_filename': clip_last_bit(t['development_data_filename']),
'trg_language_model_filename': b['trg_language_model_filename'],
'trg_language_model_order': 3,
'trg_language_model_type': 9}) >> \
components['mert']
#
# The input to the pipeline
#
value = {'src_filename': src_filename,
'trg_filename': trg_filename}
#
# Evaluate the pipeline
#
logger.info("Evaluating pipeline with input [%s]..." % value)
new_value = eval_pipeline(executor, pipeline, value, component_config)
#
# Wait for all components to finish
#
executor.shutdown(True)
logger.info("Pipeline evaluated to %s" % new_value)
if __name__ == '__main__':
import sys
main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])


@ -0,0 +1,11 @@
import subprocess
def cat(filename, content):
fh = open(filename, "w")
for line in content:
#print(line, file=fh)
print >> fh, line
fh.close()
def diff(filename1, filename2):
subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT)


@ -0,0 +1,125 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
result = {}
result['segment_length'] = args['segment_length_limit']
return result
def initialise(config):
def _filter(limit, ifh1, ofh1, ifh2, ofh2):
def _short(line):
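# Counts spaces as a cheap proxy for the number of tokens on the line.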
n = 0
for c in line:
if c == " ":
n += 1
#print(line, ":", n)
return n < limit
for (l1, l2) in zip(ifh1, ifh2):
if _short(l1) and _short(l2):
print >>ofh1, l1,
print >>ofh2, l2,
def _make_cleaned_filename(filename):
bits = filename.split(".")
bits[-1] = "clean"
return ".".join(bits)
def _filter_main(value, config):
limit = config['segment_length']
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
input_src_filename = value['src_filename']
input_trg_filename = value['trg_filename']
print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)
ifh1 = open(input_src_filename, "r")
ifh2 = open(input_trg_filename, "r")
cleaned_src_filename = _make_cleaned_filename(input_src_filename)
cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
ofh1 = open(cleaned_src_filename, "w")
ofh2 = open(cleaned_trg_filename, "w")
_filter(limit, ifh1, ofh1, ifh2, ofh2)
return {'cleaned_src_filename': cleaned_src_filename,
'cleaned_trg_filename': cleaned_trg_filename}
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return _filter_main
if __name__ == '__main__':
import os
import tempfile
import test.test as thelp
from pypeline.helpers.helpers import eval_pipeline
def _test_main():
configuration = {'segment_length_limit': 20}
src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
box_eval = {
'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'cleaned_src_file_expected': src_filename[1] + ".expected",
'cleaned_trg_file_expected': trg_filename[1] + ".expected"
}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
box_config = configure(configuration)
box = initialise(box_config)
output = eval_pipeline(box, box_eval, box_config)
try:
thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
finally:
os.unlink(output['cleaned_src_filename'])
os.unlink(output['cleaned_trg_filename'])
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
#expected output:
thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()


@ -0,0 +1,109 @@
from __future__ import print_function
from pypeline.helpers.helpers import cons_function_component
def configure(args):
result = {}
result['segment_length'] = args['segment_length_limit']
return result
def initialise(config):
def _filter(limit, ifh1, ofh1, ifh2, ofh2):
def _short(line):
n = 0
for c in line:
if c == " ":
n += 1
#print(line, ":", n)
return n < limit
for (l1, l2) in zip(ifh1, ifh2):
if _short(l1) and _short(l2):
print(l1, end='', file=ofh1)
print(l2, end='', file=ofh2)
def _filter_main(config, value):
limit = config['segment_length']
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
ifh1 = open(value['src_filename'], "r")
ifh2 = open(value['trg_filename'], "r")
ofh1 = open(value['cleaned_src_filename'], "w")
ofh2 = open(value['cleaned_trg_filename'], "w")
_filter(limit, ifh1, ofh1, ifh2, ofh2)
return {'cleaned_src_filename': value['cleaned_src_filename'],
'cleaned_trg_filename': value['cleaned_trg_filename']}
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return cons_function_component(_filter_main)
if __name__ == '__main__':
import os
import tempfile
import training.components.shared.test as thelp
def _test_main():
configuration = {'segment_length_limit': 20}
src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp")
box_eval = {
'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'cleaned_src_filename': src_filename[1] + ".clean",
'cleaned_trg_filename': trg_filename[1] + ".clean",
'cleaned_src_file_expected': src_filename[1] + ".expected",
'cleaned_trg_file_expected': trg_filename[1] + ".expected"
}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
from pypeline.helpers.helpers import run_pipeline
box_config = configure(configuration)
box = initialise(box_config)
run_pipeline(box, box_config, box_eval)
thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename'])
thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename'])
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
#expected output:
thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()


@ -0,0 +1,146 @@
from pypeline.helpers.helpers import cons_function_component
def configure(args):
result = {}
result['evaluate_size'] = args['evaluation_data_size']
result['development_size'] = args['development_data_size']
return result
def initialise(config):
def _copy(size, inp, ofh1, ofh2):
try:
while size != 0:
(l1, l2) = inp.next()
print >>ofh1, l1,
print >>ofh2, l2,
size -= 1
except StopIteration:
pass
def _make_split_filename(filename, data_set):
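# Inserts the data-set name before the trailing language code, assuming
# names of the form <stem>.<lang>.<ext>; e.g. 'corpus.en.clean' with
# data_set 'devel' becomes 'corpus.clean.devel.en'.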
bits = filename.split(".")
last = bits.pop()
lang_code = bits.pop()
bits.append(last)
bits.append(data_set)
bits.append(lang_code)
new_filename = ".".join(bits)
return new_filename
def _splitter_main(value, config):
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
input_src_filename = value['src_filename']
input_trg_filename = value['trg_filename']
ifh1 = open(input_src_filename, "r")
ifh2 = open(input_trg_filename, "r")
inp = iter(zip(ifh1, ifh2))
result = {}
for (data_set, size) in [
('devel', config['development_size']),
('eval', config['evaluate_size']),
('train', -1)
]:
output_src_filename = _make_split_filename(input_src_filename, data_set)
output_trg_filename = _make_split_filename(input_trg_filename, data_set)
ofh1 = open(output_src_filename, "w")
ofh2 = open(output_trg_filename, "w")
_copy(size, inp, ofh1, ofh2)
result[data_set + '_src_filename'] = output_src_filename
result[data_set + '_trg_filename'] = output_trg_filename
return result
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return _splitter_main
if __name__ == '__main__':
import os
import tempfile
import test.test as thelp
from pypeline.helpers.helpers import eval_pipeline
def _test_main():
configuration = {
'evaluation_data_size': 7,
'development_data_size': 13,
}
src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
box_eval = {
'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'devel_src_expected': src_filename[1] + ".devel.expected",
'devel_trg_expected': trg_filename[1] + ".devel.expected",
'eval_src_expected': src_filename[1] + ".eval.expected",
'eval_trg_expected': trg_filename[1] + ".eval.expected",
'train_src_expected': src_filename[1] + ".train.expected",
'train_trg_expected': trg_filename[1] + ".train.expected",
}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
box_config = configure(configuration)
box = initialise(box_config)
output = eval_pipeline(box, box_eval, box_config)
for data_set in ['devel', 'eval', 'train']:
for lang in ['src', 'trg']:
filename = output[data_set + '_' + lang + '_filename']
filename_expected = box_eval[data_set + '_' + lang + '_expected']
thelp.diff(filename_expected, filename)
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line(range(50)))
thelp.cat(box_eval['trg_filename'], _line(range(50)))
#expected output:
thelp.cat(box_eval['devel_src_expected'], _line(range(0,13)))
thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13)))
thelp.cat(box_eval['eval_src_expected'], _line(range(13,20)))
thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20)))
thelp.cat(box_eval['train_src_expected'], _line(range(20,50)))
thelp.cat(box_eval['train_trg_expected'], _line(range(20,50)))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()


@ -0,0 +1,106 @@
import os
import shutil
import subprocess
import tempfile
from pypeline.helpers.helpers import cons_function_component
def configure(args):
config = dict()
config['irstlm_install_directory'] = args['irstlm_installation_dir']
config['smoothing_method'] = args['irstlm_smoothing_method']
config['lm_directory'] = args['language_model_directory']
return config
def initialise(config):
def process(a, s):
# Create the LM directory if we need to
if os.path.exists(s['lm_directory']) is False:
os.makedirs(s['lm_directory'])
# The filename of the file to chew through
start_end_input_filename = a['input_filename']
if os.path.exists(start_end_input_filename) is False:
raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename)
# Derive the output file name for the add start-end marker processor
filename_bits = os.path.basename(start_end_input_filename).split(".")
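# The tokenised input is assumed to be named <corpus>.<langpair>.tok.<lang>
# (as in the example under __main__ below), so element 2 is the processing
# stage tag that is rewritten for each derived file.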
filename_bits[2] = "sb"
start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
# Derive the output file name of the LM build
filename_bits[2] = "lm"
lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
# Derive the compiled LM file name
filename_bits[2] = "arpa"
compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
# First thing to do is add start and end markers
start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")]
infile = open(start_end_input_filename, 'r')
outfile = open(start_end_output_filename, 'w')
print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline)
return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile)
if return_code:
raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \
start_end_input_filename, start_end_output_filename, return_code)
# Next build the language model
tmp_dir = tempfile.mkdtemp(dir = "/tmp")
try:
build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"),
"-i", start_end_output_filename,
"-t", tmp_dir,
"-p",
"-s", s['smoothing_method'],
"-o", lm_filename]
print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline)
return_code = subprocess.check_call(build_lm_cmdline)
if return_code:
raise Exception("IRST language model failed to build: return code = [%d]" % return_code)
finally:
if os.path.exists(tmp_dir):
shutil.rmtree(tmp_dir)
# Compile the LM
lm_filename = lm_filename + ".gz"
compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"),
"--text", "yes",
lm_filename,
compiled_lm_filename]
print "IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline)
return_code = subprocess.check_call(compile_lm_cmdline)
if return_code:
raise Exception("IRST language model compilation failed: return code = [%d]" % return_code)
output = {'add_start_end_filename': start_end_output_filename,
'lm_filename': lm_filename,
'compiled_lm_filename': compiled_lm_filename}
print "IRSTLM Build: Output = %s" % output
return output
return process
if __name__ == '__main__':
from pypeline.helpers.helpers import eval_pipeline
lm_dir = os.environ["PWD"]
configuration = {'irstlm_installation_dir': os.environ["IRSTLM"],
'irstlm_smoothing_method': 'improved-kneser-ney',
'language_model_directory': lm_dir}
component_config = configure(configuration)
component = initialise(component_config)
value = eval_pipeline(component,
{'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
component_config)
target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'),
'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')}
print "Target: %s" % target
if value != target:
raise Exception("Massive fail!")


@ -0,0 +1,83 @@
#!/usr/bin/env python
import os, shutil, subprocess
from pypeline.helpers.helpers import cons_function_component
def configure(args):
result = {}
result['src_lang'] = args['src_lang']
result['trg_lang'] = args['trg_lang']
result['moses_installation_dir'] = args['moses_installation_dir']
result['mert_working_dir'] = args['mert_working_directory']
return result
def initialise(config):
def process(a, s):
infilename = os.path.abspath(a['development_data_filename'])
lm_file = os.path.abspath(a['trg_language_model_filename'])
lm_order = int(a['trg_language_model_order'])
lm_type = int(a['trg_language_model_type'])
orig_moses_ini = os.path.abspath(a['moses_ini_file'])
if not os.path.exists(orig_moses_ini):
raise Exception, "Error: Input moses.ini does not exist"
workdir = os.path.abspath(config['mert_working_dir'])
#simply call the training perl script
#remove the workdir if it is already there
if os.path.exists(workdir):
shutil.rmtree(workdir)
os.makedirs(workdir)
#local vars
moses_install_dir = os.path.abspath(config['moses_installation_dir'])
mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl')
bin_dir = os.path.join(moses_install_dir, 'bin')
moses_bin = os.path.join(moses_install_dir, 'bin', 'moses')
src_file = infilename + '.' + config['src_lang']
ref_file = infilename + '.' + config['trg_lang']
logfile = os.path.join(workdir, 'log')
#change lm configuration in moses ini
moses_ini = os.path.join(workdir, 'trained-moses.ini')
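# The sed command below replaces everything from the '[lmodel-file]' line up
# to the next blank line with a fresh section containing a single entry of
# the form '<lm_type> 0 <lm_order> <lm_file>'.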
cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s"
cmd = cmd % locals()
os.system(cmd)
#the command
cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s'
cmd = cmd % locals()
pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
pipe.wait()
#check the moses ini
new_mosesini = os.path.join(workdir, 'moses.ini')
if not os.path.exists(new_mosesini):
raise Exception, 'Failed MERT'
return {'moses_ini_file':new_mosesini}
return process
if __name__ == '__main__':
def __test():
configuration = {'src_lang':'en',
'trg_lang':'lt',
'moses_installation_dir':os.path.abspath('../../../../'),
'mert_working_directory':'../../../../../tuning'}
values = {'development_data_filename':'../../../../../corpus/tune',
'moses_ini_file':'../../../../../model/model/moses.ini',
'trg_language_model_filename':'../../../../../corpus/train.lt.lm',
'trg_language_model_type':9,
'trg_language_model_order':4}
from pypeline.helpers.helpers import run_pipeline
box_config = configure(configuration)
box = initialise(box_config)
print run_pipeline(box, values, None)
#do some test
__test()


@ -0,0 +1,72 @@
#!/usr/bin/env python
import os, shutil, subprocess
from pypeline.helpers.helpers import cons_function_component
def configure(args):
result = {}
result['src_lang'] = args['src_lang']
result['trg_lang'] = args['trg_lang']
result['moses_installation_dir'] = args['moses_installation_dir']
result['external_bin_dir'] = args['giza_installation_dir']
result['model_directory'] = args['translation_model_directory']
return result
def initialise(config):
def process(a, s):
infilename = os.path.abspath(a['training_data_filename'])
workdir = os.path.abspath(config['model_directory'])
#simply call the training perl script
#remove the workdir if it is already there
if os.path.exists(workdir):
shutil.rmtree(workdir)
os.makedirs(workdir)
#local vars
train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl'
src_lang = config['src_lang'].lower()
trg_lang = config['trg_lang'].lower()
external_bin = os.path.abspath(config['external_bin_dir'])
#create a dummy lm file
dummy_lmfile = workdir + os.sep + 'dummy.lm'
f = open(dummy_lmfile, 'w')
print >> f, "dummy lm file"
f.close()
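# The dummy LM only satisfies train-model.perl's -lm argument; the real
# language model is wired into moses.ini later by the mert component.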
logfile = workdir + os.sep + 'log'
#the command
cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s'
cmd = cmd % locals()
pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
pipe.wait()
#check the moses ini
mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini'
if not os.path.exists(mosesini):
raise Exception, 'Failed training model'
return {'moses_ini_file':mosesini}
return process
if __name__ == '__main__':
def __test():
configuration = {'src_lang':'en',
'trg_lang':'lt',
'moses_installation_dir':os.environ['MOSES_HOME'],
'giza_installation_dir':os.environ['GIZA_HOME'],
'translation_model_directory':'model-dir'}
values = {'training_data_filename':'/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
from pypeline.helpers.helpers import run_pipeline
box_config = configure(configuration)
box = initialise(box_config)
print run_pipeline(box, values, None)
#do some test
__test()


@ -0,0 +1,43 @@
#!/usr/bin/env python
import os
from tokenizer import Tokenizer
from pypeline.helpers.helpers import cons_function_component
def configure(args):
result = {}
result['src_lang'] = args['src_lang']
result['src_tokenisation_dir'] = args['src_tokenisation_dir']
result['moses_installation_dir'] = args['moses_installation_dir']
return result
def initialise(config):
def process(a, s):
infilename = a['src_filename']
outfilename = Tokenizer.batch_tokenise(
config['src_lang'],
config['moses_installation_dir'],
infilename,
config['src_tokenisation_dir'])
return {'tokenised_src_filename':outfilename}
return process
if __name__ == '__main__':
def __test():
configuration = {'src_lang':'de',
'src_tokenisation_dir':'tmptok',
'moses_installation_dir':os.path.abspath('../../../../')}
values = {'src_filename':'tmp.de'}
from pypeline.helpers.helpers import run_pipeline
box_config = configure(configuration)
box = initialise(configuration)
print run_pipeline(box, values, None)
#do some test
__test()


@ -0,0 +1,3 @@
asdfweoih
awfwoeijf awefo
what's this


@ -0,0 +1,36 @@
#!/usr/bin/env python
import sys, os, subprocess
class Tokenizer:
@staticmethod
def batch_tokenise(lang, mosesdir, infilename, workdir):
print "Tokenizing [%s] in working directory [%s]..." % (infilename, workdir)
if not os.path.exists(workdir):
os.makedirs(workdir)
tok = Tokenizer(lang, mosesdir)
basefilename = os.path.basename(infilename)
outfilename = workdir + os.sep + basefilename + '.tok'
tok.file_tokenise(infilename, outfilename)
return outfilename
def __init__(self, lang, mosesdir):
self.arrows = None
self.lang = lang
#check the perl tokenizer is here
#path = os.path.dirname(os.path.abspath(__file__))
path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer'
self.perltok = path + os.sep + 'tokenizer.perl'
if not os.path.exists(self.perltok):
raise Exception, "Perl tokenizer does not exist"
def file_tokenise(self, infilename, outfilename):
cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename)
pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
pipe.wait()
if __name__ == '__main__':
#do some test
pass


@ -0,0 +1,43 @@
#!/usr/bin/env python
import os
from tokenizer import Tokenizer
from pypeline.helpers.helpers import cons_function_component
def configure(args):
result = {}
result['trg_lang'] = args['trg_lang']
result['trg_tokenisation_dir'] = args['trg_tokenisation_dir']
result['moses_installation_dir'] = args['moses_installation_dir']
return result
def initialise(config):
def process(a, s):
infilename = a['trg_filename']
outfilename = Tokenizer.batch_tokenise(
config['trg_lang'],
config['moses_installation_dir'],
infilename,
config['trg_tokenisation_dir'])
return {'tokenised_trg_filename':outfilename}
return process
if __name__ == '__main__':
def __test():
configuration = {'trg_lang':'de',
'trg_tokenisation_dir':'tmptoktrg',
'moses_installation_dir':os.path.abspath('../../../../')}
values = {'trg_filename':'tmp.de'}
from pypeline.helpers.helpers import run_pipeline
box_config = configure(configuration)
box = initialise(configuration)
print run_pipeline(box, values, None)
#do some test
__test()


@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.846397978." name="/" resourcePath=""> <folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.846397978." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.725420545" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug"> <toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.725420545" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1586272140" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/> <targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1586272140" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> <builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.30521110" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.30521110" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.478334849" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug"> <tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.478334849" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.1328561226" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input"> <inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.1328561226" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@ -133,8 +133,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
-<storageModule moduleId="refreshScope" versionNumber="1">
-<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
+<storageModule moduleId="refreshScope" versionNumber="2">
+<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>


@ -18,11 +18,14 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1133345948." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1405862229" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.605722566" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
-<builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+<builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1956867596" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1512268277" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2143789149" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.285958391" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.966722418" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1839105433" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.554846982" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
@ -119,5 +122,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
-<storageModule moduleId="refreshScope"/>
+<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/extractor"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/extractor"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>


@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.351042750." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.640882096" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.793478365" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
-<builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
+<builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.1252826468" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1024598065" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.139111896" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@ -131,7 +131,14 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
-<storageModule moduleId="refreshScope"/>
+<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/lm"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/lm"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject> </cproject>


@ -141,11 +141,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/build_binary</locationURI>
</link>
<link>
<name>build_binary.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/build_binary.cc</locationURI>
</link>
<link>
<name>clean.sh</name>
<type>1</type>
@ -176,11 +171,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/facade.hh</locationURI>
</link>
<link>
<name>fragment.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/fragment.cc</locationURI>
</link>
<link>
<name>left.hh</name>
<type>1</type>
@ -211,11 +201,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/lm_exception.hh</locationURI>
</link>
<link>
<name>max_order.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/max_order.cc</locationURI>
</link>
<link>
<name>max_order.hh</name>
<type>1</type>
@ -241,11 +226,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/model_type.hh</locationURI>
</link>
<link>
<name>ngram_query.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/ngram_query.cc</locationURI>
</link>
<link>
<name>ngram_query.hh</name>
<type>1</type>


@ -7,7 +7,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Debug"/>
-<entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
+<entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@ -23,13 +23,14 @@
<folderInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.lib.debug.1932340583" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.lib.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.lib.debug.296711714" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.lib.debug"/>
-<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
+<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.lib.debug.89397980" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.lib.debug"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug">
<option id="gnu.cpp.compiler.lib.debug.option.optimization.level.469164841" name="Optimization Level" superClass="gnu.cpp.compiler.lib.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.lib.debug.option.debugging.level.1050747398" name="Debug Level" superClass="gnu.cpp.compiler.lib.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1565260476" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1183866856" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
@ -45,11 +46,8 @@
</tool>
</toolChain>
</folderInfo>
<fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.626295813" name="extractor.cpp" rcbsApplicability="disable" resourcePath="mert/extractor.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460">
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/>
</fileInfo>
<sourceEntries>
-<entry excluding="mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
+<entry excluding="mert/UtilTest.cpp|mert/TimerTest.cpp|mert/SingletonTest.cpp|mert/PointTest.cpp|mert/OptimizerFactoryTest.cpp|mert/NgramTest.cpp|mert/FeatureDataTest.cpp|mert/DataTest.cpp|mert/ReferenceTest.cpp|mert/VocabularyTest.cpp|mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
@ -61,7 +59,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Release"/>
-<entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
+<entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@ -119,5 +117,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
-<storageModule moduleId="refreshScope"/>
+<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/mert_lib"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/mert_lib"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>


@ -19,7 +19,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.162355801." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1633424067" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1437309068" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
-<builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+<builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1247128100" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1163099464" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -46,6 +46,7 @@
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.816413868" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug"> <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.816413868" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.paths.330225535" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths"> <option id="gnu.cpp.link.option.paths.330225535" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/> <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
@ -154,8 +155,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
-<storageModule moduleId="refreshScope" versionNumber="1">
-<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+<storageModule moduleId="refreshScope" versionNumber="2">
+<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>


@ -19,7 +19,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.461114338." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1896491482" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.2144309834" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
-<builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+<builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1278274354" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.626095182" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2084031389" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@ -46,6 +46,8 @@
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1546774818" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug"> <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1546774818" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.paths.523170942" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths"> <option id="gnu.cpp.link.option.paths.523170942" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/> <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/> <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/> <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686&quot;"/> <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686&quot;"/>
@ -155,8 +157,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/> <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo> </scannerConfigBuildInfo>
</storageModule> </storageModule>
<storageModule moduleId="refreshScope" versionNumber="1"> <storageModule moduleId="refreshScope" versionNumber="2">
<resource resourceType="PROJECT" workspacePath="/moses-cmd"/> <configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
</configuration>
</storageModule> </storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>

View File

@@ -1,7 +1,5 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
- <?fileVersion 4.0.0?>
- <cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.656913512">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.656913512" moduleId="org.eclipse.cdt.core.settings" name="Debug">
@@ -9,7 +7,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/moses"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/moses/Debug"/>
- <entry flags="RESOLVED" kind="libraryFile" name="moses"/>
<entry flags="RESOLVED" kind="libraryFile" name="moses" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@@ -26,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1793369992" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1051650049" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1976472988" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1759650532" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -152,8 +150,14 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope" versionNumber="1">
- <resource resourceType="PROJECT" workspacePath="/moses"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/moses"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/moses"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>
@@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.722547278." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1512691763" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.633526059" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.854512708" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1096845166" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.240381177" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -127,6 +127,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/search"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/search"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>
@@ -156,11 +156,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/search/vertex.hh</locationURI>
</link>
- <link>
- <name>vertex_generator.cc</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/search/vertex_generator.cc</locationURI>
- </link>
<link>
<name>vertex_generator.hh</name>
<type>1</type>
@@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1869657447." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.1388624938" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1873607607" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
- <builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.589471640" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1543780089" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.635667684" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@@ -136,8 +136,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope" versionNumber="1">
- <resource resourceType="PROJECT" workspacePath="/util"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/util"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/util"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>

contrib/rpm/README
@@ -0,0 +1,42 @@
Building Moses RPM
==================
*** WARNING ***
Before completing *any* of the tasks outlined in this README, please commit and push any changes you wish to be included in your installer.
*** WARNING ***
Building the RPM SPEC file
--------------------------
The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed for the RPM build; it needs the following information:
- The Git repository from which an installer will be built,
- The branch in the Git repository to build, and
- The version of the installed Moses distribution.
For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git):
$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0
This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS.
Building the RPM
----------------
Change directory to $HOME/rpmbuild, and build the binary RPM with:
$ rpmbuild -bb SPECS/moses.spec
This will download IRSTLM v5.70.04 and GIZA++ v2, build them along with Moses, and write the RPM to $HOME/rpmbuild/RPMS/<architecture>/moses-<version>-1.<architecture>.rpm.
For example, building v1.0 on a 64-bit Intel architecture produces moses-1.0-1.x86_64.rpm.
Building a Debian package
-------------------------
The Alien tool converts RPM packages to Debian packages. If a Debian package is required, follow the instructions on the following web page:
https://help.ubuntu.com/community/RPM/AlienHowto
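As a rough sketch only (not part of the official instructions, and assuming alien is installed and the v1.0 RPM built above), the conversion is a single command, typically run as root or under fakeroot:

$ alien --to-deb moses-1.0-1.x86_64.rpm

Alien bumps the package release by default, so the output is named something like moses_1.0-2_amd64.deb; the exact file name depends on your architecture and the version you built, and the resulting package can be installed with dpkg -i.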
contrib/rpm/build_source.sh
@@ -0,0 +1,63 @@
#!/bin/bash
BRANCH="master"
declare -i NO_RPM_BUILD=0
declare -r RPM_VERSION_TAG="___RPM_VERSION__"
function usage() {
echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]"
exit 1
}
if [ $# -lt 4 ]; then
usage
fi
while getopts r:b:v:nh OPTION
do
case "$OPTION" in
r) REPO="${OPTARG}";;
b) BRANCH="${OPTARG}";;
v) VERSION="${OPTARG}";;
n) NO_RPM_BUILD=1;;
[h\?]) usage;;
esac
done
if [ ! -d ./rpmbuild ]; then
echo "RPM build directory not in current working direcotry"
exit 1
fi
declare -r MOSES_DIR="moses-${VERSION}"
git clone ${REPO} ${MOSES_DIR}
if [ $? -ne 0 ]; then
echo "Failed to clone Git repository ${REPO}"
exit 3
fi
cd ${MOSES_DIR}
git checkout ${BRANCH}
if [ $? -ne 0 ]; then
echo "Failed to checkout branch ${BRANCH}"
exit 3
fi
cd ..
tar -cf moses-${VERSION}.tar ${MOSES_DIR}
gzip -f9 moses-${VERSION}.tar
if [ ${NO_RPM_BUILD} -eq 0 ]; then
if [ ! -d ${HOME}/rpmbuild/SPECS ]; then
mkdir -p ${HOME}/rpmbuild/SPECS
fi
eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
if [ ! -d ${HOME}/rpmbuild/SOURCES ]; then
mkdir -p ${HOME}/rpmbuild/SOURCES
fi
mv moses-${VERSION}.tar.gz ${HOME}/rpmbuild/SOURCES
fi
rm -Rf ${MOSES_DIR}
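One detail the README above does not mention: judging from the getopts loop in build_source.sh, the script also accepts a -n flag, which sets NO_RPM_BUILD and therefore creates the moses-<version>.tar.gz tarball in the current directory but skips staging the SPEC file and sources into $HOME/rpmbuild. A hypothetical invocation, reusing the README's example values, would be:

$ ./build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0 -n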
@@ -0,0 +1,65 @@
Name: moses
Summary: Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair.
Version: ___RPM_VERSION__
Release: 1
URL: http://www.statmt.org/moses/
Source0: %{name}-%{version}.tar.gz
License: LGPL
Group: Development/Tools
Vendor: Capita Translation and Interpreting
Packager: Ian Johnson <ian.johnson@capita-ti.com>
Requires: boost >= 1.48, python >= 2.6, perl >= 5
BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release}
%description
Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices.
%prep
%setup -q
mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz
wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz
cd $RPM_BUILD_DIR
tar -zxf irstlm-5.70.04.tgz
tar -zxf giza-pp-v1.0.7.tgz
cd irstlm-5.70.04
bash regenerate-makefiles.sh --force
./configure --prefix $RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04
make
make install
cd ../giza-pp
make
cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
%build
./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
%install
mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
cp -R bin $RPM_BUILD_ROOT/opt/moses
cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts
cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
%clean
%files
%defattr(-,root,root)
/opt/moses/bin/*
/opt/moses/scripts/analysis/*
/opt/moses/scripts/ems/*
/opt/moses/scripts/generic/*
/opt/moses/scripts/other/*
/opt/moses/scripts/recaser/*
/opt/moses/scripts/regression-testing/*
/opt/moses/scripts/share/*
/opt/moses/scripts/tokenizer/*
/opt/moses/scripts/training/*
/opt/moses/irstlm-5.70.04/*
/opt/moses/giza++-v1.0.7/*
@@ -620,12 +620,29 @@ void IOWrapper::FixPrecision(std::ostream &stream, size_t size)
template <class T>
void ShiftOffsets(vector<T> &offsets, T shift)
{
T currPos = shift;
for (size_t i = 0; i < offsets.size(); ++i) {
- shift += offsets[i];
- offsets[i] += shift;
if (offsets[i] == 0) {
offsets[i] = currPos;
++currPos;
}
else {
currPos += offsets[i];
}
}
}
size_t CalcSourceSize(const Moses::ChartHypothesis *hypo)
{
size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered();
const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
for (size_t i = 0; i < prevHypos.size(); ++i) {
size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered();
ret -= (childSize - 1);
}
return ret;
}
size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget)
{
const ChartHypothesis *hypo = &node.GetHypothesis();
@@ -635,7 +652,11 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
- vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
size_t thisSourceSize = CalcSourceSize(hypo);
// position of each terminal word in translation rule, irrespective of alignment
// if non-term, number is undefined
vector<size_t> sourceOffsets(thisSourceSize, 0);
vector<size_t> targetOffsets(tp.GetSize(), 0);
const ChartTrellisNode::NodeChildren &prevNodes = node.GetChildren();
@@ -655,11 +676,12 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
const ChartTrellisNode &prevNode = *prevNodes[sourceInd];
- // 1st. calc source size
// calc source size
size_t sourceSize = prevNode.GetHypothesis().GetCurrSourceRange().GetNumWordsCovered();
sourceOffsets[sourcePos] = sourceSize;
- // 2nd. calc target size. Recursively look thru child hypos
// calc target size.
// Recursively look thru child hypos
size_t currStartTarget = startTarget + totalTargetSize;
size_t targetSize = OutputAlignmentNBest(retAlign, prevNode, currStartTarget);
targetOffsets[targetPos] = targetSize;
@@ -672,27 +694,26 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
}
}
- // 3rd. shift offsets
// convert position within translation rule to absolute position within
// source sentence / output sentence
ShiftOffsets(sourceOffsets, startSource);
ShiftOffsets(targetOffsets, startTarget);
// get alignments from this hypo
- vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
- OutputAlignment(retAlignmentsS2T, aiTerm);
// add to output arg, offsetting by source & target
- for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
- const set<size_t> &targets = retAlignmentsS2T[source];
- set<size_t>::const_iterator iter;
- for (iter = targets.begin(); iter != targets.end(); ++iter) {
- size_t target = *iter;
- pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
- ,target + targetOffsets[target]);
- pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
- CHECK(ret.second);
- }
AlignmentInfo::const_iterator iter;
for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
const std::pair<size_t,size_t> &align = *iter;
size_t relSource = align.first;
size_t relTarget = align.second;
size_t absSource = sourceOffsets[relSource];
size_t absTarget = targetOffsets[relTarget];
pair<size_t, size_t> alignPoint(absSource, absTarget);
pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
CHECK(ret.second);
}
return totalTargetSize;
@@ -702,14 +723,16 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe
{
ostringstream out;
- Alignments retAlign;
- OutputAlignment(retAlign, hypo, 0);
if (hypo) {
Alignments retAlign;
OutputAlignment(retAlign, hypo, 0);
// output alignments
Alignments::const_iterator iter;
for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
const pair<size_t, size_t> &alignPoint = *iter;
out << alignPoint.first << "-" << alignPoint.second << " ";
}
}
out << endl;
@@ -723,7 +746,11 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
- vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
size_t thisSourceSize = CalcSourceSize(hypo);
// position of each terminal word in translation rule, irrespective of alignment
// if non-term, number is undefined
vector<size_t> sourceOffsets(thisSourceSize, 0);
vector<size_t> targetOffsets(tp.GetSize(), 0);
const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
@@ -743,11 +770,12 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
const ChartHypothesis *prevHypo = prevHypos[sourceInd];
- // 1st. calc source size
// calc source size
size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered();
sourceOffsets[sourcePos] = sourceSize;
- // 2nd. calc target size. Recursively look thru child hypos
// calc target size.
// Recursively look thru child hypos
size_t currStartTarget = startTarget + totalTargetSize;
size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget);
targetOffsets[targetPos] = targetSize;
@@ -760,27 +788,27 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
}
}
- // 3rd. shift offsets
// convert position within translation rule to absolute position within
// source sentence / output sentence
ShiftOffsets(sourceOffsets, startSource);
ShiftOffsets(targetOffsets, startTarget);
// get alignments from this hypo
- vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
- OutputAlignment(retAlignmentsS2T, aiTerm);
// add to output arg, offsetting by source & target
- for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
- const set<size_t> &targets = retAlignmentsS2T[source];
- set<size_t>::const_iterator iter;
- for (iter = targets.begin(); iter != targets.end(); ++iter) {
- size_t target = *iter;
- pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
- ,target + targetOffsets[target]);
- pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
- CHECK(ret.second);
- }
AlignmentInfo::const_iterator iter;
for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
const std::pair<size_t,size_t> &align = *iter;
size_t relSource = align.first;
size_t relTarget = align.second;
size_t absSource = sourceOffsets[relSource];
size_t absTarget = targetOffsets[relTarget];
pair<size_t, size_t> alignPoint(absSource, absTarget);
pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
CHECK(ret.second);
}
return totalTargetSize;
@@ -189,6 +189,15 @@ InputType*IOWrapper::GetInput(InputType* inputType)
}
}
ofstream* IOWrapper::GetOutputSearchGraphHypergraphWeightsStream() {
const StaticData &staticData = StaticData::Instance();
stringstream fileName;
fileName << staticData.GetParam("output-search-graph-hypergraph")[1];
std::ofstream *file = new std::ofstream;
file->open(fileName.str().c_str());
return file;
}
/***
* print surface factor only for the given phrase
*/
@@ -262,6 +271,19 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
out << std::endl;
}
void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
{
std::vector<const Hypothesis *> edges;
const Hypothesis *currentHypo = hypo;
while (currentHypo) {
edges.push_back(currentHypo);
currentHypo = currentHypo->GetPrevHypo();
}
OutputAlignment(out, edges);
}
void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
{
ostringstream out;
@@ -117,6 +117,8 @@ public:
return *m_outputSearchGraphStream;
}
std::ofstream *GetOutputSearchGraphHypergraphWeightsStream();
std::ostream &GetDetailedTranslationReportingStream() {
assert (m_detailedTranslationReportingStream);
return *m_detailedTranslationReportingStream;
@@ -137,7 +139,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool
void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
}
@@ -83,14 +83,18 @@ public:
OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
OutputCollector* detailedTranslationCollector,
OutputCollector* alignmentInfoCollector,
- OutputCollector* unknownsCollector) :
OutputCollector* unknownsCollector,
bool outputSearchGraphSLF,
bool outputSearchGraphHypergraph) :
m_source(source), m_lineNumber(lineNumber),
m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
m_latticeSamplesCollector(latticeSamplesCollector),
m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
m_detailedTranslationCollector(detailedTranslationCollector),
m_alignmentInfoCollector(alignmentInfoCollector),
- m_unknownsCollector(unknownsCollector) {}
m_unknownsCollector(unknownsCollector),
m_outputSearchGraphSLF(outputSearchGraphSLF),
m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {}
/** Translate one sentence
* gets called by main function implemented at end of this source file */
@@ -143,6 +147,42 @@
#endif
}
// Output search graph in HTK standard lattice format (SLF)
if (m_outputSearchGraphSLF) {
stringstream fileName;
fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf";
std::ofstream *file = new std::ofstream;
file->open(fileName.str().c_str());
if (file->is_open() && file->good()) {
ostringstream out;
fix(out,PRECISION);
manager.OutputSearchGraphAsSLF(m_lineNumber, out);
*file << out.str();
file -> flush();
} else {
TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
}
}
// Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
if (m_outputSearchGraphHypergraph) {
stringstream fileName;
fileName << staticData.GetParam("output-search-graph-hypergraph")[0] << "/" << m_lineNumber;
std::ofstream *file = new std::ofstream;
file->open(fileName.str().c_str());
if (file->is_open() && file->good()) {
ostringstream out;
fix(out,PRECISION);
manager.OutputSearchGraphAsHypergraph(m_lineNumber, out);
*file << out.str();
file -> flush();
} else {
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
}
file -> close();
delete file;
}
// apply decision rule and output best translation(s)
if (m_outputCollector) {
ostringstream out;
@@ -157,7 +197,7 @@ public:
// MAP decoding: best hypothesis
const Hypothesis* bestHypo = NULL;
if (!staticData.UseMBR())
{
bestHypo = manager.GetBestHypothesis();
if (bestHypo) {
if (staticData.IsPathRecoveryEnabled()) {
@@ -174,13 +214,18 @@ public:
staticData.GetOutputFactorOrder(),
staticData.GetReportSegmentation(),
staticData.GetReportAllFactors());
if (staticData.PrintAlignmentInfo()) {
out << "||| ";
OutputAlignment(out, bestHypo);
}
OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
IFVERBOSE(1) {
debug << "BEST TRANSLATION: " << *bestHypo << endl;
}
}
out << endl;
}
// MBR decoding (n-best MBR, lattice MBR, consensus)
else
@@ -311,6 +356,8 @@ private:
OutputCollector* m_detailedTranslationCollector;
OutputCollector* m_alignmentInfoCollector;
OutputCollector* m_unknownsCollector;
bool m_outputSearchGraphSLF;
bool m_outputSearchGraphHypergraph;
std::ofstream *m_alignmentStream;
@@ -367,6 +414,63 @@ static void ShowWeights()
}
size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream)
{
size_t numScoreComps = ff->GetNumScoreComponents();
if (numScoreComps != ScoreProducer::unlimited) {
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
if (numScoreComps > 1) {
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
<< i
<< "=" << values[i] << endl;
}
} else {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
<< "=" << values[0] << endl;
}
return index+numScoreComps;
} else {
cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl;
assert(false);
return 0;
}
}
void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream);
}
}
} //namespace
/** main function of the command line version of the decoder **/
@@ -391,20 +495,20 @@ int main(int argc, char** argv)
// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
- Parameter* params = new Parameter();
- if (!params->LoadParam(argc,argv)) {
Parameter params;
if (!params.LoadParam(argc,argv)) {
exit(1);
}
// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
- if (!StaticData::LoadDataStatic(params, argv[0])) {
if (!StaticData::LoadDataStatic(&params, argv[0])) {
exit(1);
}
// setting "-show-weights" -> just dump out weights and exit
- if (params->isParamSpecified("show-weights")) {
if (params.isParamSpecified("show-weights")) {
ShowWeights();
exit(0);
}
@@ -430,6 +534,14 @@ int main(int argc, char** argv)
TRACE_ERR(weights);
TRACE_ERR("\n");
}
if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 1) {
ofstream* weightsOut = ioWrapper->GetOutputSearchGraphHypergraphWeightsStream();
OutputFeatureWeightsForHypergraph(*weightsOut);
weightsOut->flush();
weightsOut->close();
delete weightsOut;
}
// initialize output streams
// note: we can't just write to STDOUT or files
@@ -533,7 +645,9 @@ int main(int argc, char** argv)
searchGraphCollector.get(),
detailedTranslationCollector.get(),
alignmentInfoCollector.get(),
- unknownsCollector.get() );
unknownsCollector.get(),
staticData.GetOutputSearchGraphSLF(),
staticData.GetOutputSearchGraphHypergraph());
// execute task
#ifdef WITH_THREADS
pool.Submit(task);
@@ -551,6 +665,8 @@ int main(int argc, char** argv)
pool.Stop(true); //flush remaining jobs
#endif
delete ioWrapper;
} catch (const std::exception &e) {
std::cerr << "Exception: " << e.what() << std::endl;
return EXIT_FAILURE;
@@ -30,6 +30,9 @@ AlignmentInfoCollection::AlignmentInfoCollection()
m_emptyAlignmentInfo = Add(pairs);
}
AlignmentInfoCollection::~AlignmentInfoCollection()
{}
const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
{
return *m_emptyAlignmentInfo;
@@ -55,6 +55,7 @@ class AlignmentInfoCollection
//! Only a single static variable should be created.
AlignmentInfoCollection();
~AlignmentInfoCollection();
static AlignmentInfoCollection s_instance;
@@ -462,7 +462,7 @@ void Hypothesis::CleanupArcList()
*/
const StaticData &staticData = StaticData::Instance();
size_t nBestSize = staticData.GetNBestSize();
- bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.UseLatticeMBR() ;
bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;
if (!distinctNBest && m_arcList->size() > nBestSize * 5) {
// prune arc list only if there too many arcs
@@ -36,8 +36,9 @@ using namespace std;
namespace Moses
{
- LanguageModelSingleFactor::~LanguageModelSingleFactor() {}
LanguageModelSingleFactor::~LanguageModelSingleFactor()
{
}
struct PointerState : public FFState {
const void* lmstate;
@@ -58,7 +59,11 @@ LanguageModelPointerState::LanguageModelPointerState()
m_beginSentenceState = new PointerState(NULL);
}
- LanguageModelPointerState::~LanguageModelPointerState() {}
LanguageModelPointerState::~LanguageModelPointerState()
{
delete m_nullContextState;
delete m_beginSentenceState;
}
const FFState *LanguageModelPointerState::GetNullContextState() const
{
@@ -26,8 +26,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#endif
#include <algorithm>
- #include <limits>
#include <cmath>
#include <limits>
#include <map>
#include <set>
#include "Manager.h"
#include "TypeDef.h"
#include "Util.h"
@@ -46,17 +48,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "rule.pb.h"
#endif
#include "util/exception.hh"
using namespace std;
namespace Moses
{
Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system)
- :m_lineNumber(lineNumber)
- ,m_system(system)
:m_system(system)
,m_transOptColl(source.CreateTranslationOptionCollection(system))
,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl))
,interrupted_flag(0)
,m_hypoId(0)
,m_lineNumber(lineNumber)
,m_source(source)
{
m_system->InitializeBeforeSentenceProcessing(source);
@@ -628,6 +632,420 @@ void Manager::GetSearchGraph(vector<SearchGraphNode>& searchGraph) const
}
void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureWeightsForSLF(featureIndex, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureWeightsForSLF(featureIndex, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForSLF(featureIndex, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureWeightsForSLF(featureIndex, gds[i], outputSearchGraphStream);
}
}
void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
// outputSearchGraphStream << endl;
// outputSearchGraphStream << (*hypo) << endl;
// const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
// outputSearchGraphStream << scoreCollection << endl;
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, gds[i], outputSearchGraphStream);
}
}
void Manager::OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
size_t featureIndex = 1;
for (size_t i = 0; i < sff.size(); ++i) {
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, sff[i], outputSearchGraphStream);
}
for (size_t i = 0; i < slf.size(); ++i) {
if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
slf[i]->GetScoreProducerWeightShortName() != "tm" &&
slf[i]->GetScoreProducerWeightShortName() != "I" &&
slf[i]->GetScoreProducerWeightShortName() != "g")
{
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, slf[i], outputSearchGraphStream);
}
}
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
for( size_t i=0; i<pds.size(); i++ ) {
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, pds[i], outputSearchGraphStream);
}
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for( size_t i=0; i<gds.size(); i++ ) {
featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, gds[i], outputSearchGraphStream);
}
}
size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
size_t numScoreComps = ff->GetNumScoreComponents();
if (numScoreComps != ScoreProducer::unlimited) {
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << "# " << ff->GetScoreProducerDescription()
<< " " << ff->GetScoreProducerWeightShortName()
<< " " << (i+1) << " of " << numScoreComps << endl
<< "x" << (index+i) << "scale=" << values[i] << endl;
}
return index+numScoreComps;
} else {
cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
assert(false);
return 0;
}
}
size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
// { const FeatureFunction* sp = ff;
// const FVector& m_scores = scoreCollection.GetScoresVector();
// FVector& scores = const_cast<FVector&>(m_scores);
// std::string prefix = sp->GetScoreProducerDescription() + FName::SEP;
// // std::cout << "prefix==" << prefix << endl;
// // cout << "m_scores==" << m_scores << endl;
// // cout << "m_scores.size()==" << m_scores.size() << endl;
// // cout << "m_scores.coreSize()==" << m_scores.coreSize() << endl;
// // cout << "m_scores.cbegin() ?= m_scores.cend()\t" << (m_scores.cbegin() == m_scores.cend()) << endl;
// // for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
// // std::cout<<prefix << "\t" << (i->first) << "\t" << (i->second) << std::endl;
// // }
// for(int i=0, n=v.size(); i<n; i+=1) {
// // outputSearchGraphStream << prefix << i << "==" << v[i] << std::endl;
// }
// }
// FVector featureValues = scoreCollection.GetVectorForProducer(ff);
// outputSearchGraphStream << featureValues << endl;
const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
size_t numScoreComps = featureValues.size();//featureValues.coreSize();
// if (numScoreComps != ScoreProducer::unlimited) {
// vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 0.0 : featureValues[i]) << " ";
}
return index+numScoreComps;
// } else {
// cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
// assert(false);
// return 0;
// }
}
size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
const Hypothesis *prevHypo = hypo->GetPrevHypo();
if (prevHypo) {
scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() );
}
vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
size_t numScoreComps = featureValues.size();
if (numScoreComps > 1) {
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << i << "=" << featureValues[i] << " ";
}
} else {
outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << "=" << featureValues[0] << " ";
}
return index+numScoreComps;
}
/**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */
void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const
{
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
map<int,int> mosesIDToHypergraphID;
// map<int,int> hypergraphIDToMosesID;
set<int> terminalNodes;
multimap<int,int> hypergraphIDToArcs;
long numNodes = 0;
long endNode = 0;
{
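    // First pass: assign consecutive hypergraph node ids to hypotheses in the
    // order they are first seen, and record which arcs end at each node.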
long hypergraphHypothesisID = 0;
for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
// Get an id number for the previous hypothesis
const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo();
if (prevHypo!=NULL) {
int mosesPrevHypothesisID = prevHypo->GetId();
if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) {
mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID;
// hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID;
hypergraphHypothesisID += 1;
}
}
// Get an id number for this hypothesis
int mosesHypothesisID;
if (searchGraph[arcNumber].recombinationHypo) {
mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
} else {
mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
}
if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
// hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID;
bool terminalNode = (searchGraph[arcNumber].forward == -1);
if (terminalNode) {
// Final arc to end node, representing the end of the sentence </s>
terminalNodes.insert(hypergraphHypothesisID);
}
hypergraphHypothesisID += 1;
}
// Record that this arc ends at this node
hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber));
}
// Unique end node
endNode = hypergraphHypothesisID;
// mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID;
numNodes = endNode + 1;
}
long numArcs = searchGraph.size() + terminalNodes.size();
// Print number of nodes and arcs
outputSearchGraphStream << numNodes << " " << numArcs << endl;
for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) {
// int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID];
size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID);
if (count > 0) {
outputSearchGraphStream << count << endl;
pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range =
hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
int lineNumber = (*it).second;
const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
int mosesHypothesisID;// = thisHypo->GetId();
if (searchGraph[lineNumber].recombinationHypo) {
mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
} else {
mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
}
// int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
UTIL_THROW_IF(
(hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
util::Exception,
"Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
"Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
". There are " << numNodes << " nodes in the search lattice."
);
const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
if (prevHypo==NULL) {
outputSearchGraphStream << "<s> ||| " << endl;
} else {
int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
UTIL_THROW_IF(
(startNode >= hypergraphHypothesisID),
util::Exception,
"Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
"The nodes must be output in topological order. The code attempted to violate this restriction."
);
const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
int targetWordCount = targetPhrase.GetSize();
outputSearchGraphStream << "[" << startNode << "]";
for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
}
outputSearchGraphStream << " ||| ";
OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
outputSearchGraphStream << endl;
}
}
}
}
// Print node and arc(s) for end of sentence </s>
outputSearchGraphStream << terminalNodes.size() << endl;
for (set<int>::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) {
outputSearchGraphStream << "[" << (*it) << "] </s> ||| " << endl;
}
}
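
The sketch below shows one way a caller could drive this method once the new output-search-graph-hypergraph option (registered in Parameter::Parameter() and read in StaticData::LoadData() in the hunks further down) is enabled. It is an illustration under assumptions, not code from this commit: the per-sentence file naming and the "hypergraph/" directory are invented for the example.

// Illustrative caller sketch (not part of this commit). Assumes the Moses headers
// for Manager and StaticData are available; the "hypergraph/<sentence id>" file
// naming is an invented convention for this example only.
#include <fstream>
#include <sstream>

void WriteHypergraphForSentence(const Moses::Manager &manager, long translationId)
{
  const Moses::StaticData &staticData = Moses::StaticData::Instance();
  if (!staticData.GetOutputSearchGraphHypergraph()) {
    return;
  }
  std::ostringstream fileName;
  fileName << "hypergraph/" << translationId;
  std::ofstream out(fileName.str().c_str());
  if (out.good()) {
    manager.OutputSearchGraphAsHypergraph(translationId, out);
  }
}
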
/** Output the search graph in HTK standard lattice format (SLF). */
void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const
{
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
long numArcs = 0;
long numNodes = 0;
map<int,int> nodes;
set<int> terminalNodes;
// Unique start node
nodes[0] = 0;
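  // Node ids are cumulative target word counts: a hypothesis emitting an n-word
  // phrase is assigned n consecutive nodes, so every word gets its own SLF arc.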
for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) {
int targetWordCount = searchGraph[arcNumber].hypo->GetCurrTargetPhrase().GetSize();
numArcs += targetWordCount;
int hypothesisID = searchGraph[arcNumber].hypo->GetId();
if (nodes.count(hypothesisID) == 0) {
numNodes += targetWordCount;
nodes[hypothesisID] = numNodes;
//numNodes += 1;
bool terminalNode = (searchGraph[arcNumber].forward == -1);
if (terminalNode) {
numArcs += 1;
}
}
}
numNodes += 1;
// Unique end node
nodes[numNodes] = numNodes;
outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl;
outputSearchGraphStream << "VERSION=1.1" << endl;
outputSearchGraphStream << "base=2.71828182845905" << endl;
outputSearchGraphStream << "NODES=" << (numNodes+1) << endl;
outputSearchGraphStream << "LINKS=" << numArcs << endl;
OutputFeatureWeightsForSLF(outputSearchGraphStream);
for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) {
const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
if (prevHypo) {
int startNode = nodes[prevHypo->GetId()];
int endNode = nodes[thisHypo->GetId()];
bool terminalNode = (searchGraph[lineNumber].forward == -1);
const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
int targetWordCount = targetPhrase.GetSize();
for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
int x = (targetWordCount-targetWordIndex);
outputSearchGraphStream << "J=" << arcNumber;
if (targetWordIndex==0) {
outputSearchGraphStream << " S=" << startNode;
} else {
outputSearchGraphStream << " S=" << endNode - x;
}
outputSearchGraphStream << " E=" << endNode - (x-1)
<< " W=" << targetPhrase.GetWord(targetWordIndex);
OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);
outputSearchGraphStream << endl;
arcNumber += 1;
}
if (terminalNode && terminalNodes.count(endNode) == 0) {
terminalNodes.insert(endNode);
outputSearchGraphStream << "J=" << arcNumber
<< " S=" << endNode
<< " E=" << numNodes
<< endl;
arcNumber += 1;
}
}
}
}
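
The S=/E= arithmetic above, which strings the words of a multi-word phrase across consecutive nodes, is the least obvious part of the SLF writer. The stand-alone snippet below restates only that arithmetic so it can be checked in isolation; it is an illustration, not code from this commit.

// Stand-alone restatement of the SLF start/end node arithmetic used above
// (illustration only). Word i of an n-word phrase runs from endNode-(n-i) to
// endNode-(n-i-1), except that the first word starts at the hypothesis's start node.
#include <iostream>
#include <utility>

std::pair<int,int> SlfArcNodes(int startNode, int endNode,
                               int targetWordCount, int targetWordIndex)
{
  int x = targetWordCount - targetWordIndex;   // words left, including this one
  int s = (targetWordIndex == 0) ? startNode : endNode - x;
  int e = endNode - (x - 1);                   // the last word lands exactly on endNode
  return std::make_pair(s, e);
}

int main()
{
  // A 3-word phrase from node 4 to node 9 yields the arcs 4->7, 7->8, 8->9.
  for (int i = 0; i < 3; ++i) {
    std::pair<int,int> se = SlfArcNodes(4, 9, 3, i);
    std::cout << "S=" << se.first << " E=" << se.second << std::endl;
  }
  return 0;
}
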
void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
                      const SearchGraphNode& searchNode)
{

@@ -93,6 +93,19 @@ class Manager
   Manager(Manager const&);
   void operator=(Manager const&);
   const TranslationSystem* m_system;
+private:
+  // Helper functions to output search graph in HTK standard lattice format
+  void OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const;
+  size_t OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
+  void OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const;
+  size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
+  // Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder
+  void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const;
+  size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
 protected:
   // data
   // InputType const& m_source; /**< source sentence to be translated */
@@ -103,6 +116,7 @@ protected:
   size_t interrupted_flag;
   std::auto_ptr<SentenceStats> m_sentenceStats;
   int m_hypoId; //used to number the hypos as they are created.
+  size_t m_lineNumber;
   void GetConnectedGraph(
     std::map< int, bool >* pConnected,
@@ -113,7 +127,6 @@ protected:
 public:
-  size_t m_lineNumber;
   InputType const& m_source; /**< source sentence to be translated */
   Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system);
   ~Manager();
@@ -137,6 +150,8 @@ public:
 #endif
   void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
+  void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const;
+  void OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const;
   void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const;
   const InputType& GetSource() const {
     return m_source;

@@ -130,6 +130,8 @@ Parameter::Parameter()
   AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
   AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
   AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
+  AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)");
+  AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder)");
   AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
 #ifdef HAVE_PROTOBUF
   AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
@@ -177,6 +179,7 @@ Parameter::Parameter()
   AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
   AddParam("minphr-memory", "Load phrase table in minphr format into memory");
+  AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. Default is false");
   AddParam("include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false");
   AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. Default is false");
   AddParam("alignment-output-file", "print output word alignments into given file");

@@ -162,10 +162,6 @@ bool StaticData::LoadData(Parameter *parameter)
     }
   }
-  if(m_parameter->GetParam("sort-word-alignment").size()) {
-    m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
-  }
   // factor delimiter
   if (m_parameter->GetParam("factor-delimiter").size() > 0) {
     m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
@@ -175,6 +171,16 @@ bool StaticData::LoadData(Parameter *parameter)
   SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false );
   //word-to-word alignment
+  // alignments
+  SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false );
+  if (m_PrintAlignmentInfo) {
+    m_needAlignmentInfo = true;
+  }
+  if(m_parameter->GetParam("sort-word-alignment").size()) {
+    m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
+  }
   SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
   if (m_PrintAlignmentInfoNbest) {
     m_needAlignmentInfo = true;
@@ -235,8 +241,19 @@ bool StaticData::LoadData(Parameter *parameter)
     }
     m_outputSearchGraph = true;
     m_outputSearchGraphExtended = true;
-  } else
+  } else {
     m_outputSearchGraph = false;
+  }
+  if (m_parameter->GetParam("output-search-graph-slf").size() > 0) {
+    m_outputSearchGraphSLF = true;
+  } else {
+    m_outputSearchGraphSLF = false;
+  }
+  if (m_parameter->GetParam("output-search-graph-hypergraph").size() > 0) {
+    m_outputSearchGraphHypergraph = true;
+  } else {
+    m_outputSearchGraphHypergraph = false;
+  }
 #ifdef HAVE_PROTOBUF
   if (m_parameter->GetParam("output-search-graph-pb").size() > 0) {
     if (m_parameter->GetParam("output-search-graph-pb").size() != 1) {

@@ -171,6 +171,7 @@ protected:
   bool m_reportAllFactorsNBest;
   std::string m_detailedTranslationReportingFilePath;
   bool m_onlyDistinctNBest;
+  bool m_PrintAlignmentInfo;
   bool m_needAlignmentInfo;
   bool m_PrintAlignmentInfoNbest;
@@ -216,6 +217,8 @@ protected:
   bool m_outputWordGraph; //! whether to output word graph
   bool m_outputSearchGraph; //! whether to output search graph
   bool m_outputSearchGraphExtended; //! ... in extended format
+  bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF)
+  bool m_outputSearchGraphHypergraph; //! whether to output search graph in hypergraph
 #ifdef HAVE_PROTOBUF
   bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf
 #endif
@@ -458,7 +461,7 @@ public:
     return m_nBestFilePath;
   }
   bool IsNBestEnabled() const {
-    return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
+    return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_outputSearchGraphSLF || m_outputSearchGraphHypergraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
 #ifdef HAVE_PROTOBUF
            || m_outputSearchGraphPB
 #endif
@@ -631,6 +634,12 @@ public:
   bool GetOutputSearchGraphExtended() const {
     return m_outputSearchGraphExtended;
   }
+  bool GetOutputSearchGraphSLF() const {
+    return m_outputSearchGraphSLF;
+  }
+  bool GetOutputSearchGraphHypergraph() const {
+    return m_outputSearchGraphHypergraph;
+  }
 #ifdef HAVE_PROTOBUF
   bool GetOutputSearchGraphPB() const {
     return m_outputSearchGraphPB;
@@ -722,6 +731,9 @@ public:
   const std::string &GetAlignmentOutputFile() const {
     return m_alignmentOutputFile;
   }
+  bool PrintAlignmentInfo() const {
+    return m_PrintAlignmentInfo;
+  }
   bool PrintAlignmentInfoInNbest() const {
     return m_PrintAlignmentInfoNbest;
   }

@@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
   if (kneserNeyFlag) {
     float D = kneserNey_D3;
     if (countEF < 2) D = kneserNey_D1;
-    if (countEF < 3) D = kneserNey_D2;
+    else if (countEF < 3) D = kneserNey_D2;
     if (D > countEF) D = countEF - 0.01; // sanity constraint
     float p_b_E = n1_E / totalCount; // target phrase prob based on distinct

@@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) {
   if (m_options.isOrientationFlag())
     outextractstrOrientation << orientationInfo;
+  if (m_options.isIncludeSentenceIdFlag()) {
+    outextractstr << " ||| " << sentence.sentenceID;
+  }
   if (m_options.getInstanceWeightsFile().length()) {
     if (m_options.isTranslationFlag()) {
       outextractstr << " ||| " << sentence.weightString;
@@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) {
     }
   }
-  if (m_options.isIncludeSentenceIdFlag()) {
-    outextractstr << " ||| " << sentence.sentenceID;
-  }
   if (m_options.isTranslationFlag()) outextractstr << "\n";
   if (m_options.isTranslationFlag()) outextractstrInv << "\n";

@@ -13,10 +13,10 @@ chomp(@OUT);
 while(<SRC>) {
   chomp;
   if (/^<srcset/) {
-    s/<srcset/<tstset trglang="$language"/;
+    s/<srcset/<tstset trglang="$language"/i;
   }
   elsif (/^<\/srcset/) {
-    s/<\/srcset/<\/tstset/;
+    s/<\/srcset/<\/tstset/i;
   }
   elsif (/^<doc/i) {
     s/ *sysid="[^\"]+"//;
@@ -26,10 +26,10 @@ while(<SRC>) {
     my $line = shift(@OUT);
     $line = "" if $line =~ /NO BEST TRANSLATION/;
     if (/<\/seg>/) {
-      s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/;
+      s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i;
     }
     else {
-      s/(<seg[^>]+> *)[^<]*/$1$line/;
+      s/(<seg[^>]+> *)[^<]*/$1$line/i;
     }
   }
   print $_."\n";

@@ -16,15 +16,15 @@ $HELP = 1
     unless &GetOptions('corpus=s' => \$CORPUS,
                        'model=s' => \$MODEL,
                        'filler=s' => \$FILLER,
                        'factored' => \$FACTORED,
                        'min-size=i' => \$MIN_SIZE,
                        'min-count=i' => \$MIN_COUNT,
                        'max-count=i' => \$MAX_COUNT,
                        'help' => \$HELP,
                        'verbose' => \$VERBOSE,
                        'syntax' => \$SYNTAX,
                        'binarize' => \$BINARIZE,
                        'mark-split' => \$MARK_SPLIT,
                        'train' => \$TRAIN);
 if ($HELP ||
@@ -155,34 +155,37 @@ sub apply {
     next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
     $COUNT{$lc} = $count;
     $TRUECASE{$lc} = $factored_word;
     $LABEL{$lc} = $label if $SYNTAX;
   }
   close(MODEL);
   while(<STDIN>) {
     my $first = 1;
     chop; s/\s+/ /g; s/^ //; s/ $//;
     my @BUFFER; # for xml tags
     foreach my $factored_word (split) {
       print " " unless $first;
       $first = 0;
       # syntax: don't split xml
       if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
         push @BUFFER,$factored_word;
         $first = 1;
         next;
       }
       # get case class
       my $word = $factored_word;
       $word =~ s/\|.+//g; # just first factor
       my $lc = lc($word);
+      print STDERR "considering $word ($lc)...\n" if $VERBOSE;
       # don't split frequent words
-      if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) {
-        print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
+      if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
+          $lc !~ /[a-zA-Z]/) {; # has to have at least one letter
+        print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
         print $factored_word;
+        print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
         next;
       }

@@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span
 sub extract_sgml_tag_attribute
 {
   my ($name, $data) = @_;
-  ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
+  ($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : ();
 }
 #################################

@@ -6,11 +6,12 @@ use Getopt::Long "GetOptions";
 binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
-my ($SRC,$INFILE);
+my ($SRC,$INFILE,$UNBUFFERED);
 die("detruecase.perl < in > out")
   unless &GetOptions('headline=s' => \$SRC,
-                     'in=s' => \$INFILE);
+                     'in=s' => \$INFILE,
+                     'b|unbuffered' => \$UNBUFFERED);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
 my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
 my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);

@@ -4,7 +4,7 @@
 use strict;
 use Getopt::Long "GetOptions";
-my ($SRC,$INFILE,$RECASE_MODEL);
+my ($SRC,$INFILE,$RECASE_MODEL,$UNBUFFERED);
 my $MOSES = "moses";
 my $LANGUAGE = "en"; # English by default;
 die("recase.perl --in file --model ini-file > out")
@@ -12,9 +12,11 @@ die("recase.perl --in file --model ini-file > out")
                    'headline=s' => \$SRC,
                    'lang=s' => \$LANGUAGE,
                    'moses=s' => \$MOSES,
-                   'model=s' => \$RECASE_MODEL)
+                   'model=s' => \$RECASE_MODEL,
+                   'b|unbuffered' => \$UNBUFFERED)
   && defined($INFILE)
   && defined($RECASE_MODEL);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
 my %treated_languages = map { ($_,1) } qw/en cs/;
 die "I don't know any rules for $LANGUAGE. Use 'en' as the default."

@@ -8,9 +8,11 @@ binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
 # apply switches
-my $MODEL;
-die("truecase.perl --model truecaser < in > out")
-  unless &GetOptions('model=s' => \$MODEL);
+my ($MODEL, $UNBUFFERED);
+die("truecase.perl --model MODEL [-b] < in > out")
+  unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
+  && defined($MODEL);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
 my (%BEST,%KNOWN);
 open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");

@@ -171,7 +171,7 @@ if ($TIMING)
 # tokenize a batch of texts saved in an array
 # input: an array containing a batch of texts
-# return: another array cotaining a batch of tokenized texts for the input array
+# return: another array containing a batch of tokenized texts for the input array
 sub tokenize_batch
 {
   my(@text_list) = @_;

@@ -47,7 +47,7 @@ my $l1input = "$corpus.$l1";
 if (-e $l1input) {
   $opn = $l1input;
 } elsif (-e $l1input.".gz") {
-  $opn = "zcat $l1input.gz |";
+  $opn = "gunzip -c $l1input.gz |";
 } else {
   die "Error: $l1input does not exist";
 }
@@ -57,7 +57,7 @@ my $l2input = "$corpus.$l2";
 if (-e $l2input) {
   $opn = $l2input;
 } elsif (-e $l2input.".gz") {
-  $opn = "zcat $l2input.gz |";
+  $opn = "gunzip -c $l2input.gz |";
 } else {
   die "Error: $l2input does not exist";
 }
@@ -160,3 +160,4 @@ sub word_count {
   my @w = split(/ /,$line);
   return scalar @w;
 }

@@ -40,7 +40,8 @@ def printUsage():
 def main():
     parser = optparse.OptionParser()
     parser.add_option("-c", "--min-non-initial-rule-count",
-                      action="store", dest="minCount", type="int", default="1",
+                      action="store", dest="minCount",
+                      type="float", default="0.0",
                       help="prune non-initial rules where count is below N",
                       metavar="N")
     (options, args) = parser.parse_args()